Merge remote-tracking branch 'unstable/master'

This commit is contained in:
Geraldine Van der Auwera 2015-05-15 00:38:39 -04:00
commit 6c195ffcb1
1639 changed files with 106692 additions and 101253 deletions

View File

@ -1,6 +1,7 @@
#!/bin/sh
mvn_args="verify"
default_args="verify '-Ddisable.shadepackage'"
mvn_args="${default_args}"
mvn_properties=
mvn_clean=
unknown_args=
@ -44,22 +45,23 @@ for arg in "${@}" ; do
fi
else
if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "verify" ]] ; then
if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "${default_args}" ]] ; then
echo "Sorry, this script does not currently support mixing targets." >&2
exit 1
elif [[ "${arg}" == "dist" ]] ; then
mvn_args="verify"
mvn_args="${default_args}"
elif [[ "${arg}" == "gatk" ]] ; then
mvn_args="verify '-P!queue'"
mvn_args="${default_args} '-P!queue'"
elif [[ "${arg}" == "test.compile" ]] ; then
mvn_args="test-compile"
elif [[ "${arg}" == "gatkdocs" ]] ; then
local_repo="sitetemprepo"
mvn_args="install -Dmaven.repo.local=${local_repo} -Ddisable.queue && mvn site -Dmaven.repo.local=${local_repo} -Ddisable.queue"
mvn_args="install -Dmaven.repo.local=${local_repo} '-P!queue' && mvn site -Dmaven.repo.local=${local_repo} '-P!queue'"
mvn_pkg_args=
elif [[ "${arg}" == "package.gatk.full" ]] ; then
mvn_args="package '-P!private,!queue'"
@ -75,11 +77,11 @@ for arg in "${@}" ; do
# elif [[ "${arg}" == "release.gatk.full" ]] ; then
# mvn_args="package '-P!private,!queue'"
# post_script=" && private/src/main/scripts/shell/copy_release.sh public/gatk-package/target/GenomeAnalysisTK-*.tar.bz2"
# post_script=" && private/src/main/scripts/shell/copy_release.sh protected/gatk-package-distribution/target/GenomeAnalysisTK-*.tar.bz2"
# elif [[ "${arg}" == "release.queue.full" ]] ; then
# mvn_args="package '-P!private'"
# post_script=" && private/src/main/scripts/shell/copy_release.sh public/queue-package/target/Queue-*.tar.bz2"
# post_script=" && private/src/main/scripts/shell/copy_release.sh protected/gatk-queue-package-distribution/target/Queue-*.tar.bz2"
elif [[ "${arg}" == "build-picard-private" ]] ; then
mvn_args="mvn install -f private/picard-maven/pom.xml"
@ -113,7 +115,7 @@ for arg in "${@}" ; do
mvn_args="${mvn_args} -Dgatk.queuetests.run=true"
elif [[ "${arg}" == "committests" ]] ; then
mvn_args="verify -Dgatk.committests.skipped=false"
mvn_args="${default_args} -Dgatk.committests.skipped=false"
elif [[ "${arg}" == "test" ]] ; then
mvn_args="test -Dgatk.unittests.skipped=false"
@ -122,19 +124,19 @@ for arg in "${@}" ; do
mvn_args="test -Dgatk.unittests.skipped=false"
elif [[ "${arg}" == "integrationtest" ]] ; then
mvn_args="verify -Dgatk.integrationtests.skipped=false"
mvn_args="${default_args} -Dgatk.integrationtests.skipped=false"
elif [[ "${arg}" == "largescaletest" ]] ; then
mvn_args="verify -Dgatk.largescaletests.skipped=false"
mvn_args="${default_args} -Dgatk.largescaletests.skipped=false"
elif [[ "${arg}" == "knowledgebasetest" ]] ; then
mvn_args="verify -Dgatk.knowledgebasetests.skipped=false"
mvn_args="${default_args} -Dgatk.knowledgebasetests.skipped=false"
elif [[ "${arg}" == "queuetest" ]] ; then
mvn_args="verify -Dgatk.queuetests.skipped=false"
mvn_args="${default_args} -Dgatk.queuetests.skipped=false"
elif [[ "${arg}" == "queuetestrun" ]] ; then
mvn_args="verify -Dgatk.queuetests.skipped=false -Dgatk.queuetests.run=true"
mvn_args="${default_args} -Dgatk.queuetests.skipped=false -Dgatk.queuetests.run=true"
elif [[ "${arg}" == "fasttest" ]] ; then
mvn_args="verify -Dgatk.committests.skipped=false -pl private/gatk-tools-private -am -Dresource.bundle.skip=true"

149
pom.xml
View File

@ -13,7 +13,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
<version>3.3</version>
<version>3.4-SNAPSHOT</version>
<relativePath>public/gatk-root</relativePath>
</parent>
@ -32,11 +32,15 @@
<resource.bundle.skip>false</resource.bundle.skip>
<!-- TODO: Need a better a way to say "don't include hidden" by default -->
<gatkdocs.include.hidden>-build-timestamp "${maven.build.timestamp}"</gatkdocs.include.hidden>
<gatk.shell.directory>${gatk.basedir}/public/src/main/scripts/shell</gatk.shell.directory>
<gatk.assembly.directory>${gatk.basedir}/public/src/main/assembly</gatk.assembly.directory>
<!--
Phases of the build that may be disabled to speed up compilation.
-->
<gatk.jar.phase>package</gatk.jar.phase>
<gatk.unpack.phase>prepare-package</gatk.unpack.phase>
<gatk.shade.phase>package</gatk.shade.phase>
<gatk.generate-resources.phase>generate-resources</gatk.generate-resources.phase>
<gatk.process-resources.phase>process-resources</gatk.process-resources.phase>
<gatk.process-test-resources.phase>process-test-resources</gatk.process-test-resources.phase>
@ -65,6 +69,16 @@
<gatk.serialqueuetests.skipped>${gatk.serialcommittests.skipped}</gatk.serialqueuetests.skipped>
<gatk.seriallargescaletests.skipped>true</gatk.seriallargescaletests.skipped>
<gatk.serialknowledgebasetests.skipped>true</gatk.serialknowledgebasetests.skipped>
<!-- Full path to write the executable MANIFEST.MF only jars, 10 seconds to create at 4KB each, and the accompanying lib directory -->
<gatk.executable.directory>${gatk.basedir}/target/executable</gatk.executable.directory>
<!-- Full path to write (symlinks to) the full shaded package jars, 1+ minute to create at 12MB+ each -->
<gatk.package.directory>${gatk.basedir}/target/package</gatk.package.directory>
<!--
Full path to write symlink to either an executable MANIFEST.MF only jar - OR - a fully shaded package jar.
NOTE: MANIFEST.MF only jars MUST be accompanied by the lib folder, or they will not run.
-->
<gatk.shortcut.directory>${gatk.basedir}/target</gatk.shortcut.directory>
</properties>
<dependencies>
@ -138,9 +152,19 @@
<excludes>${resource.bundle.path}</excludes>
</configuration>
</execution>
<execution>
<id>executable-jar-lib</id>
<goals>
<goal>copy-dependencies</goal>
</goals>
<phase>none</phase>
<configuration>
<outputDirectory>${gatk.executable.directory}/lib</outputDirectory>
<includeScope>runtime</includeScope>
</configuration>
</execution>
</executions>
</plugin>
<!-- TODO: Change the ResourceBundleExtractorDoclet to not require log4j.properties file -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
@ -159,25 +183,6 @@
</goals>
<phase>${gatk.process-test-resources.phase}</phase>
</execution>
<execution>
<id>copy-resource-bundle-log4j</id>
<goals>
<goal>copy-resources</goal>
</goals>
<phase>none</phase>
<configuration>
<!--
Just before running the resource bundle generation, copy a simple log4j
config file to the apidocs execution directory, so that logging prints out.
-->
<outputDirectory>${project.reporting.outputDirectory}/apidocs</outputDirectory>
<resources>
<resource>
<directory>${gatk.basedir}/gatk-utils/src/main/config/org/broadinstitute/gatk/utils/help</directory>
</resource>
</resources>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
@ -198,8 +203,7 @@
<docletPath>${project.build.outputDirectory}</docletPath>
<docletArtifact>
<groupId>${project.groupId}</groupId>
<!-- TODO: THIS IS SUPPOSED TO BE GATK-UTILS! -->
<artifactId>gatk-tools-public</artifactId>
<artifactId>${project.artifactId}</artifactId>
<version>${project.version}</version>
</docletArtifact>
<maxmemory>2g</maxmemory>
@ -319,10 +323,40 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>executable-jar</id>
<goals>
<goal>jar</goal>
</goals>
<phase>none</phase>
<configuration>
<classesDirectory>${project.build.outputDirectory}/ignored_by_executable_jar</classesDirectory>
<outputDirectory>${gatk.executable.directory}</outputDirectory>
<finalName>${gatk.binary-dist.name}</finalName>
<archive>
<manifest>
<mainClass>${app.main.class}</mainClass>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
</manifest>
</archive>
</configuration>
</execution>
<execution>
<id>default-jar</id>
<phase>${gatk.jar.phase}</phase>
</execution>
<!--
Maven keeps executing default-jar first, even if it's listed AFTER the executable-jar.
So while packaging: run executable-jar, disable default-jar, then run unshaded-default-jar.
-->
<execution>
<id>unshaded-default-jar</id>
<goals>
<goal>jar</goal>
</goals>
<phase>none</phase>
</execution>
<execution>
<id>test-jar</id>
<goals>
@ -341,13 +375,14 @@
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<id>gatk-executable</id>
<id>package-jar</id>
<goals>
<goal>shade</goal>
</goals>
<phase>none</phase>
<configuration>
<minimizeJar>true</minimizeJar>
<createDependencyReducedPom>false</createDependencyReducedPom>
<artifactSet>
<excludes>
<exclude>org.broadinstitute.gatk:gsalib:tar.gz:*</exclude>
@ -405,7 +440,7 @@
<phase>none</phase>
<configuration>
<descriptors>
<descriptor>src/main/assembly/binary-dist.xml</descriptor>
<descriptor>${gatk.assembly.directory}/binary-dist.xml</descriptor>
</descriptors>
</configuration>
</execution>
@ -437,7 +472,7 @@
</configuration>
</execution>
<execution>
<id>link-binary-jar</id>
<id>link-executable-jar</id>
<goals>
<goal>link</goal>
</goals>
@ -445,7 +480,26 @@
<configuration>
<links>
<link>
<dst>${gatk.basedir}/target/${gatk.binary-dist.name}.${project.packaging}</dst>
<dst>${gatk.shortcut.directory}/${gatk.binary-dist.name}.${project.packaging}</dst>
<src>${gatk.executable.directory}/${gatk.binary-dist.name}.${project.packaging}</src>
</link>
</links>
</configuration>
</execution>
<execution>
<id>link-package-jar</id>
<goals>
<goal>link</goal>
</goals>
<phase>none</phase>
<configuration>
<links>
<link>
<dst>${gatk.package.directory}/${gatk.binary-dist.name}.${project.packaging}</dst>
<src>${project.build.directory}/${project.build.finalName}.${project.packaging}</src>
</link>
<link>
<dst>${gatk.shortcut.directory}/${gatk.binary-dist.name}.${project.packaging}</dst>
<src>${project.build.directory}/${project.build.finalName}.${project.packaging}</src>
</link>
</links>
@ -624,6 +678,21 @@
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<!--
TODO: Separate maven modules into separate git repos?
Until then, keep devs from accidentally mixing utils/engine/tools.
-->
<id>check-utils-engine-tools</id>
<goals>
<goal>exec</goal>
</goals>
<phase>process-sources</phase>
<inherited>false</inherited>
<configuration>
<executable>${gatk.shell.directory}/check_utils_engine_tools.sh</executable>
</configuration>
</execution>
<execution>
<!--
TODO: Remove after 3.3+ release.
@ -637,7 +706,7 @@
<phase>process-test-resources</phase>
<inherited>false</inherited>
<configuration>
<executable>${gatk.basedir}/public/src/main/scripts/shell/delete_maven_links.sh</executable>
<executable>${gatk.shell.directory}/delete_maven_links.sh</executable>
</configuration>
</execution>
</executions>
@ -689,7 +758,7 @@
<!-- Only generate the GATK Docs across the parent aggregation, not the children too. -->
<inherited>false</inherited>
<configuration>
<doclet>org.broadinstitute.gatk.utils.help.GATKDoclet</doclet>
<doclet>org.broadinstitute.gatk.tools.walkers.help.WalkerDoclet</doclet>
<docletArtifact>
<groupId>${project.groupId}</groupId>
<artifactId>gatk-package-distribution</artifactId>
@ -733,6 +802,26 @@
</modules>
</profile>
<!-- Optionally do not shade/package jars -->
<!--
NOTE: Profile id "fast" comes from comments in PR #771.
The name is meant to be memorable, but is highly non-specific. Users are forewarned that
behavior of this profile, or the identifier itself, may be heavily modified in the future.
Hardcode usage in non-VCS controlled scripts at your own risk.
-->
<profile>
<id>fast</id>
<activation>
<property>
<name>disable.shadepackage</name>
</property>
</activation>
<properties>
<gatk.unpack.phase>none</gatk.unpack.phase>
<gatk.shade.phase>none</gatk.shade.phase>
</properties>
</profile>
<!-- Collection of properties for use during package testing -->
<profile>
<id>packagetests-enabled</id>
@ -746,6 +835,8 @@
<maven.javadoc.skip>true</maven.javadoc.skip>
<gatk.generate-gatk-extensions.skipped>true</gatk.generate-gatk-extensions.skipped>
<gatk.jar.phase>none</gatk.jar.phase>
<gatk.unpack.phase>none</gatk.unpack.phase>
<gatk.shade.phase>none</gatk.shade.phase>
<gatk.generate-resources.phase>none</gatk.generate-resources.phase>
<gatk.process-resources.phase>none</gatk.process-resources.phase>
<gatk.process-test-resources.phase>none</gatk.process-test-resources.phase>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.3</version>
<version>3.4-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
@ -15,8 +15,6 @@
<properties>
<gatk.basedir>${project.basedir}/../..</gatk.basedir>
<gatk.unpack.phase>prepare-package</gatk.unpack.phase>
<gatk.shade.phase>package</gatk.shade.phase>
<app.main.class>org.broadinstitute.gatk.engine.CommandLineGATK</app.main.class>
<gatk.binary-dist.name>GenomeAnalysisTK</gatk.binary-dist.name>
</properties>
@ -43,9 +41,14 @@
<artifactId>gatk-tools-protected</artifactId>
<version>${project.version}</version>
</dependency>
<!-- slf4j bindings must only be at the package level: http://www.slf4j.org/manual.html -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</dependency>
<!-- Tribble codecs & the variant package (VCF, BCF, and VariantContext) -->
<dependency>
<groupId>samtools</groupId>
<groupId>com.github.samtools</groupId>
<artifactId>htsjdk</artifactId>
</dependency>
<!-- Workaround - depend on the logger impl required by JEXL -->
@ -73,7 +76,7 @@
<!-- Required for binary-dist assembly, excluded by shade -->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>gatk-engine</artifactId>
<artifactId>gatk-utils</artifactId>
<version>${project.version}</version>
<classifier>example-resources</classifier>
<type>tar.bz2</type>
@ -164,6 +167,25 @@
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>executable-jar</id>
<phase>${gatk.jar.phase}</phase>
</execution>
<execution>
<id>default-jar</id>
<phase>none</phase>
</execution>
<execution>
<id>unshaded-default-jar</id>
<phase>${gatk.jar.phase}</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
@ -172,6 +194,10 @@
<id>unpack-direct-dependencies</id>
<phase>${gatk.unpack.phase}</phase>
</execution>
<execution>
<id>executable-jar-lib</id>
<phase>${gatk.jar.phase}</phase>
</execution>
</executions>
</plugin>
@ -180,7 +206,7 @@
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<id>gatk-executable</id>
<id>package-jar</id>
<phase>${gatk.shade.phase}</phase>
</execution>
</executions>
@ -202,7 +228,11 @@
<artifactId>maven-junction-plugin</artifactId>
<executions>
<execution>
<id>link-binary-jar</id>
<id>link-executable-jar</id>
<phase>${gatk.jar.phase}</phase>
</execution>
<execution>
<id>link-package-jar</id>
<phase>${gatk.shade.phase}</phase>
</execution>
<execution>
@ -231,20 +261,6 @@
</build>
<profiles>
<profile>
<id>packagetests-enabled</id>
<activation>
<property>
<name>gatk.packagetests.enabled</name>
<value>true</value>
</property>
</activation>
<properties>
<gatk.jar.phase>none</gatk.jar.phase>
<gatk.unpack.phase>none</gatk.unpack.phase>
<gatk.shade.phase>none</gatk.shade.phase>
</properties>
</profile>
<profile>
<id>gsadev</id>
<activation>

View File

@ -1,22 +0,0 @@
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2 http://maven.apache.org/xsd/assembly-1.1.2.xsd">
<id>binary-dist</id>
<formats>
<format>tar.bz2</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<dependencySets>
<dependencySet>
<includes>
<include>org.broadinstitute.gatk:gatk-package-distribution</include>
</includes>
<outputFileNameMapping>${gatk.binary-dist.name}.${artifact.extension}</outputFileNameMapping>
</dependencySet>
<dependencySet>
<outputDirectory>resources</outputDirectory>
<unpack>true</unpack>
<includes>
<include>org.broadinstitute.gatk:gatk-engine:tar.bz2:example-resources</include>
</includes>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.3</version>
<version>3.4-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
@ -41,6 +41,10 @@
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</dependency>
<dependency>
<groupId>com.github.broadinstitute</groupId>
<artifactId>picard</artifactId>
</dependency>
<!--
Extensions generator dependency only applies to the exec:exec,
not the artifact, but don't know another way to include
@ -66,7 +70,7 @@
-->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>gatk-tools-public</artifactId>
<artifactId>gatk-utils</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>

View File

@ -0,0 +1,76 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.queue.qscripts.examples
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
class ExampleFindCoveredIntervals extends QScript {
@Input(doc="The reference file for the bam files.", shortName="R")
var referenceFile: File = _
@Input(doc="Bam file to genotype.", shortName="I")
var bamFile: File = _
@Output(doc="Bam output", shortName="out")
var outFile: File = _
def script() {
val fci = new FindCoveredIntervals
fci.R = referenceFile
fci.memoryLimit = 2
fci.scatterCount = 3
fci.I :+= bamFile
fci.out = outFile
add(fci)
}
}

View File

@ -0,0 +1,80 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.queue.qscripts.examples
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
class ExampleHaplotypeCaller extends QScript {
@Input(doc="The reference file for the bam files.", shortName="R")
var referenceFile: File = _
@Input(doc="Bam file to genotype.", shortName="I")
var bamFile: File = _
@Output(doc="VCF output", shortName="out")
var outFile: File = _
@Argument(doc="One or more genomic intervals over which to operate", shortName="L", required=false)
var intervals: Seq[String] = Nil
def script() {
val hc = new HaplotypeCaller
hc.R = referenceFile
hc.memoryLimit = 2
hc.scatterCount = 3
hc.I :+= bamFile
hc.out = outFile
hc.intervalsString = intervals
add(hc)
}
}

View File

@ -91,7 +91,7 @@ class ExampleUnifiedGenotyperQueueTest {
" -R " + BaseTest.hg18Reference,
" -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam",
" -L " + intervalsPath).mkString
spec.jobRunners = Seq("Lsf706")
spec.jobRunners = Seq("GridEngine")
QueueTest.executeTest(spec)
}

View File

@ -0,0 +1,93 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.queue.pipeline.examples
import org.broadinstitute.gatk.queue.pipeline.{QueueTest, QueueTestSpec}
import org.broadinstitute.gatk.utils.BaseTest
import org.testng.annotations.Test
class UnmappedExcludedQueueTest {
@Test(timeOut=36000000)
def testUnmappedExclusion(): Unit = {
//FindCoveredIntervals is an ActiveRegionWalker, which throws an exception if it encounters unmapped reads
//But it's partitioned by contigs, which by default includes unmapped reads. Verify that the unmapped reads
//are correctly not added in this case
val testOut = "fci.out"
val spec = new QueueTestSpec
spec.name = "findcoveredintervals"
spec.args = Array(
" -S " + QueueTest.protectedQScriptsPackageDir + "examples/ExampleFindCoveredIntervals.scala",
" -R " + BaseTest.publicTestDir + "exampleFASTA.fasta",
" -I " + BaseTest.publicTestDir + "exampleBAM.bam",
" -out " + testOut).mkString
//The output file is blank - the real test is simply that it runs to completion
spec.fileMD5s += testOut -> "d41d8cd98f00b204e9800998ecf8427e"
QueueTest.executeTest(spec)
//Regression Test: HaplotypeCaller is also an ActiveRegionWalker, and is much more widely used. Explicitly test
//it as well
val hcTestOut = "hctest.vcf"
val hcSpec = new QueueTestSpec
hcSpec.name = "haplotypecaller"
hcSpec.args = Array(
" -S " + QueueTest.protectedQScriptsPackageDir + "examples/ExampleHaplotypeCaller.scala",
" -R " + BaseTest.publicTestDir + "exampleFASTA.fasta",
" -I " + BaseTest.publicTestDir + "exampleBAM.bam",
" -out " + hcTestOut).mkString
//The output file is blank - the real test is simply that it runs to completion
QueueTest.executeTest(hcSpec)
}
}

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.3</version>
<version>3.4-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
@ -15,8 +15,6 @@
<properties>
<gatk.basedir>${project.basedir}/../..</gatk.basedir>
<gatk.unpack.phase>prepare-package</gatk.unpack.phase>
<gatk.shade.phase>package</gatk.shade.phase>
<gatk.binary-dist.name>Queue</gatk.binary-dist.name>
<app.main.class>org.broadinstitute.gatk.queue.QCommandLine</app.main.class>
</properties>
@ -49,7 +47,7 @@
</dependency>
<!-- Picard -->
<dependency>
<groupId>picard</groupId>
<groupId>com.github.broadinstitute</groupId>
<artifactId>picard</artifactId>
</dependency>
<!-- JavaMail -->
@ -80,7 +78,7 @@
<!-- Required for binary-dist assembly, excluded by shade -->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>gatk-engine</artifactId>
<artifactId>gatk-utils</artifactId>
<version>${project.version}</version>
<classifier>example-resources</classifier>
<type>tar.bz2</type>
@ -171,6 +169,25 @@
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>executable-jar</id>
<phase>${gatk.jar.phase}</phase>
</execution>
<execution>
<id>default-jar</id>
<phase>none</phase>
</execution>
<execution>
<id>unshaded-default-jar</id>
<phase>${gatk.jar.phase}</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
@ -179,6 +196,10 @@
<id>unpack-direct-dependencies</id>
<phase>${gatk.unpack.phase}</phase>
</execution>
<execution>
<id>executable-jar-lib</id>
<phase>${gatk.jar.phase}</phase>
</execution>
</executions>
</plugin>
@ -187,7 +208,7 @@
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<id>gatk-executable</id>
<id>package-jar</id>
<phase>${gatk.shade.phase}</phase>
</execution>
</executions>
@ -209,7 +230,11 @@
<artifactId>maven-junction-plugin</artifactId>
<executions>
<execution>
<id>link-binary-jar</id>
<id>link-executable-jar</id>
<phase>${gatk.jar.phase}</phase>
</execution>
<execution>
<id>link-package-jar</id>
<phase>${gatk.shade.phase}</phase>
</execution>
<execution>
@ -238,20 +263,6 @@
</build>
<profiles>
<profile>
<id>packagetests-enabled</id>
<activation>
<property>
<name>gatk.packagetests.enabled</name>
<value>true</value>
</property>
</activation>
<properties>
<gatk.jar.phase>none</gatk.jar.phase>
<gatk.unpack.phase>none</gatk.unpack.phase>
<gatk.shade.phase>none</gatk.shade.phase>
</properties>
</profile>
<profile>
<id>gsadev</id>
<activation>

View File

@ -1,23 +0,0 @@
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2 http://maven.apache.org/xsd/assembly-1.1.2.xsd">
<id>binary-dist</id>
<formats>
<format>tar.bz2</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<dependencySets>
<dependencySet>
<includes>
<include>org.broadinstitute.gatk:gatk-queue-package-distribution</include>
</includes>
<outputFileNameMapping>${gatk.binary-dist.name}.${artifact.extension}</outputFileNameMapping>
</dependencySet>
<dependencySet>
<outputDirectory>resources</outputDirectory>
<unpack>true</unpack>
<includes>
<include>org.broadinstitute.gatk:gatk-engine:tar.bz2:example-resources</include>
<include>org.broadinstitute.gatk:gatk-queue-extensions-public:tar.bz2:example-resources</include>
</includes>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.3</version>
<version>3.4-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
@ -48,7 +48,15 @@
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>gatk-tools-public</artifactId>
<artifactId>gatk-utils</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>gatk-engine</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
@ -63,16 +71,6 @@
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<executions>
<execution>
<id>copy-resource-bundle-log4j</id>
<phase>prepare-package</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>

View File

@ -1,232 +0,0 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.arguments;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorImplementation;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingOutputMode;
import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode;
import org.broadinstitute.gatk.utils.collections.DefaultHashMap;
import htsjdk.variant.variantcontext.VariantContext;
import java.io.File;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.util.Collections;
import java.util.Map;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 8/20/12
* A collection of arguments that are common to the various callers.
* This is pulled out so that every caller isn't exposed to the arguments from every other caller.
*/
public class StandardCallerArgumentCollection implements Cloneable {
@ArgumentCollection
public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection();
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
public GenotypingOutputMode genotypingOutputMode = GenotypingOutputMode.DISCOVERY;
/**
* When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
*/
@Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
public RodBinding<VariantContext> alleles;
/**
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
* will try to remove (N * contamination fraction) bases for each alternate allele.
*/
@Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0;
/**
* This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples.
* Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION.
**/
@Advanced
@Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"<SampleID><TAB><Contamination>\" (Contamination is double) per line; No header.", required = false)
public File CONTAMINATION_FRACTION_FILE = null;
/**
* Indicates whether there is some sample contamination present.
*/
private boolean sampleContaminationWasLoaded = false;
/**
*
* @return an _Immutable_ copy of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION
*/
public Map<String,Double> getSampleContamination(){
//make sure that the default value is set up right
sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
if (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0)
sampleContaminationWasLoaded = true;
return Collections.unmodifiableMap(sampleContamination);
}
public void setSampleContamination(DefaultHashMap<String, Double> sampleContamination) {
this.sampleContamination.clear();
this.sampleContaminationWasLoaded = !Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0;
if (!sampleContaminationWasLoaded)
for (final Double d : sampleContamination.values())
if (!Double.isNaN(d) && d > 0.0) {
sampleContaminationWasLoaded = true;
break;
}
this.sampleContamination.putAll(sampleContamination);
this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
}
/**
* Returns true if there is some sample contamination present, false otherwise.
* @return {@code true} iff there is some sample contamination
*/
public boolean isSampleContaminationPresent() {
return (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) || sampleContaminationWasLoaded;
}
//Needs to be here because it uses CONTAMINATION_FRACTION
private DefaultHashMap<String,Double> sampleContamination = new DefaultHashMap<String,Double>(CONTAMINATION_FRACTION);
/**
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
*/
@Hidden
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
public AFCalculatorImplementation requestedAlleleFrequencyCalculationModel;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY;
/**
* Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
* This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
* WARNINGS:
* - This feature will inflate VCF file size considerably.
* - All SNP ALT alleles will be emitted with corresponding 10 PL values.
* - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
*/
@Advanced
@Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)
public boolean annotateAllSitesWithPLs = false;
/**
* Creates a Standard caller argument collection with default values.
*/
public StandardCallerArgumentCollection() { }
/**
* "Casts" a caller argument collection into another type.
*
* <p>Common fields values are copied across</p>
* @param clazz the class of the result.
* @param <T> result argument collection class.
* @return never {@code null}.
*/
public <T extends StandardCallerArgumentCollection> T cloneTo(final Class<T> clazz) {
// short cut: just use regular clone if it happens to be the same class.
if (clazz == getClass())
return (T) clone();
try {
final T result = clazz.newInstance();
for (final Field field : getClass().getFields()) {
// just copy common fields.
if (!field.getDeclaringClass().isAssignableFrom(clazz))
continue;
final int fieldModifiers = field.getModifiers();
if ((fieldModifiers & UNCOPYABLE_MODIFIER_MASK) != 0) continue;
//Use the clone() method if appropriate
if (Cloneable.class.isAssignableFrom(field.getType())) {
Method clone = field.getType().getMethod("clone");
field.set(result, clone.invoke(field.get(this)));
} else
field.set(result,field.get(this));
}
return result;
} catch (final Exception ex) {
throw new IllegalStateException(ex);
}
}
/**
* Creates a copy of this configuration.
* @return never {@code null}.
*/
@Override
public StandardCallerArgumentCollection clone() {
try {
StandardCallerArgumentCollection cloned = (StandardCallerArgumentCollection) super.clone();
cloned.genotypeArgs = genotypeArgs.clone();
return cloned;
} catch (CloneNotSupportedException e) {
throw new IllegalStateException("unreachable code");
}
}
/**
* Holds a modifiers mask that identifies those fields that cannot be copied between
* StandardCallerArgumentCollections.
*/
private final int UNCOPYABLE_MODIFIER_MASK = Modifier.PRIVATE | Modifier.STATIC | Modifier.FINAL;
}

View File

@ -0,0 +1,138 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.commandline.Gatherer;
import org.broadinstitute.gatk.utils.report.GATKReport;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
* User: carneiro
* Date: 3/29/11
*/
public class BQSRGatherer extends Gatherer {
private static final Logger logger = Logger.getLogger(BQSRGatherer.class);
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file";
private static final String MISSING_OUTPUT_FILE = "missing output file name";
private static final String MISSING_READ_GROUPS = "Missing read group(s)";
@Override
public void gather(final List<File> inputs, final File output) {
final PrintStream outputFile;
try {
outputFile = new PrintStream(output);
} catch(FileNotFoundException e) {
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
}
final GATKReport report = gatherReport(inputs);
report.print(outputFile);
}
/**
* Gathers the input recalibration reports into a single report.
*
* @param inputs Input recalibration GATK reports
* @return gathered recalibration GATK report
*/
public static GATKReport gatherReport(final List<File> inputs) {
final SortedSet<String> allReadGroups = new TreeSet<String>();
final LinkedHashMap<File, Set<String>> inputReadGroups = new LinkedHashMap<File, Set<String>>();
// Get the read groups from each input report
for (final File input : inputs) {
final Set<String> readGroups = RecalibrationReport.getReadGroups(input);
inputReadGroups.put(input, readGroups);
allReadGroups.addAll(readGroups);
}
// Log the read groups that are missing from specific inputs
for (Map.Entry<File, Set<String>> entry: inputReadGroups.entrySet()) {
final File input = entry.getKey();
final Set<String> readGroups = entry.getValue();
if (allReadGroups.size() != readGroups.size()) {
// Since this is not completely unexpected, more than debug, but less than a proper warning.
logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath());
for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) {
logger.info(" " + readGroup);
}
}
}
RecalibrationReport generalReport = null;
for (File input : inputs) {
final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups);
if( inputReport.isEmpty() ) { continue; }
if (generalReport == null)
generalReport = inputReport;
else
generalReport.combine(inputReport);
}
if (generalReport == null)
throw new ReviewedGATKException(EMPTY_INPUT_LIST);
generalReport.calculateQuantizedQualities();
return generalReport.createGATKReport();
}
}

View File

@ -0,0 +1,104 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.WalkerManager;
import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
import org.broadinstitute.gatk.engine.walkers.Walker;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
/**
* A ReadTransformer that applies BQSR on the fly to reads
*
* User: rpoplin
* Date: 2/13/12
*/
public class BQSRReadTransformer extends ReadTransformer {
private boolean enabled;
private BaseRecalibration bqsr = null;
@Override
public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.MUST_BE_FIRST; }
@Override
public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
this.enabled = engine.hasBQSRArgumentSet();
if ( enabled ) {
// TODO -- See important note below about applying BQSR to a reduced BAM file:
// If it is important to make sure that BQSR is not applied (as opposed to having the covariates computed) against a reduced bam file,
// we need to figure out how to make this work. The problem is that the ReadTransformers are initialized before the ReadDataSource
// inside the GenomeAnalysisEngine, so we generate a NPE when trying to retrieve the SAMFileHeaders. Ultimately, I don't think this is
// a necessary check anyways since we disallow running BaseRecalibrator on reduced bams (so we can't generate the recal tables to use here).
// Although we could add this check to the apply() method below, it's kind of ugly and inefficient.
// The call here would be: RecalUtils.checkForInvalidRecalBams(engine.getSAMFileHeaders(), engine.getArguments().ALLOW_BQSR_ON_REDUCED_BAMS);
final BQSRArgumentSet args = engine.getBQSRArgumentSet();
this.bqsr = new BaseRecalibration(args.getRecalFile(), args.getQuantizationLevels(), args.shouldDisableIndelQuals(), args.getPreserveQscoresLessThan(), args.shouldEmitOriginalQuals(), args.getGlobalQScorePrior());
}
final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, BQSRMode.class);
return mode.ApplicationTime();
}
@Override
public boolean enabled() {
return enabled;
}
/**
* initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads.
*/
@Override
public GATKSAMRecord apply(GATKSAMRecord read) {
bqsr.recalibrateRead(read);
return read;
}
}

View File

@ -0,0 +1,208 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import com.google.java.contract.Ensures;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.SAMUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.recalibration.EventType;
import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* Utility methods to facilitate on-the-fly base quality score recalibration.
*
* User: carneiro and rpoplin
* Date: 2/4/12
*/
public class BaseRecalibration {
private static Logger logger = Logger.getLogger(BaseRecalibration.class);
private final static boolean TEST_CACHING = false;
private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
private final RecalibrationTables recalibrationTables;
private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation
private final boolean disableIndelQuals;
private final int preserveQLessThan;
private final double globalQScorePrior;
private final boolean emitOriginalQuals;
/**
* Constructor using a GATK Report file
*
* @param RECAL_FILE a GATK Report file containing the recalibration information
* @param quantizationLevels number of bins to quantize the quality scores
* @param disableIndelQuals if true, do not emit base indel qualities
* @param preserveQLessThan preserve quality scores less than this value
*/
public BaseRecalibration(final File RECAL_FILE, final int quantizationLevels, final boolean disableIndelQuals, final int preserveQLessThan, final boolean emitOriginalQuals, final double globalQScorePrior) {
RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
recalibrationTables = recalibrationReport.getRecalibrationTables();
requestedCovariates = recalibrationReport.getRequestedCovariates();
quantizationInfo = recalibrationReport.getQuantizationInfo();
if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores
quantizationInfo.noQuantization();
else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report.
quantizationInfo.quantizeQualityScores(quantizationLevels);
this.disableIndelQuals = disableIndelQuals;
this.preserveQLessThan = preserveQLessThan;
this.globalQScorePrior = globalQScorePrior;
this.emitOriginalQuals = emitOriginalQuals;
}
/**
* Recalibrates the base qualities of a read
*
* It updates the base qualities of the read with the new recalibrated qualities (for all event types)
*
* Implements a serial recalibration of the reads using the combinational table.
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
*
* Given the full recalibration table, we perform the following preprocessing steps:
*
* - calculate the global quality score shift across all data [DeltaQ]
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
* - The final shift equation is:
*
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
*
* @param read the read to recalibrate
*/
public void recalibrateRead(final GATKSAMRecord read) {
if (emitOriginalQuals && read.getAttribute(SAMTag.OQ.name()) == null) { // Save the old qualities if the tag isn't already taken in the read
try {
read.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(read.getBaseQualities()));
} catch (IllegalArgumentException e) {
throw new UserException.MalformedBAM(read, "illegal base quality encountered; " + e.getMessage());
}
}
final ReadCovariates readCovariates = RecalUtils.computeCovariates(read, requestedCovariates);
final int readLength = read.getReadLength();
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
if (disableIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) {
read.setBaseQualities(null, errorModel);
continue;
}
final byte[] quals = read.getBaseQualities(errorModel);
// get the keyset for this base using the error model
final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel);
// the rg key is constant over the whole read, the global deltaQ is too
final int rgKey = fullReadKeySet[0][0];
final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal());
if( empiricalQualRG != null ) {
final double epsilon = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? globalQScorePrior : empiricalQualRG.getEstimatedQReported() );
for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read
final byte origQual = quals[offset];
// only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
if ( origQual >= preserveQLessThan ) {
// get the keyset for this base using the error model
final int[] keySet = fullReadKeySet[offset];
final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(keySet[0], keySet[1], errorModel.ordinal());
final List<RecalDatum> empiricalQualCovs = new ArrayList<RecalDatum>();
for (int i = 2; i < requestedCovariates.length; i++) {
if (keySet[i] < 0) {
continue;
}
empiricalQualCovs.add(recalibrationTables.getTable(i).get(keySet[0], keySet[1], keySet[i], errorModel.ordinal()));
}
double recalibratedQualDouble = hierarchicalBayesianQualityEstimate( epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovs );
// recalibrated quality is bound between 1 and MAX_QUAL
final byte recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQualDouble), RecalDatum.MAX_RECALIBRATED_Q_SCORE);
// return the quantized version of the recalibrated quality
final byte recalibratedQualityScore = quantizationInfo.getQuantizedQuals().get(recalibratedQual);
quals[offset] = recalibratedQualityScore;
}
}
}
// finally update the base qualities in the read
read.setBaseQualities(quals, errorModel);
}
}
@Ensures("result > 0.0")
protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List<RecalDatum> empiricalQualCovs ) {
final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon );
final double deltaQReported = ( empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) );
double deltaQCovariates = 0.0;
for( final RecalDatum empiricalQualCov : empiricalQualCovs ) {
deltaQCovariates += ( empiricalQualCov == null ? 0.0 : empiricalQualCov.getEmpiricalQuality(deltaQReported + globalDeltaQ + epsilon) - (deltaQReported + globalDeltaQ + epsilon) );
}
return epsilon + globalDeltaQ + deltaQReported + deltaQCovariates;
}
}

View File

@ -0,0 +1,500 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.report.GATKReport;
import org.broadinstitute.gatk.utils.report.GATKReportTable;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import java.io.PrintStream;
import java.util.*;
/**
* A general algorithm for quantizing quality score distributions to use a specific number of levels
*
* Takes a histogram of quality scores and a desired number of levels and produces a
* map from original quality scores -> quantized quality scores.
*
* Note that this data structure is fairly heavy-weight, holding lots of debugging and
* calculation information. If you want to use it efficiently at scale with lots of
* read groups the right way to do this:
*
* Map<ReadGroup, List<Byte>> map
* for each read group rg:
* hist = getQualHist(rg)
* QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual)
* map.set(rg, qq.getOriginalToQuantizedMap())
*
* This map would then be used to look up the appropriate original -> quantized
* quals for each read as it comes in.
*
* @author Mark Depristo
* @since 3/2/12
*/
public class QualQuantizer {
final private static Set<QualInterval> MY_EMPTY_SET = Collections.emptySet();
private static Logger logger = Logger.getLogger(QualQuantizer.class);
/**
* Inputs to the QualQuantizer
*/
final int nLevels, minInterestingQual;
final List<Long> nObservationsPerQual;
/**
* Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28).
*
* Has the same range as nObservationsPerQual
*/
final List<Byte> originalToQuantizedMap;
/** Sorted set of qual intervals.
*
* After quantize() this data structure contains only the top-level qual intervals
*/
final TreeSet<QualInterval> quantizedIntervals;
/**
* Protected creator for testng use only
*/
protected QualQuantizer(final int minInterestingQual) {
this.nObservationsPerQual = Collections.emptyList();
this.nLevels = 0;
this.minInterestingQual = minInterestingQual;
this.quantizedIntervals = null;
this.originalToQuantizedMap = null;
}
/**
* Creates a QualQuantizer for the histogram that has nLevels
*
* Note this is the only interface to the system. After creating this object
* the map can be obtained via getOriginalToQuantizedMap()
*
* @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that
* this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the
* way up to the largest quality score possible in the reads. OK if the histogram includes many 0
* count bins, as these are quantized for free.
* @param nLevels the desired number of distinct quality scores to represent the full original range. Must
* be at least 1.
* @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely
* merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and
* quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing
* all data with Q0-Q10.
*/
public QualQuantizer(final List<Long> nObservationsPerQual, final int nLevels, final int minInterestingQual) {
this.nObservationsPerQual = nObservationsPerQual;
this.nLevels = nLevels;
this.minInterestingQual = minInterestingQual;
// some sanity checking
if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedGATKException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual));
if ( nLevels < 0 ) throw new ReviewedGATKException("nLevels must be >= 0");
if ( minInterestingQual < 0 ) throw new ReviewedGATKException("minInterestingQual must be >= 0");
// actually run the quantizer
this.quantizedIntervals = quantize();
// store the map
this.originalToQuantizedMap = intervalsToMap(quantizedIntervals);
}
/**
* Represents an contiguous interval of quality scores.
*
* qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2
*/
@Invariant({
"qStart <= qEnd",
"qStart >= 0",
"qEnd <= 1000",
"nObservations >= 0",
"nErrors >= 0",
"nErrors <= nObservations",
"fixedQual >= -1 && fixedQual <= QualityUtils.MAX_SAM_QUAL_SCORE",
"mergeOrder >= 0"})
protected final class QualInterval implements Comparable<QualInterval> {
final int qStart, qEnd, fixedQual, level;
final long nObservations, nErrors;
final Set<QualInterval> subIntervals;
/** for debugging / visualization. When was this interval created? */
int mergeOrder;
protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) {
this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET);
}
protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set<QualInterval> subIntervals) {
this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals);
}
protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) {
this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET);
}
@Requires("level >= 0")
public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set<QualInterval> subIntervals) {
this.qStart = qStart;
this.qEnd = qEnd;
this.nObservations = nObservations;
this.nErrors = nErrors;
this.fixedQual = fixedQual;
this.level = level;
this.mergeOrder = 0;
this.subIntervals = Collections.unmodifiableSet(subIntervals);
}
/**
* @return Human readable name of this interval: e.g., 10-12
*/
public String getName() {
return qStart + "-" + qEnd;
}
@Override
public String toString() {
return "QQ:" + getName();
}
/**
* @return the error rate (in real space) of this interval, or 0 if there are no observations
*/
@Ensures("result >= 0.0")
public double getErrorRate() {
if ( hasFixedQual() )
return QualityUtils.qualToErrorProb((byte)fixedQual);
else if ( nObservations == 0 )
return 0.0;
else
return (nErrors+1) / (1.0 * (nObservations+1));
}
/**
* @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual.
*/
@Ensures("result >= 0 && result <= QualityUtils.MAX_SAM_QUAL_SCORE")
public byte getQual() {
if ( ! hasFixedQual() )
return QualityUtils.errorProbToQual(getErrorRate());
else
return (byte)fixedQual;
}
/**
* @return true if this bin is using a fixed qual
*/
public boolean hasFixedQual() {
return fixedQual != -1;
}
@Override
public int compareTo(final QualInterval qualInterval) {
return Integer.valueOf(this.qStart).compareTo(qualInterval.qStart);
}
/**
* Create a interval representing the merge of this interval and toMerge
*
* Errors and observations are combined
* Subintervals updated in order of left to right (determined by qStart)
* Level is 1 + highest level of this and toMerge
* Order must be updated elsewhere
*
* @param toMerge
* @return newly created merged QualInterval
*/
@Requires({"toMerge != null"})
@Ensures({
"result != null",
"result.nObservations >= this.nObservations",
"result.nObservations >= toMerge.nObservations",
"result.nErrors >= this.nErrors",
"result.nErrors >= toMerge.nErrors",
"result.qStart == Math.min(this.qStart, toMerge.qStart)",
"result.qEnd == Math.max(this.qEnd, toMerge.qEnd)",
"result.level > Math.max(this.level, toMerge.level)",
"result.subIntervals.size() == 2"
})
public QualInterval merge(final QualInterval toMerge) {
final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge;
final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this;
if ( left.qEnd + 1 != right.qStart )
throw new ReviewedGATKException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right);
final long nCombinedObs = left.nObservations + right.nObservations;
final long nCombinedErr = left.nErrors + right.nErrors;
final int level = Math.max(left.level, right.level) + 1;
final Set<QualInterval> subIntervals = new HashSet<QualInterval>(Arrays.asList(left, right));
QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals);
return merged;
}
public double getPenalty() {
return calcPenalty(getErrorRate());
}
/**
* Calculate the penalty of this interval, given the overall error rate for the interval
*
* If the globalErrorRate is e, this value is:
*
* sum_i |log10(e_i) - log10(e)| * nObservations_i
*
* each the index i applies to all leaves of the tree accessible from this interval
* (found recursively from subIntervals as necessary)
*
* @param globalErrorRate overall error rate in real space against which we calculate the penalty
* @return the cost of approximating the bins in this interval with the globalErrorRate
*/
@Requires("globalErrorRate >= 0.0")
@Ensures("result >= 0.0")
private double calcPenalty(final double globalErrorRate) {
if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
return 0.0;
if ( subIntervals.isEmpty() ) {
// this is leave node
if ( this.qEnd <= minInterestingQual )
// It's free to merge up quality scores below the smallest interesting one
return 0;
else {
return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations;
}
} else {
double sum = 0;
for ( final QualInterval interval : subIntervals )
sum += interval.calcPenalty(globalErrorRate);
return sum;
}
}
}
/**
* Main method for computing the quantization intervals.
*
* Invoked in the constructor after all input variables are initialized. Walks
* over the inputs and builds the min. penalty forest of intervals with exactly nLevel
* root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed
* to find the optimal combination.
*
* TODO: develop a smarter algorithm
*
* @return the forest of intervals with size == nLevels
*/
@Ensures({"! result.isEmpty()", "result.size() == nLevels"})
private TreeSet<QualInterval> quantize() {
// create intervals for each qual individually
final TreeSet<QualInterval> intervals = new TreeSet<QualInterval>();
for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) {
final long nObs = nObservationsPerQual.get(qStart);
final double errorRate = QualityUtils.qualToErrorProb((byte)qStart);
final double nErrors = nObs * errorRate;
final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart);
intervals.add(qi);
}
// greedy algorithm:
// while ( n intervals >= nLevels ):
// find intervals to merge with least penalty
// merge it
while ( intervals.size() > nLevels ) {
mergeLowestPenaltyIntervals(intervals);
}
return intervals;
}
/**
* Helper function that finds and merges together the lowest penalty pair of intervals
* @param intervals
*/
@Requires("! intervals.isEmpty()")
private void mergeLowestPenaltyIntervals(final TreeSet<QualInterval> intervals) {
// setup the iterators
final Iterator<QualInterval> it1 = intervals.iterator();
final Iterator<QualInterval> it1p = intervals.iterator();
it1p.next(); // skip one
// walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty
QualInterval minMerge = null;
if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size());
int lastMergeOrder = 0;
while ( it1p.hasNext() ) {
final QualInterval left = it1.next();
final QualInterval right = it1p.next();
final QualInterval merged = left.merge(right);
lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder);
if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) {
if ( logger.isDebugEnabled() ) logger.debug(" Updating merge " + minMerge);
minMerge = merged;
}
}
// now actually go ahead and merge the minMerge pair
if ( logger.isDebugEnabled() ) logger.debug(" => final min merge " + minMerge);
intervals.removeAll(minMerge.subIntervals);
intervals.add(minMerge);
minMerge.mergeOrder = lastMergeOrder + 1;
if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals);
}
/**
* Given a final forest of intervals constructs a list mapping
* list.get(i) => quantized qual to use for original quality score i
*
* This function should be called only once to initialize the corresponding
* cached value in this object, as the calculation is a bit costly.
*
* @param intervals
* @return
*/
@Ensures("result.size() == getNQualsInHistogram()")
private List<Byte> intervalsToMap(final TreeSet<QualInterval> intervals) {
final List<Byte> map = new ArrayList<Byte>(getNQualsInHistogram());
map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE));
for ( final QualInterval interval : intervals ) {
for ( int q = interval.qStart; q <= interval.qEnd; q++ ) {
map.set(q, interval.getQual());
}
}
if ( Collections.min(map) == Byte.MIN_VALUE )
throw new ReviewedGATKException("quantized quality score map contains an un-initialized value");
return map;
}
@Ensures("result > 0")
private final int getNQualsInHistogram() {
return nObservationsPerQual.size();
}
/**
* Write out a GATKReport to visualize the QualQuantization process of this data
* @param out
*/
public void writeReport(PrintStream out) {
final GATKReport report = new GATKReport();
addQualHistogramToReport(report);
addIntervalsToReport(report);
report.print(out);
}
private final void addQualHistogramToReport(final GATKReport report) {
report.addTable("QualHistogram", "Quality score histogram provided to report", 2);
GATKReportTable table = report.getTable("QualHistogram");
table.addColumn("qual");
table.addColumn("count");
for ( int q = 0; q < nObservationsPerQual.size(); q++ ) {
table.set(q, "qual", q);
table.set(q, "count", nObservationsPerQual.get(q));
}
}
private final void addIntervalsToReport(final GATKReport report) {
report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals", 10);
GATKReportTable table = report.getTable("QualQuantizerIntervals");
table.addColumn("name");
table.addColumn("qStart");
table.addColumn("qEnd");
table.addColumn("level");
table.addColumn("merge.order");
table.addColumn("nErrors");
table.addColumn("nObservations");
table.addColumn("qual");
table.addColumn("penalty");
table.addColumn("root.node");
//table.addColumn("subintervals", "NA");
for ( QualInterval interval : quantizedIntervals )
addIntervalToReport(table, interval, true);
}
private final void addIntervalToReport(final GATKReportTable table, final QualInterval interval, final boolean atRootP) {
final String name = interval.getName();
table.set(name, "name", name);
table.set(name, "qStart", interval.qStart);
table.set(name, "qEnd", interval.qEnd);
table.set(name, "level", interval.level);
table.set(name, "merge.order", interval.mergeOrder);
table.set(name, "nErrors", interval.nErrors);
table.set(name, "nObservations", interval.nObservations);
table.set(name, "qual", interval.getQual());
table.set(name, "penalty", String.format("%.1f", interval.getPenalty()));
table.set(name, "root.node", atRootP);
for ( final QualInterval sub : interval.subIntervals )
addIntervalToReport(table, sub, false);
}
public List<Byte> getOriginalToQuantizedMap() {
return originalToQuantizedMap;
}
}

View File

@ -0,0 +1,151 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import org.broadinstitute.gatk.utils.report.GATKReportTable;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.collections.NestedIntegerArray;
import java.util.Arrays;
import java.util.List;
/**
* Class that encapsulates the information necessary for quality score quantization for BQSR
*
* @author carneiro
* @since 3/26/12
*/
public class QuantizationInfo {
private List<Byte> quantizedQuals;
private List<Long> empiricalQualCounts;
private int quantizationLevels;
private QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts, int quantizationLevels) {
this.quantizedQuals = quantizedQuals;
this.empiricalQualCounts = empiricalQualCounts;
this.quantizationLevels = quantizationLevels;
}
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
}
public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) {
final Long [] qualHistogram = new Long[QualityUtils.MAX_SAM_QUAL_SCORE +1]; // create a histogram with the empirical quality distribution
for (int i = 0; i < qualHistogram.length; i++)
qualHistogram[i] = 0L;
final NestedIntegerArray<RecalDatum> qualTable = recalibrationTables.getQualityScoreTable(); // get the quality score table
for (final RecalDatum value : qualTable.getAllValues()) {
final RecalDatum datum = value;
final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL )
qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key
}
empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
quantizeQualityScores(quantizationLevels);
this.quantizationLevels = quantizationLevels;
}
public void quantizeQualityScores(int nLevels) {
QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels
quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
}
public void noQuantization() {
this.quantizationLevels = QualityUtils.MAX_SAM_QUAL_SCORE;
for (int i = 0; i < this.quantizationLevels; i++)
quantizedQuals.set(i, (byte) i);
}
public List<Byte> getQuantizedQuals() {
return quantizedQuals;
}
public int getQuantizationLevels() {
return quantizationLevels;
}
public GATKReportTable generateReportTable(boolean sortByCols) {
GATKReportTable quantizedTable;
if(sortByCols) {
quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN);
} else {
quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3);
}
quantizedTable.addColumn(RecalUtils.QUALITY_SCORE_COLUMN_NAME);
quantizedTable.addColumn(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME);
quantizedTable.addColumn(RecalUtils.QUANTIZED_VALUE_COLUMN_NAME);
for (int qual = 0; qual <= QualityUtils.MAX_SAM_QUAL_SCORE; qual++) {
quantizedTable.set(qual, RecalUtils.QUALITY_SCORE_COLUMN_NAME, qual);
quantizedTable.set(qual, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual));
quantizedTable.set(qual, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual));
}
return quantizedTable;
}
private static int calculateQuantizationLevels(List<Byte> quantizedQuals) {
byte lastByte = -1;
int quantizationLevels = 0;
for (byte q : quantizedQuals) {
if (q != lastByte) {
quantizationLevels++;
lastByte = q;
}
}
return quantizationLevels;
}
}

View File

@ -0,0 +1,176 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.LRUCache;
import org.broadinstitute.gatk.utils.recalibration.EventType;
/**
* The object temporarily held by a read that describes all of it's covariates.
*
* In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap
*
* @author Mauricio Carneiro
* @since 2/8/12
*/
public class ReadCovariates {
private final static Logger logger = Logger.getLogger(ReadCovariates.class);
/**
* How big should we let the LRU cache grow
*/
private static final int LRU_CACHE_SIZE = 500;
/**
* Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen.
* The cache allows us to avoid the expense of recreating these arrays for every read. The LRU
* keeps the total number of cached arrays to less than LRU_CACHE_SIZE.
*
* This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE
*/
private final static ThreadLocal<LRUCache<Integer, int[][][]>> keysCache = new ThreadLocal<LRUCache<Integer, int[][][]>>() {
@Override protected LRUCache<Integer, int[][][]> initialValue() {
return new LRUCache<Integer, int[][][]>(LRU_CACHE_SIZE);
}
};
/**
* The keys cache is only valid for a single covariate count. Normally this will remain constant for the analysis.
* If running multiple analyses (or the unit test suite), it's necessary to clear the cache.
*/
public static void clearKeysCache() {
keysCache.remove();
}
/**
* Our keys, indexed by event type x read length x covariate
*/
private final int[][][] keys;
/**
* The index of the current covariate, used by addCovariate
*/
private int currentCovariateIndex = 0;
public ReadCovariates(final int readLength, final int numberOfCovariates) {
final LRUCache<Integer, int[][][]> cache = keysCache.get();
final int[][][] cachedKeys = cache.get(readLength);
if ( cachedKeys == null ) {
// There's no cached value for read length so we need to create a new int[][][] array
if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size());
keys = new int[EventType.values().length][readLength][numberOfCovariates];
cache.put(readLength, keys);
} else {
keys = cachedKeys;
}
}
public void setCovariateIndex(final int index) {
currentCovariateIndex = index;
}
/**
* Update the keys for mismatch, insertion, and deletion for the current covariate at read offset
*
* NOTE: no checks are performed on the number of covariates, for performance reasons. If the count increases
* after the keysCache has been accessed, this method will throw an ArrayIndexOutOfBoundsException. This currently
* only occurs in the testing harness, and we don't anticipate that it will become a part of normal runs.
*
* @param mismatch the mismatch key value
* @param insertion the insertion key value
* @param deletion the deletion key value
* @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates
*/
public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) {
keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch;
keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion;
keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion;
}
/**
* Get the keys for all covariates at read position for error model
*
* @param readPosition
* @param errorModel
* @return
*/
public int[] getKeySet(final int readPosition, final EventType errorModel) {
return keys[errorModel.ordinal()][readPosition];
}
public int[][] getKeySet(final EventType errorModel) {
return keys[errorModel.ordinal()];
}
// ----------------------------------------------------------------------
//
// routines for testing
//
// ----------------------------------------------------------------------
protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); }
protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); }
protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); }
protected int[] getMismatchesKeySet(final int readPosition) {
return getKeySet(readPosition, EventType.BASE_SUBSTITUTION);
}
protected int[] getInsertionsKeySet(final int readPosition) {
return getKeySet(readPosition, EventType.BASE_INSERTION);
}
protected int[] getDeletionsKeySet(final int readPosition) {
return getKeySet(readPosition, EventType.BASE_DELETION);
}
}

View File

@ -0,0 +1,434 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import htsjdk.samtools.SAMUtils;
import org.apache.commons.math.optimization.fitting.GaussianFunction;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils;
/**
* An individual piece of recalibration data. Each bin counts up the number of observations and the number
* of reference mismatches seen for that combination of covariates.
*
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*/
@Invariant({
"estimatedQReported >= 0.0",
"! Double.isNaN(estimatedQReported)",
"! Double.isInfinite(estimatedQReported)",
"empiricalQuality >= 0.0 || empiricalQuality == UNINITIALIZED",
"! Double.isNaN(empiricalQuality)",
"! Double.isInfinite(empiricalQuality)",
"numObservations >= 0",
"numMismatches >= 0",
"numMismatches <= numObservations"
})
public class RecalDatum {
public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE;
private static final double UNINITIALIZED = -1.0;
/**
* estimated reported quality score based on combined data's individual q-reporteds and number of observations
*/
private double estimatedQReported;
/**
* the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
*/
private double empiricalQuality;
/**
* number of bases seen in total
*/
private long numObservations;
/**
* number of bases seen that didn't match the reference
*/
private double numMismatches;
/**
* used when calculating empirical qualities to avoid division by zero
*/
private static final int SMOOTHING_CONSTANT = 1;
//---------------------------------------------------------------------------------------------------------------
//
// constructors
//
//---------------------------------------------------------------------------------------------------------------
/**
* Create a new RecalDatum with given observation and mismatch counts, and an reported quality
*
* @param _numObservations observations
* @param _numMismatches mismatches
* @param reportedQuality Qreported
*/
public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) {
if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0");
if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0");
if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0");
numObservations = _numObservations;
numMismatches = _numMismatches;
estimatedQReported = reportedQuality;
empiricalQuality = UNINITIALIZED;
}
/**
* Copy copy into this recal datum, overwriting all of this objects data
* @param copy RecalDatum to copy
*/
public RecalDatum(final RecalDatum copy) {
this.numObservations = copy.getNumObservations();
this.numMismatches = copy.getNumMismatches();
this.estimatedQReported = copy.estimatedQReported;
this.empiricalQuality = copy.empiricalQuality;
}
/**
* Add in all of the data from other into this object, updating the reported quality from the expected
* error rate implied by the two reported qualities
*
* @param other RecalDatum to combine
*/
public synchronized void combine(final RecalDatum other) {
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
increment(other.getNumObservations(), other.getNumMismatches());
estimatedQReported = -10 * Math.log10(sumErrors / getNumObservations());
empiricalQuality = UNINITIALIZED;
}
public synchronized void setEstimatedQReported(final double estimatedQReported) {
if ( estimatedQReported < 0 ) throw new IllegalArgumentException("estimatedQReported < 0");
if ( Double.isInfinite(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is infinite");
if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN");
this.estimatedQReported = estimatedQReported;
empiricalQuality = UNINITIALIZED;
}
public final double getEstimatedQReported() {
return estimatedQReported;
}
public final byte getEstimatedQReportedAsByte() {
return (byte)(int)(Math.round(getEstimatedQReported()));
}
//---------------------------------------------------------------------------------------------------------------
//
// Empirical quality score -- derived from the num mismatches and observations
//
//---------------------------------------------------------------------------------------------------------------
/**
* Returns the error rate (in real space) of this interval, or 0 if there are no observations
* @return the empirical error rate ~= N errors / N obs
*/
@Ensures({"result >= 0.0"})
public double getEmpiricalErrorRate() {
if ( numObservations == 0 )
return 0.0;
else {
// cache the value so we don't call log over and over again
final double doubleMismatches = numMismatches + SMOOTHING_CONSTANT;
// smoothing is one error and one non-error observation, for example
final double doubleObservations = numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT;
return doubleMismatches / doubleObservations;
}
}
public synchronized void setEmpiricalQuality(final double empiricalQuality) {
if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0");
if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite");
if ( Double.isNaN(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is NaN");
this.empiricalQuality = empiricalQuality;
}
public final double getEmpiricalQuality() {
return getEmpiricalQuality(getEstimatedQReported());
}
public synchronized final double getEmpiricalQuality(final double conditionalPrior) {
if (empiricalQuality == UNINITIALIZED) {
calcEmpiricalQuality(conditionalPrior);
}
return empiricalQuality;
}
public final byte getEmpiricalQualityAsByte() {
return (byte)(Math.round(getEmpiricalQuality()));
}
//---------------------------------------------------------------------------------------------------------------
//
// toString methods
//
//---------------------------------------------------------------------------------------------------------------
@Override
public String toString() {
return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
}
public String stringForCSV() {
return String.format("%s,%.2f,%.2f", toString(), getEstimatedQReported(), getEmpiricalQuality() - getEstimatedQReported());
}
//---------------------------------------------------------------------------------------------------------------
//
// increment methods
//
//---------------------------------------------------------------------------------------------------------------
public final long getNumObservations() {
return numObservations;
}
public final synchronized void setNumObservations(final long numObservations) {
if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0");
this.numObservations = numObservations;
empiricalQuality = UNINITIALIZED;
}
public final double getNumMismatches() {
return numMismatches;
}
@Requires({"numMismatches >= 0"})
public final synchronized void setNumMismatches(final double numMismatches) {
if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0");
this.numMismatches = numMismatches;
empiricalQuality = UNINITIALIZED;
}
@Requires({"by >= 0"})
public final synchronized void incrementNumObservations(final long by) {
numObservations += by;
empiricalQuality = UNINITIALIZED;
}
@Requires({"by >= 0"})
public final synchronized void incrementNumMismatches(final double by) {
numMismatches += by;
empiricalQuality = UNINITIALIZED;
}
@Requires({"incObservations >= 0", "incMismatches >= 0"})
@Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"})
public final synchronized void increment(final long incObservations, final double incMismatches) {
numObservations += incObservations;
numMismatches += incMismatches;
empiricalQuality = UNINITIALIZED;
}
@Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"})
public final synchronized void increment(final boolean isError) {
increment(1, isError ? 1.0 : 0.0);
}
// -------------------------------------------------------------------------------------
//
// Private implementation helper functions
//
// -------------------------------------------------------------------------------------
/**
* calculate the expected number of errors given the estimated Q reported and the number of observations
* in this datum.
*
* @return a positive (potentially fractional) estimate of the number of errors
*/
@Ensures("result >= 0.0")
private double calcExpectedErrors() {
return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported);
}
/**
* Calculate and cache the empirical quality score from mismatches and observations (expensive operation)
*/
@Requires("empiricalQuality == UNINITIALIZED")
@Ensures("empiricalQuality != UNINITIALIZED")
private synchronized void calcEmpiricalQuality(final double conditionalPrior) {
// smoothing is one error and one non-error observation
final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT;
final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT;
final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, conditionalPrior);
// This is the old and busted point estimate approach:
//final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate());
empiricalQuality = Math.min(empiricalQual, (double) MAX_RECALIBRATED_Q_SCORE);
}
//static final boolean DEBUG = false;
static private final double RESOLUTION_BINS_PER_QUAL = 1.0;
static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) {
final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL;
final double[] log10Posteriors = new double[numBins];
for ( int bin = 0; bin < numBins; bin++ ) {
final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL;
log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors);
//if ( DEBUG )
// System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin]));
}
//if ( DEBUG )
// System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors));
final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors);
final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors);
final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL;
return Qemp;
}
/**
* Quals above this value should be capped down to this value (because they are too high)
* in the base quality score recalibrator
*/
public final static byte MAX_GATK_USABLE_Q_SCORE = 40;
static private final double[] log10QempPriorCache = new double[MAX_GATK_USABLE_Q_SCORE + 1];
static {
// f(x) = a + b*exp(-((x - c)^2 / (2*d^2)))
// Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell".
final double GF_a = 0.0;
final double GF_b = 0.9;
final double GF_c = 0.0;
final double GF_d = 0.5; // with these parameters, deltas can shift at most ~20 Q points
final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d);
for ( int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++ ) {
double log10Prior = Math.log10(gaussian.value((double) i));
if ( Double.isInfinite(log10Prior) )
log10Prior = -Double.MAX_VALUE;
log10QempPriorCache[i] = log10Prior;
}
}
static protected double log10QempPrior(final double Qempirical, final double Qreported) {
final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), MAX_GATK_USABLE_Q_SCORE);
//if ( DEBUG )
// System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference]));
return log10QempPriorCache[difference];
}
static private final long MAX_NUMBER_OF_OBSERVATIONS = Integer.MAX_VALUE - 1;
static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) {
if ( nObservations == 0 )
return 0.0;
// the binomial code requires ints as input (because it does caching). This should theoretically be fine because
// there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow
// before casting down to an int.
if ( nObservations > MAX_NUMBER_OF_OBSERVATIONS ) {
// we need to decrease nErrors by the same fraction that we are decreasing nObservations
final double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations;
nErrors = Math.round((double)nErrors * fraction);
nObservations = MAX_NUMBER_OF_OBSERVATIONS;
}
// this is just a straight binomial PDF
double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10(Qempirical));
if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) )
log10Prob = -Double.MAX_VALUE;
//if ( DEBUG )
// System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob));
return log10Prob;
}
}

View File

@ -0,0 +1,582 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.commons.math.MathException;
import org.apache.commons.math.stat.inference.ChiSquareTestImpl;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
/**
* A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one
*
* @author Mark DePristo
* @since 07/27/12
*/
public class RecalDatumNode<T extends RecalDatum> {
private final static double SMALLEST_CHI2_PVALUE = 1e-300;
protected static final Logger logger = Logger.getLogger(RecalDatumNode.class);
/**
* fixedPenalty is this value if it's considered fixed
*/
private final static double UNINITIALIZED = Double.NEGATIVE_INFINITY;
private final T recalDatum;
private double fixedPenalty = UNINITIALIZED;
private final Set<RecalDatumNode<T>> subnodes;
@Requires({"recalDatum != null"})
public RecalDatumNode(final T recalDatum) {
this(recalDatum, new HashSet<RecalDatumNode<T>>());
}
@Override
public String toString() {
return recalDatum.toString();
}
@Requires({"recalDatum != null", "subnodes != null"})
public RecalDatumNode(final T recalDatum, final Set<RecalDatumNode<T>> subnodes) {
this(recalDatum, UNINITIALIZED, subnodes);
}
@Requires({"recalDatum != null"})
protected RecalDatumNode(final T recalDatum, final double fixedPenalty) {
this(recalDatum, fixedPenalty, new HashSet<RecalDatumNode<T>>());
}
@Requires({"recalDatum != null", "subnodes != null"})
protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set<RecalDatumNode<T>> subnodes) {
this.recalDatum = recalDatum;
this.fixedPenalty = fixedPenalty;
this.subnodes = new HashSet<RecalDatumNode<T>>(subnodes);
}
/**
* Get the recal data associated with this node
* @return
*/
@Ensures("result != null")
public T getRecalDatum() {
return recalDatum;
}
/**
* The set of all subnodes of this tree. May be modified.
* @return
*/
@Ensures("result != null")
public Set<RecalDatumNode<T>> getSubnodes() {
return subnodes;
}
/**
* Return the fixed penalty, if set, or else the the calculated penalty for this node
* @return
*/
public double getPenalty() {
if ( fixedPenalty != UNINITIALIZED )
return fixedPenalty;
else
return calcPenalty();
}
/**
* Set the fixed penalty for this node to a fresh calculation from calcPenalty
*
* This is important in the case where you want to compute the penalty from a full
* tree and then chop the tree up afterwards while considering the previous penalties.
* If you don't call this function then manipulating the tree may result in the
* penalty functions changing with changes in the tree.
*
* @param doEntireTree recurse into all subnodes?
* @return the fixed penalty for this node
*/
public double calcAndSetFixedPenalty(final boolean doEntireTree) {
fixedPenalty = calcPenalty();
if ( doEntireTree )
for ( final RecalDatumNode<T> sub : subnodes )
sub.calcAndSetFixedPenalty(doEntireTree);
return fixedPenalty;
}
/**
* Add node to the set of subnodes of this node
* @param sub
*/
@Requires("sub != null")
public void addSubnode(final RecalDatumNode<T> sub) {
subnodes.add(sub);
}
/**
* Is this a leaf node (i.e., has no subnodes)?
* @return
*/
public boolean isLeaf() {
return subnodes.isEmpty();
}
/**
* Is this node immediately above only leaf nodes?
*
* @return
*/
public boolean isAboveOnlyLeaves() {
for ( final RecalDatumNode<T> sub : subnodes )
if ( ! sub.isLeaf() )
return false;
return true;
}
/**
* What's the immediate number of subnodes from this node?
* @return
*/
@Ensures("result >= 0")
public int getNumSubnodes() {
return subnodes.size();
}
/**
* Total penalty is the sum of leaf node penalties
*
* This algorithm assumes that penalties have been fixed before pruning, as leaf nodes by
* definition have 0 penalty unless they represent a pruned tree with underlying -- but now
* pruned -- subtrees
*
* @return
*/
public double totalPenalty() {
if ( isLeaf() )
return getPenalty();
else {
double sum = 0.0;
for ( final RecalDatumNode<T> sub : subnodes )
sum += sub.totalPenalty();
return sum;
}
}
/**
* The maximum penalty among all nodes
* @return
*/
public double maxPenalty(final boolean leafOnly) {
double max = ! leafOnly || isLeaf() ? getPenalty() : Double.MIN_VALUE;
for ( final RecalDatumNode<T> sub : subnodes )
max = Math.max(max, sub.maxPenalty(leafOnly));
return max;
}
/**
* The minimum penalty among all nodes
* @return
*/
public double minPenalty(final boolean leafOnly) {
double min = ! leafOnly || isLeaf() ? getPenalty() : Double.MAX_VALUE;
for ( final RecalDatumNode<T> sub : subnodes )
min = Math.min(min, sub.minPenalty(leafOnly));
return min;
}
/**
* What's the longest branch from this node to any leaf?
* @return
*/
public int maxDepth() {
int subMax = 0;
for ( final RecalDatumNode<T> sub : subnodes )
subMax = Math.max(subMax, sub.maxDepth());
return subMax + 1;
}
/**
* What's the shortest branch from this node to any leaf? Includes this node
* @return
*/
@Ensures("result > 0")
public int minDepth() {
if ( isLeaf() )
return 1;
else {
int subMin = Integer.MAX_VALUE;
for ( final RecalDatumNode<T> sub : subnodes )
subMin = Math.min(subMin, sub.minDepth());
return subMin + 1;
}
}
/**
* Return the number of nodes, including this one, reachable from this node
* @return
*/
@Ensures("result > 0")
public int size() {
int size = 1;
for ( final RecalDatumNode<T> sub : subnodes )
size += sub.size();
return size;
}
/**
* Count the number of leaf nodes reachable from this node
*
* @return
*/
@Ensures("result >= 0")
public int numLeaves() {
if ( isLeaf() )
return 1;
else {
int size = 0;
for ( final RecalDatumNode<T> sub : subnodes )
size += sub.numLeaves();
return size;
}
}
/**
* Calculate the phred-scaled p-value for a chi^2 test for independent among subnodes of this node.
*
* The chi^2 value indicates the degree of independence of the implied error rates among the
* immediate subnodes
*
* @return the phred-scaled p-value for chi2 penalty, or 0.0 if it cannot be calculated
*/
private double calcPenalty() {
if ( isLeaf() || freeToMerge() )
return 0.0;
else if ( subnodes.size() == 1 )
// only one value, so its free to merge away
return 0.0;
else {
final long[][] counts = new long[subnodes.size()][2];
int i = 0;
for ( final RecalDatumNode<T> subnode : subnodes ) {
// use the yates correction to help avoid all zeros => NaN
counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L;
counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L;
i++;
}
try {
final double chi2PValue = new ChiSquareTestImpl().chiSquareTest(counts);
final double penalty = -10.0 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE));
// make sure things are reasonable and fail early if not
if (Double.isInfinite(penalty) || Double.isNaN(penalty))
throw new ReviewedGATKException("chi2 value is " + chi2PValue + " at " + getRecalDatum());
return penalty;
} catch ( MathException e ) {
throw new ReviewedGATKException("Failed in calculating chi2 value", e);
}
}
}
/**
* Is this node free to merge because its rounded Q score is the same as all nodes below
* @return
*/
private boolean freeToMerge() {
if ( isLeaf() ) // leaves are free to merge
return true;
else {
final byte myQual = getRecalDatum().getEmpiricalQualityAsByte();
for ( final RecalDatumNode<T> sub : subnodes )
if ( sub.getRecalDatum().getEmpiricalQualityAsByte() != myQual )
return false;
return true;
}
}
/**
* Calculate the penalty of this interval, given the overall error rate for the interval
*
* If the globalErrorRate is e, this value is:
*
* sum_i |log10(e_i) - log10(e)| * nObservations_i
*
* each the index i applies to all leaves of the tree accessible from this interval
* (found recursively from subnodes as necessary)
*
* @param globalErrorRate overall error rate in real space against which we calculate the penalty
* @return the cost of approximating the bins in this interval with the globalErrorRate
*/
@Requires("globalErrorRate >= 0.0")
@Ensures("result >= 0.0")
private double calcPenaltyLog10(final double globalErrorRate) {
if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
return 0.0;
if ( isLeaf() ) {
// this is leave node
return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations();
// TODO -- how we can generalize this calculation?
// if ( this.qEnd <= minInterestingQual )
// // It's free to merge up quality scores below the smallest interesting one
// return 0;
// else {
// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations();
// }
} else {
double sum = 0;
for ( final RecalDatumNode<T> hrd : subnodes)
sum += hrd.calcPenaltyLog10(globalErrorRate);
return sum;
}
}
/**
* Return a freshly allocated tree prunes to have no more than maxDepth from the root to any leaf
*
* @param maxDepth
* @return
*/
public RecalDatumNode<T> pruneToDepth(final int maxDepth) {
if ( maxDepth < 1 )
throw new IllegalArgumentException("maxDepth < 1");
else {
final Set<RecalDatumNode<T>> subPruned = new HashSet<RecalDatumNode<T>>(getNumSubnodes());
if ( maxDepth > 1 )
for ( final RecalDatumNode<T> sub : subnodes )
subPruned.add(sub.pruneToDepth(maxDepth - 1));
return new RecalDatumNode<T>(getRecalDatum(), fixedPenalty, subPruned);
}
}
/**
* Return a freshly allocated tree with to no more than maxElements in order of penalty
*
* Note that nodes must have fixed penalties to this algorithm will fail.
*
* @param maxElements
* @return
*/
public RecalDatumNode<T> pruneByPenalty(final int maxElements) {
RecalDatumNode<T> root = this;
while ( root.size() > maxElements ) {
// remove the lowest penalty element, and continue
root = root.removeLowestPenaltyNode();
}
// our size is below the target, so we are good, return
return root;
}
/**
* Return a freshly allocated tree where all mergable nodes with < maxPenalty are merged
*
* Note that nodes must have fixed penalties to this algorithm will fail.
*
* @param maxPenaltyIn the maximum penalty we are allowed to incur for a merge
* @param applyBonferroniCorrection if true, we will adjust penalty by the phred-scaled bonferroni correction
* for the size of the initial tree. That is, if there are 10 nodes in the
* tree and maxPenalty is 20 we will actually enforce 10^-2 / 10 = 10^-3 = 30
* penalty for multiple testing
* @return
*/
public RecalDatumNode<T> pruneToNoMoreThanPenalty(final double maxPenaltyIn, final boolean applyBonferroniCorrection) {
RecalDatumNode<T> root = this;
final double bonferroniCorrection = 10 * Math.log10(this.size());
final double maxPenalty = applyBonferroniCorrection ? maxPenaltyIn + bonferroniCorrection : maxPenaltyIn;
if ( applyBonferroniCorrection )
logger.info(String.format("Applying Bonferroni correction for %d nodes = %.2f to initial penalty %.2f for total " +
"corrected max penalty of %.2f", this.size(), bonferroniCorrection, maxPenaltyIn, maxPenalty));
while ( true ) {
final Pair<RecalDatumNode<T>, Double> minPenaltyNode = root.getMinPenaltyAboveLeafNode();
if ( minPenaltyNode == null || minPenaltyNode.getSecond() > maxPenalty ) {
// nothing to merge, or the best candidate is above our max allowed
if ( minPenaltyNode == null ) {
if ( logger.isDebugEnabled() ) logger.debug("Stopping because no candidates could be found");
} else {
if ( logger.isDebugEnabled() ) logger.debug("Stopping because node " + minPenaltyNode.getFirst() + " has penalty " + minPenaltyNode.getSecond() + " > max " + maxPenalty);
}
break;
} else {
// remove the lowest penalty element, and continue
if ( logger.isDebugEnabled() ) logger.debug("Removing node " + minPenaltyNode.getFirst() + " with penalty " + minPenaltyNode.getSecond());
root = root.removeLowestPenaltyNode();
}
}
// no more candidates exist with penalty < maxPenalty
return root;
}
/**
* Find the lowest penalty above leaf node in the tree, and return a tree without it
*
* Note this excludes the current (root) node
*
* @return
*/
private RecalDatumNode<T> removeLowestPenaltyNode() {
final Pair<RecalDatumNode<T>, Double> nodeToRemove = getMinPenaltyAboveLeafNode();
if ( logger.isDebugEnabled() )
logger.debug("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond());
final Pair<RecalDatumNode<T>, Boolean> result = removeNode(nodeToRemove.getFirst());
if ( ! result.getSecond() )
throw new IllegalStateException("Never removed any node!");
final RecalDatumNode<T> oneRemoved = result.getFirst();
if ( oneRemoved == null )
throw new IllegalStateException("Removed our root node, wow, didn't expect that");
return oneRemoved;
}
/**
* Finds in the tree the node with the lowest penalty whose subnodes are all leaves
*
* @return the node and its penalty, or null if no such node exists
*/
private Pair<RecalDatumNode<T>, Double> getMinPenaltyAboveLeafNode() {
if ( isLeaf() )
// not allowed to remove leafs directly
return null;
if ( isAboveOnlyLeaves() )
// we only consider removing nodes above all leaves
return new Pair<RecalDatumNode<T>, Double>(this, getPenalty());
else {
// just recurse, taking the result with the min penalty of all subnodes
Pair<RecalDatumNode<T>, Double> minNode = null;
for ( final RecalDatumNode<T> sub : subnodes ) {
final Pair<RecalDatumNode<T>, Double> subFind = sub.getMinPenaltyAboveLeafNode();
if ( subFind != null && (minNode == null || subFind.getSecond() < minNode.getSecond()) ) {
minNode = subFind;
}
}
return minNode;
}
}
/**
* Return a freshly allocated tree without the node nodeToRemove
*
* @param nodeToRemove
* @return
*/
private Pair<RecalDatumNode<T>, Boolean> removeNode(final RecalDatumNode<T> nodeToRemove) {
if ( this == nodeToRemove ) {
if ( isLeaf() )
throw new IllegalStateException("Trying to remove a leaf node from the tree! " + this + " " + nodeToRemove);
// node is the thing we are going to remove, but without any subnodes
final RecalDatumNode<T> node = new RecalDatumNode<T>(getRecalDatum(), fixedPenalty);
return new Pair<RecalDatumNode<T>, Boolean>(node, true);
} else {
// did we remove something in a sub branch?
boolean removedSomething = false;
// our sub nodes with the penalty node removed
final Set<RecalDatumNode<T>> sub = new HashSet<RecalDatumNode<T>>(getNumSubnodes());
for ( final RecalDatumNode<T> sub1 : subnodes ) {
if ( removedSomething ) {
// already removed something, just add sub1 back to sub
sub.add(sub1);
} else {
// haven't removed anything yet, so try
final Pair<RecalDatumNode<T>, Boolean> maybeRemoved = sub1.removeNode(nodeToRemove);
removedSomething = maybeRemoved.getSecond();
sub.add(maybeRemoved.getFirst());
}
}
final RecalDatumNode<T> node = new RecalDatumNode<T>(getRecalDatum(), fixedPenalty, sub);
return new Pair<RecalDatumNode<T>, Boolean>(node, removedSomething);
}
}
/**
* Return a collection of all of the data in the leaf nodes of this tree
*
* @return
*/
public Collection<T> getAllLeaves() {
final LinkedList<T> list = new LinkedList<T>();
getAllLeavesRec(list);
return list;
}
/**
* Helpful recursive function for getAllLeaves()
*
* @param list the destination for the list of leaves
*/
private void getAllLeavesRec(final LinkedList<T> list) {
if ( isLeaf() )
list.add(getRecalDatum());
else {
for ( final RecalDatumNode<T> sub : subnodes )
sub.getAllLeavesRec(list);
}
}
}

View File

@ -0,0 +1,425 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import com.google.java.contract.Requires;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.utils.report.GATKReportTable;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 27, 2009
*
* A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker.
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
*/
public class RecalibrationArgumentCollection implements Cloneable {
/**
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
* so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
* Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
*/
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
/**
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out.
*/
@Gather(BQSRGatherer.class)
@Output(doc = "The output recalibration table file to create", required = true)
public File RECAL_TABLE_FILE = null;
public PrintStream RECAL_TABLE;
/**
* Note that the --list argument requires a fully resolved and correct command-line to work.
*/
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
public boolean LIST_ONLY = false;
/**
* Note that the ReadGroup and QualityScore covariates are required and do not need to be specified.
* Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default.
* Use the --list argument to see the available covariates.
*/
@Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false)
public String[] COVARIATES = null;
/*
* The Cycle and Context covariates are standard and are included by default unless this argument is provided.
* Note that the ReadGroup and QualityScore covariates are required and cannot be excluded.
*/
@Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false)
public boolean DO_NOT_USE_STANDARD_COVARIATES = false;
/**
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
*/
@Advanced
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
public boolean RUN_WITHOUT_DBSNP = false;
/**
* BaseRecalibrator accepts a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
*/
@Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO;
/**
* BaseRecalibrator accepts a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
* no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
* their color space tag can not be recalibrated.
*/
@Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
/**
* The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
*/
@Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false)
public int MISMATCHES_CONTEXT_SIZE = 2;
/**
* The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
*/
@Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false)
public int INDELS_CONTEXT_SIZE = 3;
/**
* The cycle covariate will generate an error if it encounters a cycle greater than this value.
* This argument is ignored if the Cycle covariate is not used.
*/
@Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false)
public int MAXIMUM_CYCLE_VALUE = 500;
/**
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is off]
*/
@Advanced
@Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
public byte MISMATCHES_DEFAULT_QUALITY = -1;
/**
* A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on]
*/
@Advanced
@Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
public byte INSERTIONS_DEFAULT_QUALITY = 45;
/**
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is on]
*/
@Advanced
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
public byte DELETIONS_DEFAULT_QUALITY = 45;
/**
* Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality
*/
@Advanced
@Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false)
public byte LOW_QUAL_TAIL = 2;
/**
* BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options.
* This parameter tells BQSR the number of levels of quantization to use to build the quantization table.
*/
@Advanced
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
public int QUANTIZING_LEVELS = 16;
/**
* The tag name for the binary tag covariate (if using it)
*/
@Advanced
@Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it")
public String BINARY_TAG_NAME = null;
/**
* Whether GATK report tables should have rows in sorted order, starting from leftmost column
*/
@Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false)
public Boolean SORT_BY_ALL_COLUMNS = false;
/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
@Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null;
@Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Argument(fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.")
public String FORCE_READGROUP = null;
@Hidden
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false)
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
/**
* The repeat covariate will use a context of this size to calculate its covariate value for base insertions and deletions
*/
@Hidden
@Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false)
public int MAX_STR_UNIT_LENGTH = 8;
@Hidden
@Argument(fullName = "max_repeat_length", shortName = "maxrep", doc = "Max number of repetitions to be used for repeat covariates", required = false)
public int MAX_REPEAT_LENGTH = 20;
public File existingRecalibrationReport = null;
public GATKReportTable generateReportTable(final String covariateNames) {
GATKReportTable argumentsTable;
if(SORT_BY_ALL_COLUMNS) {
argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN);
} else {
argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2);
}
argumentsTable.addColumn("Argument");
argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
argumentsTable.addRowID("covariate", true);
argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames);
argumentsTable.addRowID("no_standard_covs", true);
argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES);
argumentsTable.addRowID("run_without_dbsnp", true);
argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP);
argumentsTable.addRowID("solid_recal_mode", true);
argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE);
argumentsTable.addRowID("solid_nocall_strategy", true);
argumentsTable.set("solid_nocall_strategy", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY);
argumentsTable.addRowID("mismatches_context_size", true);
argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE);
argumentsTable.addRowID("indels_context_size", true);
argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE);
argumentsTable.addRowID("mismatches_default_quality", true);
argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY);
argumentsTable.addRowID("deletions_default_quality", true);
argumentsTable.set("deletions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_DEFAULT_QUALITY);
argumentsTable.addRowID("insertions_default_quality", true);
argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY);
argumentsTable.addRowID("maximum_cycle_value", true);
argumentsTable.set("maximum_cycle_value", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MAXIMUM_CYCLE_VALUE);
argumentsTable.addRowID("low_quality_tail", true);
argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL);
argumentsTable.addRowID("default_platform", true);
argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM);
argumentsTable.addRowID("force_platform", true);
argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM);
argumentsTable.addRowID("quantizing_levels", true);
argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
argumentsTable.addRowID("recalibration_report", true);
argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath());
argumentsTable.addRowID("binary_tag_name", true);
argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME);
return argumentsTable;
}
/**
* Returns a map with the arguments that differ between this an
* another {@link RecalibrationArgumentCollection} instance.
* <p/>
* The key is the name of that argument in the report file. The value is a message
* that explains the difference to the end user.
* <p/>
* Thus, a empty map indicates that there is no differences between both argument collection that
* is relevant to report comparison.
* <p/>
* This method should not throw any exception.
*
* @param other the argument-collection to compare against.
* @param thisRole the name used to refer to this RAC report that makes sense to the end user.
* @param otherRole the name used to refer to the other RAC report that makes sense to the end user.
*
* @return never <code>null</code>, but a zero-size collection if there are no differences.
*/
@Requires("other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)")
public Map<String,? extends CharSequence> compareReportArguments(final RecalibrationArgumentCollection other,final String thisRole, final String otherRole) {
final Map<String,String> result = new LinkedHashMap<>(15);
compareRequestedCovariates(result, other, thisRole, otherRole);
compareSimpleReportArgument(result,"no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES, other.DO_NOT_USE_STANDARD_COVARIATES, thisRole, otherRole);
compareSimpleReportArgument(result,"run_without_dbsnp",RUN_WITHOUT_DBSNP,other.RUN_WITHOUT_DBSNP,thisRole,otherRole);
compareSimpleReportArgument(result,"solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE,thisRole,otherRole);
compareSimpleReportArgument(result,"solid_nocall_strategy", SOLID_NOCALL_STRATEGY, other.SOLID_NOCALL_STRATEGY,thisRole,otherRole);
compareSimpleReportArgument(result,"mismatches_context_size", MISMATCHES_CONTEXT_SIZE,other.MISMATCHES_CONTEXT_SIZE,thisRole,otherRole);
compareSimpleReportArgument(result,"mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY, other.MISMATCHES_DEFAULT_QUALITY,thisRole,otherRole);
compareSimpleReportArgument(result,"deletions_default_quality", DELETIONS_DEFAULT_QUALITY, other.DELETIONS_DEFAULT_QUALITY,thisRole,otherRole);
compareSimpleReportArgument(result,"insertions_default_quality", INSERTIONS_DEFAULT_QUALITY, other.INSERTIONS_DEFAULT_QUALITY,thisRole,otherRole);
compareSimpleReportArgument(result,"maximum_cycle_value", MAXIMUM_CYCLE_VALUE, other.MAXIMUM_CYCLE_VALUE,thisRole,otherRole);
compareSimpleReportArgument(result,"low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL,thisRole,otherRole);
compareSimpleReportArgument(result,"default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM,thisRole,otherRole);
compareSimpleReportArgument(result,"force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM,thisRole,otherRole);
compareSimpleReportArgument(result,"quantizing_levels", QUANTIZING_LEVELS, other.QUANTIZING_LEVELS,thisRole,otherRole);
compareSimpleReportArgument(result,"binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME,thisRole,otherRole);
return result;
}
/**
* Compares the covariate report lists.
*
* @param diffs map where to annotate the difference.
* @param other the argument collection to compare against.
* @param thisRole the name for this argument collection that makes sense to the user.
* @param otherRole the name for the other argument collection that makes sense to the end user.
*
* @return <code>true</code> if a difference was found.
*/
@Requires("diffs != null && other != null && thisRole != null && otherRole != null")
private boolean compareRequestedCovariates(final Map<String,String> diffs,
final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) {
final Set<String> beforeNames = new HashSet<>(this.COVARIATES.length);
final Set<String> afterNames = new HashSet<>(other.COVARIATES.length);
Utils.addAll(beforeNames, this.COVARIATES);
Utils.addAll(afterNames,other.COVARIATES);
final Set<String> intersect = new HashSet<>(Math.min(beforeNames.size(),afterNames.size()));
intersect.addAll(beforeNames);
intersect.retainAll(afterNames);
String diffMessage = null;
if (intersect.size() == 0) { // In practice this is not possible due to required covariates but...
diffMessage = String.format("There are no common covariates between '%s' and '%s'"
+ " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.",thisRole,otherRole,
thisRole,Utils.join(", ",this.COVARIATES),
otherRole,Utils.join(",",other.COVARIATES));
} else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) {
beforeNames.removeAll(intersect);
afterNames.removeAll(intersect);
diffMessage = String.format("There are differences in the set of covariates requested in the"
+ " '%s' and '%s' recalibrator reports. "
+ " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",thisRole,otherRole,
thisRole,Utils.join(", ",beforeNames),
otherRole,Utils.join(", ",afterNames));
}
if (diffMessage != null) {
diffs.put("covariate",diffMessage);
return true;
} else {
return false;
}
}
/**
* Annotates a map with any difference encountered in a simple value report argument that differs between this an
* another {@link RecalibrationArgumentCollection} instance.
* <p/>
* The key of the new entry would be the name of that argument in the report file. The value is a message
* that explains the difference to the end user.
* <p/>
*
* <p/>
* This method should not return any exception.
*
* @param diffs where to annotate the differences.
* @param name the name of the report argument to compare.
* @param thisValue this argument collection value for that argument.
* @param otherValue the other collection value for that argument.
* @param thisRole the name used to refer to this RAC report that makes sense to the end user.
* @param otherRole the name used to refer to the other RAC report that makes sense to the end user.
*
* @type T the argument Object value type.
*
* @return <code>true</code> if a difference has been spotted, thus <code>diff</code> has been modified.
*/
private <T> boolean compareSimpleReportArgument(final Map<String,String> diffs,
final String name, final T thisValue, final T otherValue, final String thisRole, final String otherRole) {
if (thisValue == null && otherValue == null) {
return false;
} else if (thisValue != null && thisValue.equals(otherValue)) {
return false;
} else {
diffs.put(name,
String.format("differences between '%s' {%s} and '%s' {%s}.",
thisRole,thisValue == null ? "" : thisValue,
otherRole,otherValue == null ? "" : otherValue));
return true;
}
}
/**
* Create a shallow copy of this argument collection.
*
* @return never <code>null</code>.
*/
@Override
public RecalibrationArgumentCollection clone() {
try {
return (RecalibrationArgumentCollection) super.clone();
} catch (CloneNotSupportedException e) {
throw new GATKException("Unreachable code clone not supported thrown when the class "
+ this.getClass().getName() + " is cloneable ",e);
}
}
}

View File

@ -0,0 +1,425 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import org.broadinstitute.gatk.utils.report.GATKReport;
import org.broadinstitute.gatk.utils.report.GATKReportTable;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.collections.NestedIntegerArray;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.recalibration.EventType;
import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate;
import java.io.*;
import java.util.*;
/**
* This class has all the static functionality for reading a recalibration report file into memory.
*
* @author carneiro
* @since 3/26/12
*/
public class RecalibrationReport {
private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done)
private final RecalibrationTables recalibrationTables; // quick access reference to the tables
private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation
private final HashMap<String, Integer> optionalCovariateIndexes;
private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes
private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter
private final int[] tempRGarray = new int[2];
private final int[] tempQUALarray = new int[3];
private final int[] tempCOVarray = new int[4];
public RecalibrationReport(final File recalFile) {
this(recalFile, getReadGroups(recalFile));
}
public RecalibrationReport(final File recalFile, final SortedSet<String> allReadGroups) {
final GATKReport report = new GATKReport(recalFile);
argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);
RAC = initializeArgumentCollectionTable(argumentTable);
GATKReportTable quantizedTable = report.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE);
quantizationInfo = initializeQuantizationTable(quantizedTable);
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()];
optionalCovariateIndexes = new HashMap<String, Integer>(optionalCovariates.size());
int covariateIndex = 0;
for (final Covariate covariate : requiredCovariates)
requestedCovariates[covariateIndex++] = covariate;
for (final Covariate covariate : optionalCovariates) {
requestedCovariates[covariateIndex] = covariate;
final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
optionalCovariateIndexes.put(covariateName, covariateIndex-2);
covariateIndex++;
}
for (Covariate cov : requestedCovariates)
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size());
initializeReadGroupCovariates(allReadGroups);
parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable());
parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getQualityScoreTable());
parseAllCovariatesTable(report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE), recalibrationTables);
}
/**
* Gets the unique read groups in the recal file
*
* @param recalFile the recal file as a GATK Report
* @return the unique read groups
*/
public static SortedSet<String> getReadGroups(final File recalFile) {
return getReadGroups(new GATKReport(recalFile));
}
/**
* Gets the unique read groups in the table
*
* @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE
* @return the unique read groups
*/
private static SortedSet<String> getReadGroups(final GATKReport report) {
final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
final SortedSet<String> readGroups = new TreeSet<String>();
for ( int i = 0; i < reportTable.getNumRows(); i++ )
readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString());
return readGroups;
}
/**
* Combines two recalibration reports by adding all observations and errors
*
* Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate
* them after combining. The reason for not calculating it is because this function is intended for combining a
* series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized
* qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate,
* makes this method faster
*
* Note2: The empirical quality reported, however, is recalculated given its simplicity.
*
* @param other the recalibration report to combine with this one
*/
public void combine(final RecalibrationReport other) {
for ( int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++ ) {
final NestedIntegerArray<RecalDatum> myTable = recalibrationTables.getTable(tableIndex);
final NestedIntegerArray<RecalDatum> otherTable = other.recalibrationTables.getTable(tableIndex);
RecalUtils.combineTables(myTable, otherTable);
}
}
public QuantizationInfo getQuantizationInfo() {
return quantizationInfo;
}
public RecalibrationTables getRecalibrationTables() {
return recalibrationTables;
}
public Covariate[] getRequestedCovariates() {
return requestedCovariates;
}
/**
* Initialize read group keys using the shared list of all the read groups.
*
* By using the same sorted set of read groups across all recalibration reports, even if
* one report is missing a read group, all the reports use the same read group keys.
*
* @param allReadGroups The list of all possible read groups
*/
private void initializeReadGroupCovariates(final SortedSet<String> allReadGroups) {
for (String readGroup: allReadGroups) {
requestedCovariates[0].keyFromValue(readGroup);
}
}
/**
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
*
* @param reportTable the GATKReport table containing data for this table
* @param recalibrationTables the recalibration tables
\ */
private void parseAllCovariatesTable(final GATKReportTable reportTable, final RecalibrationTables recalibrationTables) {
for ( int i = 0; i < reportTable.getNumRows(); i++ ) {
final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME);
tempCOVarray[0] = requestedCovariates[0].keyFromValue(rg);
final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME);
tempCOVarray[1] = requestedCovariates[1].keyFromValue(qual);
final String covName = (String)reportTable.get(i, RecalUtils.COVARIATE_NAME_COLUMN_NAME);
final int covIndex = optionalCovariateIndexes.get(covName);
final Object covValue = reportTable.get(i, RecalUtils.COVARIATE_VALUE_COLUMN_NAME);
tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex].keyFromValue(covValue);
final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME));
tempCOVarray[3] = event.ordinal();
recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray);
}
}
/**
*
* Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table
* @param reportTable the GATKReport table containing data for this table
* @param qualTable the map representing this table
*/
private void parseQualityScoreTable(final GATKReportTable reportTable, final NestedIntegerArray<RecalDatum> qualTable) {
for ( int i = 0; i < reportTable.getNumRows(); i++ ) {
final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME);
tempQUALarray[0] = requestedCovariates[0].keyFromValue(rg);
final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME);
tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual);
final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME));
tempQUALarray[2] = event.ordinal();
qualTable.put(getRecalDatum(reportTable, i, false), tempQUALarray);
}
}
/**
* Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table
*
* @param reportTable the GATKReport table containing data for this table
* @param rgTable the map representing this table
*/
private void parseReadGroupTable(final GATKReportTable reportTable, final NestedIntegerArray<RecalDatum> rgTable) {
for ( int i = 0; i < reportTable.getNumRows(); i++ ) {
final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME);
tempRGarray[0] = requestedCovariates[0].keyFromValue(rg);
final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME));
tempRGarray[1] = event.ordinal();
rgTable.put(getRecalDatum(reportTable, i, true), tempRGarray);
}
}
private double asDouble(final Object o) {
if ( o instanceof Double )
return (Double)o;
else if ( o instanceof Integer )
return (Integer)o;
else if ( o instanceof Long )
return (Long)o;
else
throw new ReviewedGATKException("Object " + o + " is expected to be either a double, long or integer but it's not either: " + o.getClass());
}
private long asLong(final Object o) {
if ( o instanceof Long )
return (Long)o;
else if ( o instanceof Integer )
return ((Integer)o).longValue();
else if ( o instanceof Double )
return ((Double)o).longValue();
else
throw new ReviewedGATKException("Object " + o + " is expected to be a long but it's not: " + o.getClass());
}
private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) {
final long nObservations = asLong(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME));
//final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME));
// the estimatedQreported column only exists in the ReadGroup table
final double estimatedQReported = hasEstimatedQReportedColumn ?
(Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table
Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table
final RecalDatum datum = new RecalDatum(nObservations, nErrors, (byte)1);
datum.setEstimatedQReported(estimatedQReported);
//datum.setEmpiricalQuality(empiricalQuality); // don't set the value here because we will want to recompute with a different conditional Q score prior value
return datum;
}
/**
* Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores
*
* @param table the GATKReportTable containing the quantization mappings
* @return an ArrayList with the quantization mappings from 0 to MAX_SAM_QUAL_SCORE
*/
private QuantizationInfo initializeQuantizationTable(GATKReportTable table) {
final Byte[] quals = new Byte[QualityUtils.MAX_SAM_QUAL_SCORE + 1];
final Long[] counts = new Long[QualityUtils.MAX_SAM_QUAL_SCORE + 1];
for ( int i = 0; i < table.getNumRows(); i++ ) {
final byte originalQual = (byte)i;
final Object quantizedObject = table.get(i, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME);
final Object countObject = table.get(i, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME);
final byte quantizedQual = Byte.parseByte(quantizedObject.toString());
final long quantizedCount = Long.parseLong(countObject.toString());
quals[originalQual] = quantizedQual;
counts[originalQual] = quantizedCount;
}
return new QuantizationInfo(Arrays.asList(quals), Arrays.asList(counts));
}
/**
* Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values
*
* @param table the GATKReportTable containing the arguments and its corresponding values
* @return a RAC object properly initialized with all the objects in the table
*/
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
for ( int i = 0; i < table.getNumRows(); i++ ) {
final String argument = table.get(i, "Argument").toString();
Object value = table.get(i, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
if (value.equals("null"))
value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport
if (argument.equals("covariate") && value != null)
RAC.COVARIATES = value.toString().split(",");
else if (argument.equals("standard_covs"))
RAC.DO_NOT_USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
else if (argument.equals("solid_recal_mode"))
RAC.SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.recalModeFromString((String) value);
else if (argument.equals("solid_nocall_strategy"))
RAC.SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value);
else if (argument.equals("mismatches_context_size"))
RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (argument.equals("indels_context_size"))
RAC.INDELS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (argument.equals("mismatches_default_quality"))
RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
else if (argument.equals("insertions_default_quality"))
RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
else if (argument.equals("deletions_default_quality"))
RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
else if (argument.equals("maximum_cycle_value"))
RAC.MAXIMUM_CYCLE_VALUE = Integer.parseInt((String) value);
else if (argument.equals("low_quality_tail"))
RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value);
else if (argument.equals("default_platform"))
RAC.DEFAULT_PLATFORM = (String) value;
else if (argument.equals("force_platform"))
RAC.FORCE_PLATFORM = (String) value;
else if (argument.equals("quantizing_levels"))
RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value);
else if (argument.equals("recalibration_report"))
RAC.existingRecalibrationReport = (value == null) ? null : new File((String) value);
else if (argument.equals("binary_tag_name"))
RAC.BINARY_TAG_NAME = (value == null) ? null : (String) value;
else if (argument.equals("sort_by_all_columns"))
RAC.SORT_BY_ALL_COLUMNS = Boolean.parseBoolean((String) value);
}
return RAC;
}
/**
* this functionality avoids recalculating the empirical qualities, estimated reported quality
* and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer.
*/
public void calculateQuantizedQualities() {
quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS);
}
/**
* Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream).
*
* @return newly created recalibration report
*/
public GATKReport createGATKReport() {
return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
}
public RecalibrationArgumentCollection getRAC() {
return RAC;
}
/**
*
* @deprecated use {@link #getRequestedCovariates()} instead.
*/
@Deprecated
public Covariate[] getCovariates() {
return requestedCovariates;
}
/**
* @return true if the report has no data
*/
public boolean isEmpty() {
return recalibrationTables.isEmpty();
}
}

View File

@ -0,0 +1,169 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration;
import com.google.java.contract.Ensures;
import org.broadinstitute.gatk.utils.collections.LoggingNestedIntegerArray;
import org.broadinstitute.gatk.utils.recalibration.EventType;
import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate;
import org.broadinstitute.gatk.utils.collections.NestedIntegerArray;
import java.io.PrintStream;
import java.util.ArrayList;
/**
* Utility class to facilitate on-the-fly base quality score recalibration.
*
* User: ebanks
* Date: 6/20/12
*/
public final class RecalibrationTables {
public enum TableType {
READ_GROUP_TABLE,
QUALITY_SCORE_TABLE,
OPTIONAL_COVARIATE_TABLES_START;
}
private final ArrayList<NestedIntegerArray<RecalDatum>> tables;
private final int qualDimension;
private final int eventDimension = EventType.values().length;
private final int numReadGroups;
private final PrintStream log;
public RecalibrationTables(final Covariate[] covariates) {
this(covariates, covariates[TableType.READ_GROUP_TABLE.ordinal()].maximumKeyValue() + 1, null);
}
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) {
this(covariates, numReadGroups, null);
}
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) {
tables = new ArrayList<NestedIntegerArray<RecalDatum>>(covariates.length);
for ( int i = 0; i < covariates.length; i++ )
tables.add(i, null); // initialize so we can set below
qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.ordinal()].maximumKeyValue() + 1;
this.numReadGroups = numReadGroups;
this.log = log;
tables.set(TableType.READ_GROUP_TABLE.ordinal(),
log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, eventDimension) :
new LoggingNestedIntegerArray<RecalDatum>(log, "READ_GROUP_TABLE", numReadGroups, eventDimension));
tables.set(TableType.QUALITY_SCORE_TABLE.ordinal(), makeQualityScoreTable());
for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < covariates.length; i++)
tables.set(i,
log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) :
new LoggingNestedIntegerArray<RecalDatum>(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + 1),
numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension));
}
@Ensures("result != null")
public NestedIntegerArray<RecalDatum> getReadGroupTable() {
return getTable(TableType.READ_GROUP_TABLE.ordinal());
}
@Ensures("result != null")
public NestedIntegerArray<RecalDatum> getQualityScoreTable() {
return getTable(TableType.QUALITY_SCORE_TABLE.ordinal());
}
@Ensures("result != null")
public NestedIntegerArray<RecalDatum> getTable(final int index) {
return tables.get(index);
}
@Ensures("result >= 0")
public int numTables() {
return tables.size();
}
/**
* @return true if all the tables contain no RecalDatums
*/
public boolean isEmpty() {
for( final NestedIntegerArray<RecalDatum> table : tables ) {
if( !table.getAllValues().isEmpty() ) { return false; }
}
return true;
}
/**
* Allocate a new quality score table, based on requested parameters
* in this set of tables, without any data in it. The return result
* of this table is suitable for acting as a thread-local cache
* for quality score values
* @return a newly allocated, empty read group x quality score table
*/
public NestedIntegerArray<RecalDatum> makeQualityScoreTable() {
return log == null
? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, eventDimension)
: new LoggingNestedIntegerArray<RecalDatum>(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension);
}
/**
* Merge all of the tables from toMerge into into this set of tables
*/
public void combine(final RecalibrationTables toMerge) {
if ( numTables() != toMerge.numTables() )
throw new IllegalArgumentException("Attempting to merge RecalibrationTables with different sizes");
for ( int i = 0; i < numTables(); i++ ) {
final NestedIntegerArray<RecalDatum> myTable = this.getTable(i);
final NestedIntegerArray<RecalDatum> otherTable = toMerge.getTable(i);
RecalUtils.combineTables(myTable, otherTable);
}
}
}

View File

@ -0,0 +1,304 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.clipping.ClippingRepresentation;
import org.broadinstitute.gatk.utils.clipping.ReadClipper;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.engine.recalibration.ReadCovariates;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 9/26/11
*/
public class ContextCovariate implements StandardCovariate {
private final static Logger logger = Logger.getLogger(ContextCovariate.class);
private int mismatchesContextSize;
private int indelsContextSize;
private int mismatchesKeyMask;
private int indelsKeyMask;
private static final int LENGTH_BITS = 4;
private static final int LENGTH_MASK = 15;
// the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are
// not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base.
static final private int MAX_DNA_CONTEXT = 13;
private byte LOW_QUAL_TAIL;
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE;
indelsContextSize = RAC.INDELS_CONTEXT_SIZE;
logger.info("\t\tContext sizes: base substitution model " + mismatchesContextSize + ", indel substitution model " + indelsContextSize);
if (mismatchesContextSize > MAX_DNA_CONTEXT)
throw new UserException.BadArgumentValue("mismatches_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, mismatchesContextSize));
if (indelsContextSize > MAX_DNA_CONTEXT)
throw new UserException.BadArgumentValue("indels_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, indelsContextSize));
LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL;
if (mismatchesContextSize <= 0 || indelsContextSize <= 0)
throw new UserException(String.format("Context size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Indels: %d", mismatchesContextSize, indelsContextSize));
mismatchesKeyMask = createMask(mismatchesContextSize);
indelsKeyMask = createMask(indelsContextSize);
}
@Override
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
// store the original bases and then write Ns over low quality ones
final byte[] originalBases = read.getReadBases().clone();
// Write N's over the low quality tail of the reads to avoid adding them into the context
final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS);
final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag();
byte[] bases = clippedRead.getReadBases();
if (negativeStrand)
bases = BaseUtils.simpleReverseComplement(bases);
final ArrayList<Integer> mismatchKeys = contextWith(bases, mismatchesContextSize, mismatchesKeyMask);
final ArrayList<Integer> indelKeys = contextWith(bases, indelsContextSize, indelsKeyMask);
final int readLength = bases.length;
// this is necessary to ensure that we don't keep historical data in the ReadCovariates values
// since the context covariate may not span the entire set of values in read covariates
// due to the clipping of the low quality bases
if ( readLength != originalBases.length ) {
// don't both zeroing out if we are going to overwrite the whole array
for ( int i = 0; i < originalBases.length; i++ )
// this base has been clipped off, so zero out the covariate values here
values.addCovariate(0, 0, 0, i);
}
for (int i = 0; i < readLength; i++) {
final int readOffset = (negativeStrand ? readLength - i - 1 : i);
final int indelKey = indelKeys.get(i);
values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, readOffset);
}
// put the original bases back in
read.setReadBases(originalBases);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Object getValue(final String str) {
return str;
}
@Override
public String formatKey(final int key) {
if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file
return null;
return contextFromKey(key);
}
@Override
public int keyFromValue(final Object value) {
return keyFromContext((String) value);
}
private static int createMask(final int contextSize) {
int mask = 0;
// create 2*contextSize worth of bits
for (int i = 0; i < contextSize; i++)
mask = (mask << 2) | 3;
// shift 4 bits to mask out the bits used to encode the length
return mask << LENGTH_BITS;
}
/**
* calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion)
*
* @param bases the bases in the read to build the context from
* @param contextSize context size to use building the context
* @param mask mask for pulling out just the context bits
*/
private static ArrayList<Integer> contextWith(final byte[] bases, final int contextSize, final int mask) {
final int readLength = bases.length;
final ArrayList<Integer> keys = new ArrayList<Integer>(readLength);
// the first contextSize-1 bases will not have enough previous context
for (int i = 1; i < contextSize && i <= readLength; i++)
keys.add(-1);
if (readLength < contextSize)
return keys;
final int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS;
// get (and add) the key for the context starting at the first base
int currentKey = keyFromContext(bases, 0, contextSize);
keys.add(currentKey);
// if the first key was -1 then there was an N in the context; figure out how many more consecutive contexts it affects
int currentNPenalty = 0;
if (currentKey == -1) {
currentKey = 0;
currentNPenalty = contextSize - 1;
int offset = newBaseOffset;
while (bases[currentNPenalty] != 'N') {
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentNPenalty]);
currentKey |= (baseIndex << offset);
offset -= 2;
currentNPenalty--;
}
}
for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) {
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentIndex]);
if (baseIndex == -1) { // ignore non-ACGT bases
currentNPenalty = contextSize;
currentKey = 0; // reset the key
} else {
// push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length in
currentKey = (currentKey >> 2) & mask;
currentKey |= (baseIndex << newBaseOffset);
currentKey |= contextSize;
}
if (currentNPenalty == 0) {
keys.add(currentKey);
} else {
currentNPenalty--;
keys.add(-1);
}
}
return keys;
}
public static int keyFromContext(final String dna) {
return keyFromContext(dna.getBytes(), 0, dna.length());
}
/**
* Creates a int representation of a given dna string.
*
* @param dna the dna sequence
* @param start the start position in the byte array (inclusive)
* @param end the end position in the array (exclusive)
* @return the key representing the dna sequence
*/
private static int keyFromContext(final byte[] dna, final int start, final int end) {
int key = end - start;
int bitOffset = LENGTH_BITS;
for (int i = start; i < end; i++) {
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]);
if (baseIndex == -1) // ignore non-ACGT bases
return -1;
key |= (baseIndex << bitOffset);
bitOffset += 2;
}
return key;
}
/**
* Converts a key into the dna string representation.
*
* @param key the key representing the dna sequence
* @return the dna sequence represented by the key
*/
public static String contextFromKey(final int key) {
if (key < 0)
throw new ReviewedGATKException("dna conversion cannot handle negative numbers. Possible overflow?");
final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context
int mask = 48; // use the mask to pull out bases
int offset = LENGTH_BITS;
StringBuilder dna = new StringBuilder();
for (int i = 0; i < length; i++) {
final int baseIndex = (key & mask) >> offset;
dna.append((char)BaseUtils.baseIndexToSimpleBase(baseIndex));
mask = mask << 2; // move the mask over to the next 2 bits
offset += 2;
}
return dna.toString();
}
@Override
public int maximumKeyValue() {
// the maximum value is T (11 in binary) for each base in the context
int length = Math.max(mismatchesContextSize, indelsContextSize); // the length of the context
int key = length;
int bitOffset = LENGTH_BITS;
for (int i = 0; i <length ; i++) {
key |= (3 << bitOffset);
bitOffset += 2;
}
return key;
}
}

View File

@ -0,0 +1,144 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
import org.broadinstitute.gatk.engine.recalibration.ReadCovariates;
import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read.
* In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code.
* This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up.
*/
public interface Covariate {
/**
* Initialize any member variables using the command-line arguments passed to the walker
*
* @param RAC the recalibration argument collection
*/
public void initialize(final RecalibrationArgumentCollection RAC);
/**
* Calculates covariate values for all positions in the read.
*
* @param read the read to calculate the covariates on.
* @param values the object to record the covariate values for every base in the read.
*/
public void recordValues(final GATKSAMRecord read, final ReadCovariates values);
/**
* Used to get the covariate's value from input (Recalibration Report) file during on-the-fly recalibration
*
* @param str the key in string type (read from the csv)
* @return the key in it's correct type.
*/
public Object getValue(final String str);
/**
* Converts the internal representation of the key to String format for file output.
*
* @param key the long representation of the key
* @return a string representation of the key
*/
public String formatKey(final int key);
/**
* Converts an Object key into a long key using only the lowest numberOfBits() bits
*
* Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in long format. For counting covariates
* the getValues method already returns all values in long format.
*
* @param value the object corresponding to the covariate
* @return a long representation of the object
*/
public int keyFromValue(final Object value);
/**
* Returns the maximum value possible for any key representing this covariate
*
* @return the maximum value possible for any key representing this covariate
*/
public int maximumKeyValue();
}

View File

@ -0,0 +1,282 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
import org.broadinstitute.gatk.engine.recalibration.ReadCovariates;
import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.NGSPlatform;
import org.broadinstitute.gatk.utils.SequencerFlowClass;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*
* The Cycle covariate.
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
* For example, for the read: AAACCCCGAAATTTTTACTG
* the cycle would be 11111111222333333344
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
*/
public class CycleCovariate implements StandardCovariate {
private int MAXIMUM_CYCLE_VALUE;
public static final int CUSHION_FOR_INDELS = 4;
private String default_platform = null;
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
this.MAXIMUM_CYCLE_VALUE = RAC.MAXIMUM_CYCLE_VALUE;
if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM))
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform.");
if (RAC.DEFAULT_PLATFORM != null)
default_platform = RAC.DEFAULT_PLATFORM;
}
// Used to pick out the covariate's value from attributes of the read
@Override
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
final int readLength = read.getReadLength();
final NGSPlatform ngsPlatform = default_platform == null ? read.getNGSPlatform() : NGSPlatform.fromReadGroupPL(default_platform);
// Discrete cycle platforms
if (ngsPlatform.getSequencerType() == SequencerFlowClass.DISCRETE) {
final int readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? -1 : 1;
final int increment;
int cycle;
if (read.getReadNegativeStrandFlag()) {
cycle = readLength * readOrderFactor;
increment = -1 * readOrderFactor;
}
else {
cycle = readOrderFactor;
increment = readOrderFactor;
}
final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1;
for (int i = 0; i < readLength; i++) {
final int substitutionKey = keyFromCycle(cycle);
final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey;
values.addCovariate(substitutionKey, indelKey, indelKey, i);
cycle += increment;
}
}
// Flow cycle platforms
else if (ngsPlatform.getSequencerType() == SequencerFlowClass.FLOW) {
final byte[] bases = read.getReadBases();
// Differentiate between first and second of pair.
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
// Therefore the cycle covariate must differentiate between first and second of pair reads.
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
// the current sequential model would consider the effects independently instead of jointly.
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms.
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
if (!read.getReadNegativeStrandFlag()) { // Forward direction
int iii = 0;
while (iii < readLength) {
while (iii < readLength && bases[iii] == (byte) 'T') {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'A') {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'C') {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'G') {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
if (iii < readLength) {
if (multiplyByNegative1)
cycle--;
else
cycle++;
}
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
}
}
else { // Negative direction
int iii = readLength - 1;
while (iii >= 0) {
while (iii >= 0 && bases[iii] == (byte) 'T') {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'A') {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'C') {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'G') {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
if (iii >= 0) {
if (multiplyByNegative1)
cycle--;
else
cycle++;
}
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
final int key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
}
}
}
// Unknown platforms
else {
throw new UserException("The platform (" + read.getReadGroup().getPlatform()
+ ") associated with read group " + read.getReadGroup()
+ " is not a recognized platform. Allowable options are " + NGSPlatform.knownPlatformsString());
}
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Object getValue(final String str) {
return Integer.parseInt(str);
}
@Override
public String formatKey(final int key) {
int cycle = key >> 1; // shift so we can remove the "sign" bit
if ( (key & 1) != 0 ) // is the last bit set?
cycle *= -1; // then the cycle is negative
return String.format("%d", cycle);
}
@Override
public int keyFromValue(final Object value) {
return (value instanceof String) ? keyFromCycle(Integer.parseInt((String) value)) : keyFromCycle((Integer) value);
}
@Override
public int maximumKeyValue() {
return (MAXIMUM_CYCLE_VALUE << 1) + 1;
}
private int keyFromCycle(final int cycle) {
// no negative values because values must fit into the first few bits of the long
int result = Math.abs(cycle);
if ( result > MAXIMUM_CYCLE_VALUE )
throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle (" + result + ") was detected. Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)");
result = result << 1; // shift so we can add the "sign" bit
if ( cycle < 0 )
result++; // negative cycles get the lower-most bit set
return result;
}
}

View File

@ -0,0 +1,81 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
/**
* [Short one sentence description of this walker]
* <p/>
* <p>
* [Functionality of this walker]
* </p>
* <p/>
* <h3>Input</h3>
* <p>
* [Input description]
* </p>
* <p/>
* <h3>Output</h3>
* <p>
* [Output description]
* </p>
* <p/>
* <h3>Examples</h3>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T $WalkerName
* </pre>
*
* @author Your Name
* @since Date created
*/
public interface ExperimentalCovariate extends Covariate {}

View File

@ -0,0 +1,129 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
import org.broadinstitute.gatk.engine.recalibration.ReadCovariates;
import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*
* The Reported Quality Score covariate.
*/
public class QualityScoreCovariate implements RequiredCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {}
@Override
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
final byte[] baseQualities = read.getBaseQualities();
final byte[] baseInsertionQualities = read.getBaseInsertionQualities();
final byte[] baseDeletionQualities = read.getBaseDeletionQualities();
for (int i = 0; i < baseQualities.length; i++) {
values.addCovariate((int)baseQualities[i], (int)baseInsertionQualities[i], (int)baseDeletionQualities[i], i);
}
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Object getValue(final String str) {
return Byte.parseByte(str);
}
@Override
public String formatKey(final int key) {
return String.format("%d", key);
}
@Override
public int keyFromValue(final Object value) {
return (value instanceof String) ? (int)Byte.parseByte((String) value) : (int)(Byte) value;
}
@Override
public int maximumKeyValue() {
return QualityUtils.MAX_SAM_QUAL_SCORE;
}
}

View File

@ -0,0 +1,190 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.gatk.engine.recalibration.ReadCovariates;
import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*
* The Read Group covariate.
*/
public class ReadGroupCovariate implements RequiredCovariate {
private final HashMap<String, Integer> readGroupLookupTable = new HashMap<String, Integer>();
private final HashMap<Integer, String> readGroupReverseLookupTable = new HashMap<Integer, String>();
private int nextId = 0;
private String forceReadGroup;
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
forceReadGroup = RAC.FORCE_READGROUP;
}
@Override
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
final String readGroupId = readGroupValueFromRG(read.getReadGroup());
final int key = keyForReadGroup(readGroupId);
final int l = read.getReadLength();
for (int i = 0; i < l; i++)
values.addCovariate(key, key, key, i);
}
@Override
public final Object getValue(final String str) {
return str;
}
@Override
public synchronized String formatKey(final int key) {
// This method is synchronized so that we don't attempt to do a get()
// from the reverse lookup table while that table is being updated
return readGroupReverseLookupTable.get(key);
}
@Override
public int keyFromValue(final Object value) {
return keyForReadGroup((String) value);
}
/**
* Get the mapping from read group names to integer key values for all read groups in this covariate
* @return a set of mappings from read group names -> integer key values
*/
public Set<Map.Entry<String, Integer>> getKeyMap() {
return readGroupLookupTable.entrySet();
}
private int keyForReadGroup(final String readGroupId) {
// Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR),
// synchronize only the table updates.
// Before entering the synchronized block, check to see if this read group is not in our tables.
// If it's not, either we will have to insert it, OR another thread will insert it first.
// This preliminary check avoids doing any synchronization most of the time.
if ( ! readGroupLookupTable.containsKey(readGroupId) ) {
synchronized ( this ) {
// Now we need to make sure the key is STILL not there, since another thread may have come along
// and inserted it while we were waiting to enter this synchronized block!
if ( ! readGroupLookupTable.containsKey(readGroupId) ) {
readGroupLookupTable.put(readGroupId, nextId);
readGroupReverseLookupTable.put(nextId, readGroupId);
nextId++;
}
}
}
return readGroupLookupTable.get(readGroupId);
}
@Override
public synchronized int maximumKeyValue() {
// Synchronized so that we don't query table size while the tables are being updated
return readGroupLookupTable.size() - 1;
}
/**
* If the sample has a PU tag annotation, return that. If not, return the read group id.
*
* @param rg the read group record
* @return platform unit or readgroup id
*/
private String readGroupValueFromRG(final GATKSAMReadGroupRecord rg) {
if ( forceReadGroup != null )
return forceReadGroup;
final String platformUnit = rg.getPlatformUnit();
return platformUnit == null ? rg.getId() : platformUnit;
}
}

View File

@ -0,0 +1,285 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.gatk.engine.recalibration.ReadCovariates;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.collections.Pair;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
public abstract class RepeatCovariate implements ExperimentalCovariate {
protected int MAX_REPEAT_LENGTH;
protected int MAX_STR_UNIT_LENGTH;
private final HashMap<String, Integer> repeatLookupTable = new HashMap<String, Integer>();
private final HashMap<Integer, String> repeatReverseLookupTable = new HashMap<Integer, String>();
private int nextId = 0;
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
MAX_STR_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH;
MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH;
}
public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) {
this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH;
this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH;
}
@Override
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
// store the original bases and then write Ns over low quality ones
final byte[] originalBases = read.getReadBases().clone();
final boolean negativeStrand = read.getReadNegativeStrandFlag();
byte[] bases = read.getReadBases();
if (negativeStrand)
bases = BaseUtils.simpleReverseComplement(bases);
// don't record reads with N's
if (!BaseUtils.isAllRegularBases(bases))
return;
for (int i = 0; i < bases.length; i++) {
final Pair<byte[], Integer> res = findTandemRepeatUnits(bases, i);
// to merge repeat unit and repeat length to get covariate value:
final String repeatID = getCovariateValueFromUnitAndLength(res.first, res.second);
final int key = keyForRepeat(repeatID);
final int readOffset = (negativeStrand ? bases.length - i - 1 : i);
values.addCovariate(key, key, key, readOffset);
}
// put the original bases back in
read.setReadBases(originalBases);
}
public Pair<byte[], Integer> findTandemRepeatUnits(byte[] readBases, int offset) {
int maxBW = 0;
byte[] bestBWRepeatUnit = new byte[]{readBases[offset]};
for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) {
// fix repeat unit length
//edge case: if candidate tandem repeat unit falls beyond edge of read, skip
if (offset+1-str < 0)
break;
// get backward repeat unit and # repeats
byte[] backwardRepeatUnit = Arrays.copyOfRange(readBases, offset - str + 1, offset + 1);
maxBW = GATKVariantContextUtils.findNumberOfRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false);
if (maxBW > 1) {
bestBWRepeatUnit = backwardRepeatUnit.clone();
break;
}
}
byte[] bestRepeatUnit = bestBWRepeatUnit;
int maxRL = maxBW;
if (offset < readBases.length-1) {
byte[] bestFWRepeatUnit = new byte[]{readBases[offset+1]};
int maxFW = 0;
for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) {
// fix repeat unit length
//edge case: if candidate tandem repeat unit falls beyond edge of read, skip
if (offset+str+1 > readBases.length)
break;
// get forward repeat unit and # repeats
byte[] forwardRepeatUnit = Arrays.copyOfRange(readBases, offset +1, offset+str+1);
maxFW = GATKVariantContextUtils.findNumberOfRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true);
if (maxFW > 1) {
bestFWRepeatUnit = forwardRepeatUnit.clone();
break;
}
}
// if FW repeat unit = BW repeat unit it means we're in the middle of a tandem repeat - add FW and BW components
if (Arrays.equals(bestFWRepeatUnit, bestBWRepeatUnit)) {
maxRL = maxBW + maxFW;
bestRepeatUnit = bestFWRepeatUnit; // arbitrary
}
else {
// tandem repeat starting forward from current offset.
// It could be the case that best BW unit was differnet from FW unit, but that BW still contains FW unit.
// For example, TTCTT(C) CCC - at (C) place, best BW unit is (TTC)2, best FW unit is (C)3.
// but correct representation at that place might be (C)4.
// Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add
// representations to total
maxBW = GATKVariantContextUtils.findNumberOfRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false);
maxRL = maxFW + maxBW;
bestRepeatUnit = bestFWRepeatUnit;
}
}
if(maxRL > MAX_REPEAT_LENGTH) { maxRL = MAX_REPEAT_LENGTH; }
return new Pair<byte[], Integer>(bestRepeatUnit, maxRL);
}
@Override
public final Object getValue(final String str) {
return str;
}
@Override
public synchronized String formatKey(final int key) {
// This method is synchronized so that we don't attempt to do a get()
// from the reverse lookup table while that table is being updated
return repeatReverseLookupTable.get(key);
}
@Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"})
@Ensures("result != null")
protected abstract String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength);
@Override
public int keyFromValue(final Object value) {
return keyForRepeat((String) value);
}
/**
* Get the mapping from read group names to integer key values for all read groups in this covariate
* @return a set of mappings from read group names -> integer key values
*/
public Set<Map.Entry<String, Integer>> getKeyMap() {
return repeatLookupTable.entrySet();
}
private int keyForRepeat(final String repeatID) {
// Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR),
// synchronize only the table updates.
// Before entering the synchronized block, check to see if this read group is not in our tables.
// If it's not, either we will have to insert it, OR another thread will insert it first.
// This preliminary check avoids doing any synchronization most of the time.
if ( ! repeatLookupTable.containsKey(repeatID) ) {
synchronized ( this ) {
// Now we need to make sure the key is STILL not there, since another thread may have come along
// and inserted it while we were waiting to enter this synchronized block!
if ( ! repeatLookupTable.containsKey(repeatID) ) {
repeatLookupTable.put(repeatID, nextId);
repeatReverseLookupTable.put(nextId, repeatID);
nextId++;
}
}
}
return repeatLookupTable.get(repeatID);
}
/**
* Splits repeat unit and num repetitions from covariate value.
* For example, if value if "ATG4" it returns (ATG,4)
* @param value Covariate value
* @return Split pair
*/
@Requires("value != null")
@Ensures({"result.first != null","result.second>=0"})
public static Pair<String,Integer> getRUandNRfromCovariate(final String value) {
int k = 0;
for ( k=0; k < value.length(); k++ ) {
if (!BaseUtils.isRegularBase(value.getBytes()[k]))
break;
}
Integer nr = Integer.valueOf(value.substring(k,value.length())); // will throw NumberFormatException if format illegal
if (k == value.length() || nr <= 0)
throw new IllegalStateException("Covariate is not of form (Repeat Unit) + Integer");
return new Pair<String,Integer>(value.substring(0,k), nr);
}
/**
* Gets bases from tandem repeat representation (Repeat Unit),(Number of Repeats).
* For example, (AGC),3 returns AGCAGCAGC
* @param repeatUnit Tandem repeat unit
* @param numRepeats Number of repeats
* @return Expanded String
*/
@Requires({"numRepeats > 0","repeatUnit != null"})
@Ensures("result != null")
public static String getBasesFromRUandNR(final String repeatUnit, final int numRepeats) {
final StringBuilder sb = new StringBuilder();
for (int i=0; i < numRepeats; i++)
sb.append(repeatUnit);
return sb.toString();
}
// version given covariate key
public static String getBasesFromRUandNR(final String covariateValue) {
Pair<String,Integer> pair = getRUandNRfromCovariate(covariateValue);
return getBasesFromRUandNR(pair.getFirst(), pair.getSecond());
}
@Override
public abstract int maximumKeyValue();
}

View File

@ -0,0 +1,74 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
public class RepeatLengthCovariate extends RepeatCovariate {
@Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"})
@Ensures("result != null")
protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) {
return String.format("%d",repeatLength);
}
@Override
public synchronized int maximumKeyValue() {
// Synchronized so that we don't query table size while the tables are being updated
//return repeatLookupTable.size() - 1;
// max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH,
// so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values
return (1+MAX_REPEAT_LENGTH);
}
}

View File

@ -0,0 +1,75 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
public class RepeatUnitAndLengthCovariate extends RepeatCovariate {
@Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"})
@Ensures("result != null")
protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) {
return new String(repeatFromUnitAndLength) + String.format("%d",repeatLength);
}
@Override
public synchronized int maximumKeyValue() {
// Synchronized so that we don't query table size while the tables are being updated
//return repeatLookupTable.size() - 1;
// max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH,
// so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values
return (1<<(2*MAX_STR_UNIT_LENGTH)) * MAX_REPEAT_LENGTH +1;
}
}

View File

@ -0,0 +1,78 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 11/3/12
*/
public class RepeatUnitCovariate extends RepeatCovariate {
protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) {
return new String(repeatFromUnitAndLength);
}
@Override
public synchronized int maximumKeyValue() {
// Synchronized so that we don't query table size while the tables are being updated
//return repeatLookupTable.size() - 1;
// max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH,
// so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values
return (1<<(2*MAX_STR_UNIT_LENGTH)) +1;
}
}

View File

@ -0,0 +1,81 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
/**
* [Short one sentence description of this walker]
* <p/>
* <p>
* [Functionality of this walker]
* </p>
* <p/>
* <h3>Input</h3>
* <p>
* [Input description]
* </p>
* <p/>
* <h3>Output</h3>
* <p>
* [Output description]
* </p>
* <p/>
* <h3>Examples</h3>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T $WalkerName
* </pre>
*
* @author Your Name
* @since Date created
*/
public interface RequiredCovariate extends Covariate {}

View File

@ -0,0 +1,81 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.engine.recalibration.covariates;
/**
* [Short one sentence description of this walker]
* <p/>
* <p>
* [Functionality of this walker]
* </p>
* <p/>
* <h3>Input</h3>
* <p>
* [Input description]
* </p>
* <p/>
* <h3>Output</h3>
* <p>
* [Output description]
* </p>
* <p/>
* <h3>Examples</h3>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T $WalkerName
* </pre>
*
* @author Your Name
* @since Date created
*/
public interface StandardCovariate extends Covariate {}

View File

@ -54,21 +54,19 @@ package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeAlleleCounts;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.Arrays;
import java.util.Collections;
@ -77,17 +75,17 @@ import java.util.List;
/**
* Allele count and frequency expectation per sample
*
* Needs documentation
* <p>This annotation calculates the maximum likelihood (ML) number and frequency of alternate alleles for each individual sample at a site. In essence, it is equivalent to calculating the sum of "1"s in a genotype (for a biallelic site).</p>
*
*/
@SuppressWarnings("unused")
public final class AlleleCountBySample extends GenotypeAnnotation {
private final static List<String> keyNames = Collections.unmodifiableList(Arrays.asList(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY,VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY));
private final static List<String> keyNames = Collections.unmodifiableList(Arrays.asList(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY,GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY));
private final static List<VCFFormatHeaderLine> descriptors = Collections.unmodifiableList(Arrays.asList(
new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample"),
new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction, in the same order as listed, for each individual sample")
GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY),
GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY)
));
@Override
@ -121,8 +119,8 @@ public final class AlleleCountBySample extends GenotypeAnnotation {
AC[alleleIndex - 1] = alleleCount;
AF[alleleIndex - 1] = ((double) alleleCount) / (double) ploidy;
}
gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, Utils.asList(AC));
gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, Utils.asList(AF));
gb.attribute(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, AC);
gb.attribute(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, AF);
}
@Override

View File

@ -0,0 +1,118 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Genotype;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
public class AnnotationUtils {
/**
* Checks if the input data is appropriate
*
* @param walker input walker
* @param map input map for each read, holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
* @param g input genotype
* @param warningsLogged array that enforces the warning is logged once for each caller
* @param logger logger specific for each caller
*
* @return true if the walker is a HaplotypeCaller, the likelihood map is non-null and the genotype is non-null and called, false otherwise
* @throws ReviewedGATKException if the size of warningsLogged is less than 4.
*/
public static boolean isAppropriateInput(final AnnotatorCompatible walker, final PerReadAlleleLikelihoodMap map, final Genotype g, final boolean[] warningsLogged, final Logger logger) {
if ( warningsLogged.length < 4 ){
throw new ReviewedGATKException("Warnings logged array must have at last 4 elements, but has " + warningsLogged.length);
}
if ( !(walker instanceof HaplotypeCaller) ) {
if ( !warningsLogged[0] ) {
if ( walker != null )
logger.warn("Annotation will not be calculated, must be called from HaplotyepCaller, not " + walker.getClass().getName());
else
logger.warn("Annotation will not be calculated, must be called from HaplotyepCaller");
warningsLogged[0] = true;
}
return false;
}
if ( map == null ){
if ( !warningsLogged[1] ) {
logger.warn("Annotation will not be calculated, can only be used with likelihood based annotations in the HaplotypeCaller");
warningsLogged[1] = true;
}
return false;
}
if ( g == null ){
if ( !warningsLogged[2] ) {
logger.warn("Annotation will not be calculated, missing genotype");
warningsLogged[2]= true;
}
return false;
}
if ( !g.isCalled() ){
if ( !warningsLogged[3] ) {
logger.warn("Annotation will not be calculated, genotype is not called");
warningsLogged[3] = true;
}
return false;
}
return true;
}
}

View File

@ -54,15 +54,16 @@ package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.ReadUtils;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
/**
* Rank Sum Test of REF vs. ALT base quality scores
* Rank Sum Test of REF versus ALT base quality scores
*
* <p>This variant-level annotation tests compares the base qualities of the data supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.</p>
*
@ -75,10 +76,10 @@ import java.util.*;
*/
public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation {
@Override
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.BASE_QUAL_RANK_SUM_KEY); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
@Override
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {

View File

@ -52,9 +52,9 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.Walker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
@ -65,6 +65,7 @@ import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextUtils;
import org.broadinstitute.gatk.utils.variant.ChromosomeCountConstants;
import java.util.*;

View File

@ -51,16 +51,16 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
/**
* Rank Sum Test for hard-clipped bases on REF vs. ALT reads
* Rank Sum Test for hard-clipped bases on REF versus ALT reads
*
* <p>This variant-level annotation tests whether the data supporting the reference allele shows more or less base clipping (hard clips) than those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have more hard-clipped bases than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have fewer hard-clipped bases than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.</p>
*
@ -68,15 +68,15 @@ import java.util.*;
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveat</h3>
* <p>The clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
* <p>The clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
*
*/
public class ClippingRankSumTest extends RankSumTest {
@Override
public List<String> getKeyNames() { return Arrays.asList("ClippingRankSum"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.CLIPPING_RANK_SUM_KEY); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ClippingRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
@Override
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {

View File

@ -51,9 +51,9 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
@ -70,7 +70,7 @@ import java.util.List;
import java.util.Map;
/**
* Total depth of coverage per sample (in FORMAT) and over all samples (in INFO).
* Total depth of coverage per sample and over all samples.
*
* <p>This annotation is used to provide counts of read depth at two different levels, with some important differences. At the sample level (FORMAT), the DP value is the count of reads that passed the caller's internal quality control metrics (such as MAPQ > 17, for example). At the site level (INFO), the DP value is the unfiltered depth over all samples.</p>
*
@ -78,7 +78,7 @@ import java.util.Map;
*
* <h3>Caveats</h3>
* <ul>
* <li>If downsampling is enabled (as is done by default for some analyses to remove excessive coverage), the depth of coverage effectively seen by the caller may be inferior to the actual depth of coverage in the original file. If using `-dcov D`, the maximum depth that can be seen for N samples will be N * D.</li>
* <li>If downsampling is enabled (as is done by default for some analyses to remove excessive coverage), the depth of coverage effectively seen by the caller may be inferior to the actual depth of coverage in the original file. If using "-dcov D", the maximum depth that can be seen for N samples will be N * D.</li>
* </ul>
*
* <h3>Related annotations</h3>

View File

@ -51,9 +51,9 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
@ -90,8 +90,9 @@ import java.util.*;
* <h3>Related annotations</h3>
* <ul>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_Coverage.php">Coverage</a></b> gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalance.php">AlleleBallance</a></b> is a generalization of this annotation over all samples.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalanceBySample.php">AlleleBallanceBySample</a></b> calculates allele balance for each individual sample.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalance.php">AlleleBalance</a></b> is a generalization of this annotation over all samples.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalanceBySample.php">AlleleBalanceBySample</a></b> calculates allele balance for each individual sample.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_StrandAlleleCountsBySample.php">StrandAlleleCountsBySample</a></b> stratifies depths by allele and strand.</li>
* </ul>
*/
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {

View File

@ -51,13 +51,14 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
@ -91,6 +92,11 @@ import java.util.*;
*
*/
public class DepthPerSampleHC extends GenotypeAnnotation {
private final static Logger logger = Logger.getLogger(DepthPerSampleHC.class);
private boolean alleleLikelihoodMapSubsetWarningLogged = false;
boolean[] warningsLogged = new boolean[4];
@Override
public void annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
@ -98,26 +104,29 @@ public class DepthPerSampleHC extends GenotypeAnnotation {
final VariantContext vc,
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
return;
final PerReadAlleleLikelihoodMap alleleLikelihoodMap){
if (alleleLikelihoodMap == null )
throw new IllegalStateException("DepthPerSampleHC can only be used with likelihood based annotations in the HaplotypeCaller");
if ( !AnnotationUtils.isAppropriateInput(walker, alleleLikelihoodMap, g, warningsLogged, logger) ) {
return;
}
// the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot
// differentiate between reads that align over the event but aren't informative vs. those that aren't even
// close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP).
int dp = 0;
if ( alleleLikelihoodMap.isEmpty() ) {
// there are no reads
} else {
// there are reads
if ( !alleleLikelihoodMap.isEmpty() ) {
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
if ( ! alleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + alleleLikelihoodMap.getAllelesSet());
if ( !alleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) {
if ( !alleleLikelihoodMapSubsetWarningLogged ) {
logger.warn("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + alleleLikelihoodMap.getAllelesSet());
alleleLikelihoodMapSubsetWarningLogged = true;
}
return;
}
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
@ -130,10 +139,12 @@ public class DepthPerSampleHC extends GenotypeAnnotation {
}
}
@Override
public List<String> getKeyNames() {
return Collections.singletonList(VCFConstants.DEPTH_KEY);
}
@Override
public List<VCFFormatHeaderLine> getDescriptions() {
return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
}

View File

@ -52,24 +52,17 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import cern.jet.math.Arithmetic;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.QualityUtils;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
@ -99,10 +92,9 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
private final static boolean ENABLE_DEBUGGING = false;
private final static Logger logger = Logger.getLogger(FisherStrand.class);
private static final String FS = "FS";
private static final double MIN_PVALUE = 1E-320;
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;
private static final int MIN_COUNT = 2;
private static final int MIN_COUNT = ARRAY_DIM;
@Override
protected Map<String, Object> calculateAnnotationFromGTfield(final GenotypesContext genotypes){
@ -113,8 +105,8 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
@Override
protected Map<String, Object> calculateAnnotationFromStratifiedContexts(final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc){
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1, MIN_COUNT);
final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST, MIN_COUNT);
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), -1, MIN_COUNT);
final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), MIN_QUAL_FOR_FILTERED_TEST, MIN_COUNT);
printTable("unfiltered", tableNoFiltering);
printTable("filtered", tableFiltering);
return pValueForBestTable(tableFiltering, tableNoFiltering);
@ -159,15 +151,17 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
*/
protected Map<String, Object> annotationForOneTable(final double pValue) {
final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs
return Collections.singletonMap(FS, value);
return Collections.singletonMap(getKeyNames().get(0), value);
}
@Override
public List<String> getKeyNames() {
return Collections.singletonList(FS);
return Collections.singletonList(GATKVCFConstants.FISHER_STRAND_KEY);
}
@Override
public List<VCFInfoHeaderLine> getDescriptions() {
return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias"));
return Collections.singletonList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
}
/**
@ -176,16 +170,20 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
* @return the array used by the per-sample Strand Bias annotation
*/
public static List<Integer> getContingencyArray( final int[][] table ) {
if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); }
if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); }
final List<Integer> list = new ArrayList<>(4); // TODO - if we ever want to do something clever with multi-allelic sites this will need to change
if(table.length != ARRAY_DIM || table[0].length != ARRAY_DIM) {
logger.warn("Expecting a " + ARRAY_DIM + "x" + ARRAY_DIM + " strand bias table.");
return null;
}
final List<Integer> list = new ArrayList<>(ARRAY_SIZE); // TODO - if we ever want to do something clever with multi-allelic sites this will need to change
list.add(table[0][0]);
list.add(table[0][1]);
list.add(table[1][0]);
list.add(table[1][1]);
return list;
}
private Double pValueForContingencyTable(int[][] originalTable) {
public static Double pValueForContingencyTable(int[][] originalTable) {
final int[][] normalizedTable = normalizeContingencyTable(originalTable);
int[][] table = copyContingencyTable(normalizedTable);
@ -239,9 +237,9 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
final double normalizationFactor = (double)sum / TARGET_TABLE_SIZE;
final int[][] normalized = new int[2][2];
for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2; j++ )
final int[][] normalized = new int[ARRAY_DIM][ARRAY_DIM];
for ( int i = 0; i < ARRAY_DIM; i++ ) {
for ( int j = 0; j < ARRAY_DIM; j++ )
normalized[i][j] = (int)(table[i][j] / normalizationFactor);
}
@ -249,10 +247,10 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
}
private static int [][] copyContingencyTable(int [][] t) {
int[][] c = new int[2][2];
int[][] c = new int[ARRAY_DIM][ARRAY_DIM];
for ( int i = 0; i < 2; i++ )
for ( int j = 0; j < 2; j++ )
for ( int i = 0; i < ARRAY_DIM; i++ )
for ( int j = 0; j < ARRAY_DIM; j++ )
c[i][j] = t[i][j];
return c;
@ -271,28 +269,28 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
*/
private void printTable(final String name, final int[][] table) {
if ( ENABLE_DEBUGGING ) {
final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(FS);
final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(getKeyNames().get(0));
logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s",
name, table[0][0], table[0][1], table[1][0], table[1][1], pValue));
}
}
private static boolean rotateTable(int[][] table) {
table[0][0] -= 1;
table[1][0] += 1;
table[0][0]--;
table[1][0]++;
table[0][1] += 1;
table[1][1] -= 1;
table[0][1]++;
table[1][1]--;
return (table[0][0] >= 0 && table[1][1] >= 0);
}
private static boolean unrotateTable(int[][] table) {
table[0][0] += 1;
table[1][0] -= 1;
table[0][0]++;
table[1][0]--;
table[0][1] -= 1;
table[1][1] += 1;
table[0][1]--;
table[1][1]++;
return (table[0][1] >= 0 && table[1][0] >= 0);
}

View File

@ -51,20 +51,17 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.Arrays;
import java.util.HashMap;
@ -89,14 +86,14 @@ public class GCContent extends InfoFieldAnnotation {
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
double content = computeGCContent(ref);
Map<String, Object> map = new HashMap<String, Object>();
Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), String.format("%.2f", content));
return map;
}
public List<String> getKeyNames() { return Arrays.asList("GC"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.GC_CONTENT_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("GC", 1, VCFHeaderLineType.Integer, "GC content around the variant (see docs for window size details)")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));}
public boolean useZeroQualityReads() { return false; }

View File

@ -51,36 +51,39 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
/**
* Genotype summary statistics
* Summarize genotype statistics from all samples at the site level
*
* <p>This annotation collects several genotype-level statistics from all samples and summarizes them in the INFO field. The following statistics are collected:</p>
* <ul>
* <li>Number of called chromosomes (should amount to ploidy * called samples)</li>
* <li>Number of no-called samples</li>
* <li>p-value from Hardy-Weinberg Equilibrium test</li>
* <li>Mean of all GQ values</li>
* <li>Standard deviation of all GQ values</li>
* </ul>
* <h3>Note</h3>
* <p>These summaries can all be recomputed from the genotypes on the fly but it is a lot faster to add them here as INFO field annotations.</p>
*/
public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
public final static String CCC = "CCC";
public final static String NCC = "NCC";
public final static String HWP = "HWP";
public final static String GQ_MEAN = "GQ_MEAN";
public final static String GQ_STDDEV = "GQ_STDDEV";
@Override
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@ -92,7 +95,7 @@ public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegi
return null;
final Map<String,Object> returnMap = new HashMap<>();
returnMap.put(NCC, vc.getNoCallCount());
returnMap.put(GATKVCFConstants.NOCALL_CHROM_KEY, vc.getNoCallCount());
final MathUtils.RunningAverage average = new MathUtils.RunningAverage();
for( final Genotype g : vc.getGenotypes() ) {
@ -101,9 +104,9 @@ public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegi
}
}
if( average.observationCount() > 0L ) {
returnMap.put(GQ_MEAN, String.format("%.2f", average.mean()));
returnMap.put(GATKVCFConstants.GQ_MEAN_KEY, String.format("%.2f", average.mean()));
if( average.observationCount() > 1L ) {
returnMap.put(GQ_STDDEV, String.format("%.2f", average.stddev()));
returnMap.put(GATKVCFConstants.GQ_STDEV_KEY, String.format("%.2f", average.stddev()));
}
}
@ -112,17 +115,9 @@ public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegi
@Override
public List<String> getKeyNames() {
return Arrays.asList(CCC, NCC, HWP, GQ_MEAN, GQ_STDDEV);
}
@Override
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(
new VCFInfoHeaderLine(CCC, 1, VCFHeaderLineType.Integer, "Number of called chromosomes"),
new VCFInfoHeaderLine(NCC, 1, VCFHeaderLineType.Integer, "Number of no-called samples"),
new VCFInfoHeaderLine(HWP, 1, VCFHeaderLineType.Float, "P value from test of Hardy Weinberg Equilibrium"),
new VCFInfoHeaderLine(GQ_MEAN, 1, VCFHeaderLineType.Float, "Mean of all GQ values"),
new VCFInfoHeaderLine(GQ_STDDEV, 1, VCFHeaderLineType.Float, "Standard deviation of all GQ values")
);
GATKVCFConstants.NOCALL_CHROM_KEY,
GATKVCFConstants.GQ_MEAN_KEY,
GATKVCFConstants.GQ_STDEV_KEY);
}
}

View File

@ -50,11 +50,12 @@
*/
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
@ -63,16 +64,16 @@ import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.io.Serializable;
import java.util.*;
@ -87,17 +88,33 @@ import java.util.*;
* <p>HaplotypeCaller does not output this annotation because it already evaluates haplotype segregation internally. This annotation is only informative (and available) for variants called by Unified Genotyper.</p>
*/
public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private final static Logger logger = Logger.getLogger(HaplotypeScore.class);
private boolean walkerIdentityCheckWarningLogged = false;
private final static boolean DEBUG = false;
private final static int MIN_CONTEXT_WING_SIZE = 10;
private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50;
private final static char REGEXP_WILDCARD = '.';
@Override
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
// Can only call from UnifiedGenotyper
if ( !(walker instanceof UnifiedGenotyper) ) {
if ( !walkerIdentityCheckWarningLogged ) {
if ( walker != null )
logger.warn("Annotation will not be calculated, must be called from UnifiedGenotyper, not " + walker.getClass().getName());
else
logger.warn("Annotation will not be calculated, must be called from UnifiedGenotyper");
walkerIdentityCheckWarningLogged = true;
}
return null;
}
if (vc.isSNP() && stratifiedContexts != null)
return annotatePileup(ref, stratifiedContexts, vc);
else
@ -108,7 +125,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc) {
if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here
if (stratifiedContexts.isEmpty()) // empty means that call was made by someone else and we have no data here
return null;
final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values());
@ -135,7 +152,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
}
// annotate the score in the info field
final Map<String, Object> map = new HashMap<String, Object>();
final Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), String.format("%.4f", scoreRA.mean()));
return map;
}
@ -157,8 +174,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
int haplotypesToCompute = vc.getAlternateAlleles().size() + 1;
final PriorityQueue<Haplotype> candidateHaplotypeQueue = new PriorityQueue<Haplotype>(100, new HaplotypeComparator());
final PriorityQueue<Haplotype> consensusHaplotypeQueue = new PriorityQueue<Haplotype>(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator());
final PriorityQueue<Haplotype> candidateHaplotypeQueue = new PriorityQueue<>(100, new HaplotypeComparator());
final PriorityQueue<Haplotype> consensusHaplotypeQueue = new PriorityQueue<>(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator());
for (final PileupElement p : pileup) {
final Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize, locus);
@ -198,7 +215,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
// The consensus haplotypes are in a quality-ordered priority queue, so the best haplotypes are just the ones at the front of the queue
final Haplotype haplotype1 = consensusHaplotypeQueue.poll();
List<Haplotype> hlist = new ArrayList<Haplotype>();
List<Haplotype> hlist = new ArrayList<>();
hlist.add(new Haplotype(haplotype1.getBases(), 60));
for (int k = 1; k < haplotypesToCompute; k++) {
@ -313,7 +330,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
if (DEBUG) System.out.printf("HAP1: %s%n", haplotypes.get(0));
if (DEBUG) System.out.printf("HAP2: %s%n", haplotypes.get(1));
final ArrayList<double[]> haplotypeScores = new ArrayList<double[]>();
final ArrayList<double[]> haplotypeScores = new ArrayList<>();
for (final PileupElement p : pileup) {
// Score all the reads in the pileup, even the filtered ones
final double[] scores = new double[haplotypes.size()];
@ -394,12 +411,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
return mismatches - expected;
}
@Override
public List<String> getKeyNames() {
return Arrays.asList("HaplotypeScore");
return Arrays.asList(GATKVCFConstants.HAPLOTYPE_SCORE_KEY);
}
@Override
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes"));
return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
}
private static class Haplotype {

View File

@ -52,25 +52,23 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.WorkInProgressAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.QualityUtils;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
@ -83,17 +81,20 @@ import java.util.Map;
*
* <h3>Caveats</h3>
* <ul>
* <li>This annotation requires multiple samples and a valid pedigree file.</li>
* <li>This annotation requires multiple samples.</li>
* <li>This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.</li>
* <li>Low confidence genotypes are ignored, which may adversely affect HW ratios. More analysis is needed to determine the right thing to do when the genotyper cannot decide whether a given sample is heterozygous or homozygous variant.</li>
* </ul>
*/
public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAnnotation {
private final static Logger logger = Logger.getLogger(HardyWeinberg.class);
private static final int MIN_SAMPLES = 10;
private static final int MIN_GENOTYPE_QUALITY = 10;
private static final int MIN_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10;
private boolean warningLogged = false;
@Override
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
@ -102,8 +103,13 @@ public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAn
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
final GenotypesContext genotypes = vc.getGenotypes();
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) {
if ( !warningLogged ) {
logger.warn("Too few genotypes");
warningLogged = true;
}
return null;
}
int refCount = 0;
int hetCount = 0;
@ -132,12 +138,14 @@ public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAn
double pvalue = HardyWeinbergCalculation.hwCalculate(refCount, hetCount, homCount);
//System.out.println(refCount + " " + hetCount + " " + homCount + " " + pvalue);
Map<String, Object> map = new HashMap<String, Object>();
Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), String.format("%.1f", QualityUtils.phredScaleErrorRate(pvalue)));
return map;
}
public List<String> getKeyNames() { return Arrays.asList("HW"); }
@Override
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.HARDY_WEINBERG_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HW", 1, VCFHeaderLineType.Float, "Phred-scaled p-value for Hardy-Weinberg violation")); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
}

View File

@ -51,17 +51,18 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.GenomeLoc;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.Arrays;
import java.util.HashMap;
@ -103,14 +104,14 @@ public class HomopolymerRun extends InfoFieldAnnotation implements ExperimentalA
return null;
}
Map<String, Object> map = new HashMap<String, Object>();
Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), String.format("%d", run));
return map;
}
public List<String> getKeyNames() { return Arrays.asList("HRun"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.HOMOPOLYMER_RUN_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HRun", 1, VCFHeaderLineType.Integer, "Largest Contiguous Homopolymer Run of Variant Allele In Either Direction")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
public boolean useZeroQualityReads() { return false; }

View File

@ -52,9 +52,10 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.Walker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
@ -62,11 +63,12 @@ import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnot
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.MathUtils;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
@ -80,15 +82,21 @@ import java.util.*;
* <p>The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is a Phred-scaled p-value derived from running the HW test for disequilibrium with PL values. See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this statistical test.</p>
*
* <h3>Caveats</h3>
* <h4>Note that the Inbreeding Coefficient can only be calculated for cohorts containing at least 10 founder samples.</h4>
* <ul>
* <li>The Inbreeding Coefficient can only be calculated for cohorts containing at least 10 founder samples.</li>
* <li>This annotation is used in variant recalibration, but may not be appropriate for that purpose if the cohort being analyzed contains many closely related individuals.</li>
* <li>This annotation requires a valid pedigree file.</li>
* </ul>
*
*/
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private final static Logger logger = Logger.getLogger(InbreedingCoeff.class);
private static final int MIN_SAMPLES = 10;
private static final String INBREEDING_COEFFICIENT_KEY_NAME = "InbreedingCoeff";
private Set<String> founderIds;
private int sampleCount;
private boolean pedigreeCheckWarningLogged = false;
private boolean didUniquifiedSampleNameCheck = false;
@Override
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
@ -98,9 +106,24 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
//If available, get the founder IDs and cache them. the IC will only be computed on founders then.
if(founderIds == null && walker != null)
founderIds = ((Walker)walker).getSampleDB().getFounderIds();
return makeCoeffAnnotation(vc);
if(founderIds == null && walker != null) {
founderIds = ((Walker) walker).getSampleDB().getFounderIds();
}
//if none of the "founders" are in the vc samples, assume we uniquified the samples upstream and they are all founders
if (!didUniquifiedSampleNameCheck) {
checkSampleNames(vc);
didUniquifiedSampleNameCheck = true;
}
if ( founderIds == null || founderIds.isEmpty() ) {
if ( !pedigreeCheckWarningLogged ) {
logger.warn("Annotation will not be calculated, must provide a valid PED file (-ped) from the command line.");
pedigreeCheckWarningLogged = true;
}
return null;
}
else{
return makeCoeffAnnotation(vc);
}
}
protected double calculateIC(final VariantContext vc, final GenotypesContext genotypes) {
@ -124,7 +147,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
{
if (g.isHetNonRef()) {
//all likelihoods go to homCount
homCount += 1;
homCount++;
continue;
}
@ -168,9 +191,25 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.4f", F));
}
@Override
public List<String> getKeyNames() { return Collections.singletonList(INBREEDING_COEFFICIENT_KEY_NAME); }
//this method is intended to reconcile uniquified sample names
// it comes into play when calling this annotation from GenotypeGVCFs with --uniquifySamples because founderIds
// is derived from the sampleDB, which comes from the input sample names, but vc will have uniquified (i.e. different)
// sample names. Without this check, the founderIds won't be found in the vc and the annotation won't be calculated.
protected void checkSampleNames(final VariantContext vc) {
Set<String> vcSamples = new HashSet<>();
vcSamples.addAll(vc.getSampleNames());
if (!vcSamples.isEmpty()) {
if (founderIds!=null) {
vcSamples.removeAll(founderIds);
if (vcSamples.equals(vc.getSampleNames()))
founderIds = vc.getSampleNames();
}
}
}
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Collections.singletonList(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY_NAME, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); }
public List<String> getKeyNames() { return Collections.singletonList(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Collections.singletonList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
}

View File

@ -53,14 +53,15 @@ package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.Arrays;
import java.util.List;
/**
* Rank Sum Test of per-read likelihoods of REF vs. ALT reads
* Rank Sum Test of per-read likelihoods of REF versus ALT reads
*
* <p>This variant-level annotation compares the likelihoods of reads to their best haplotype match, between reads that support the reference allele and those that support the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower likelihoods to their best haplotype match than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher likelihoods to their best haplotype match than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.</p>
*
@ -73,10 +74,10 @@ import java.util.List;
*/
public class LikelihoodRankSumTest extends RankSumTest {
@Override
public List<String> getKeyNames() { return Arrays.asList("LikelihoodRankSum"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.LIKELIHOOD_RANK_SUM_KEY); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("LikelihoodRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref haplotype likelihoods")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
@Override
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, final MostLikelyAllele mostLikelyAllele) {

View File

@ -51,27 +51,28 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.samples.Trio;
import org.broadinstitute.gatk.engine.walkers.Walker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.MendelianViolation;
import htsjdk.variant.vcf.VCFHeaderLineType;
import org.broadinstitute.gatk.engine.samples.MendelianViolation;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
/**
* Likelihood of being a Mendelian Violation
*
* <p>This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible <em><de novo/em> mutation. The higher the output value, the more likely there is to be a Mendelian violation. Note that only positive values indicating likely MVs will be annotated; if the value for a given site is negative (indicating that there is no violation) the annotation is not written to the file.</p>
* <p>This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible <em><de novo</em> mutation. The higher the output value, the more likely there is to be a Mendelian violation. Note that only positive values indicating likely MVs will be annotated; if the value for a given site is negative (indicating that there is no violation) the annotation is not written to the file.</p>
*
* <h3>Statistical notes</h3>
* <p>This annotation considers all possible combinations of all possible genotypes (homozygous-reference, heterozygous, and homozygous-variant) for each member of a trio, which amounts to 27 possible combinations. Using the Phred-scaled genotype likelihoods (PL values) from each individual, the likelihood of each combination is calculated, and the result contributes to the likelihood of the corresponding case (mendelian violation or non-violation) depending on which set it belongs to. See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this statistical test.</p>
@ -81,7 +82,7 @@ import java.util.*;
* <li>The calculation assumes that the organism is diploid.</li>
* <li>This annotation requires a valid pedigree file.</li>
* <li>When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.</li>
* <li>This annotation can only be used from the Variant Annotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.</li>
* <li>This annotation can only be used from the VariantAnnotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.</li>
* </ul>
*
* <h3>Related annotations</h3>
@ -93,9 +94,11 @@ import java.util.*;
public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiringAnnotation {
private final static Logger logger = Logger.getLogger(MVLikelihoodRatio.class);
private MendelianViolation mendelianViolation = null;
public static final String MVLR_KEY = "MVLR";
private Set<Trio> trios;
private boolean walkerIdentityCheckWarningLogged = false;
private boolean pedigreeCheckWarningLogged = false;
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@ -103,17 +106,33 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiri
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
if ( mendelianViolation == null ) {
trios = ((Walker) walker).getSampleDB().getTrios();
if ( trios.size() > 0 ) {
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
}
else {
throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line.");
// Can only be called from VariantAnnotator
if ( !(walker instanceof VariantAnnotator) ) {
if ( !walkerIdentityCheckWarningLogged ) {
if ( walker != null )
logger.warn("Annotation will not be calculated, must be called from VariantAnnotator, not " + walker.getClass().getName());
else
logger.warn("Annotation will not be calculated, must be called from VariantAnnotator");
walkerIdentityCheckWarningLogged = true;
}
return null;
}
Map<String,Object> attributeMap = new HashMap<String,Object>(1);
if ( mendelianViolation == null ) {
// Must have a pedigree file
trios = ((Walker) walker).getSampleDB().getTrios();
if ( trios.isEmpty() ) {
if ( !pedigreeCheckWarningLogged ) {
logger.warn("Annotation will not be calculated, mendelian violation annotation must provide a valid PED file (-ped) from the command line.");
pedigreeCheckWarningLogged = true;
}
return null;
}
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
}
Map<String,Object> attributeMap = new HashMap<>(1);
//double pNoMV = 1.0;
double maxMVLR = Double.MIN_VALUE;
for ( Trio trio : trios ) {
@ -127,15 +146,16 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiri
//double pSomeMV = 1.0-pNoMV;
//toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV));
if ( Double.compare(maxMVLR,Double.MIN_VALUE) != 0 )
attributeMap.put(MVLR_KEY,maxMVLR);
attributeMap.put(getKeyNames().get(0), maxMVLR);
return attributeMap;
}
// return the descriptions used for the VCF INFO meta field
public List<String> getKeyNames() { return Arrays.asList(MVLR_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(MVLR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
@Override
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.MENDEL_VIOLATION_LR_KEY); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) {
for ( String sample : Arrays.asList(trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()) ) {

View File

@ -53,15 +53,16 @@ package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
/**
* Rank Sum Test for mapping qualities of REF vs. ALT reads
* Rank Sum Test for mapping qualities of REF versus ALT reads
*
* <p>This variant-level annotation compares the mapping qualities of the reads supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower mapping quality scores than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher mapping quality scores than those supporting the reference allele.</p>
* <p>This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
@ -80,10 +81,10 @@ import java.util.*;
*/
public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation {
@Override
public List<String> getKeyNames() { return Arrays.asList("MQRankSum"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.MAP_QUAL_RANK_SUM_KEY); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
@Override
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {

View File

@ -51,13 +51,13 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardUGAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
@ -78,6 +78,9 @@ import java.util.Map;
*
* <p>This anotation gives you the count of all reads that have MAPQ = 0 across all samples. The count of reads with MAPQ0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.</p>
*
* <h3>Caveat</h3>
* <p>It is not useful to apply this annotation with HaplotypeCaller because HC filters out all reads with MQ0 upfront, so the annotation will always return a value of 0.</p>
*
* <h3>Related annotations</h3>
* <ul>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_MappingQualityZeroBySample.php">MappingQualityZeroBySample</a></b> gives the count of reads with MAPQ=0 for each individual sample.</li>
@ -85,7 +88,7 @@ import java.util.Map;
* </ul>
*
*/
public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
public class MappingQualityZero extends InfoFieldAnnotation implements StandardUGAnnotation, ActiveRegionBasedAnnotation {
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@ -104,7 +107,7 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA
private Map<String, Object> annotatePileup(final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
if ( stratifiedContexts.isEmpty() )
return null;
int mq0 = 0;
@ -123,7 +126,7 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA
private Map<String, Object> annotateWithLikelihoods(final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap,
final VariantContext vc) {
if (stratifiedPerReadAlleleLikelihoodMap == null)
if ( stratifiedPerReadAlleleLikelihoodMap == null )
return null;
int mq0 = 0;

View File

@ -51,22 +51,21 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.samples.Sample;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.samples.Trio;
import org.broadinstitute.gatk.engine.walkers.Walker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.MendelianViolation;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.samples.MendelianViolation;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import java.util.*;
@ -82,7 +81,7 @@ import java.util.*;
* <li>Only reports possible de novos for children whose genotypes have not been tagged as filtered (which is most appropriate if parent likelihoods
* have already been factored in using PhaseByTransmission).</li>
* <li>When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.</li>
* <li>This annotation can only be used from the Variant Annotator.If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.</li>
* <li>This annotation can only be used from the Variant Annotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.</li>
* </ul>
*
* <h3>Related annotations</h3>
@ -94,14 +93,16 @@ import java.util.*;
public class PossibleDeNovo extends InfoFieldAnnotation implements RodRequiringAnnotation, ExperimentalAnnotation {
private final static Logger logger = Logger.getLogger(PossibleDeNovo.class);
private MendelianViolation mendelianViolation = null;
public static final String HI_CONF_DENOVO_KEY = "hiConfDeNovo";
public static final String LO_CONF_DENOVO_KEY = "loConfDeNovo";
private final int hi_GQ_threshold = 20;
private final int lo_GQ_threshold = 10;
private final int hi_GQ_threshold = 20; //WARNING - If you change this value, update the description in GATKVCFHeaderLines
private final int lo_GQ_threshold = 10; //WARNING - If you change this value, update the description in GATKVCFHeaderLines
private final double percentOfSamplesCutoff = 0.001; //for many, many samples use 0.1% of samples as allele frequency threshold for de novos
private final int flatNumberOfSamplesCutoff = 4;
private Set<Trio> trios;
private boolean walkerIdentityCheckWarningLogged = false;
private boolean pedigreeCheckWarningLogged = false;
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@ -109,21 +110,35 @@ public class PossibleDeNovo extends InfoFieldAnnotation implements RodRequiringA
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
if ( mendelianViolation == null ) {
trios = ((Walker) walker).getSampleDB().getTrios();
if ( trios.size() > 0 ) {
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
}
else {
throw new UserException("Possible de novos annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line.");
if ( !(walker instanceof VariantAnnotator ) ) {
if ( !walkerIdentityCheckWarningLogged ) {
if ( walker != null )
logger.warn("Annotation will not be calculated, must be called from VariantAnnotator, not " + walker.getClass().getName());
else
logger.warn("Annotation will not be calculated, must be called from VariantAnnotator");
walkerIdentityCheckWarningLogged = true;
}
return null;
}
final Map<String,Object> attributeMap = new HashMap<String,Object>(1);
if ( mendelianViolation == null ) {
trios = ((Walker) walker).getSampleDB().getTrios();
if ( trios.isEmpty() ) {
if ( !pedigreeCheckWarningLogged ) {
logger.warn("Annotation will not be calculated, must provide a valid PED file (-ped) from the command line.");
pedigreeCheckWarningLogged = true;
}
return null;
}
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
}
final Map<String,Object> attributeMap = new HashMap<>(1);
boolean isHighConfDeNovo = false;
boolean isLowConfDeNovo = false;
final List<String> highConfDeNovoChildren = new ArrayList<String>();
final List<String> lowConfDeNovoChildren = new ArrayList<String>();
final List<String> highConfDeNovoChildren = new ArrayList<>();
final List<String> lowConfDeNovoChildren = new ArrayList<>();
for ( final Trio trio : trios ) {
if (vc.isBiallelic() && contextHasTrioLikelihoods(vc,trio) && mendelianViolation.isViolation(trio.getMother(),trio.getFather(),trio.getChild(),vc) )
{
@ -146,18 +161,15 @@ public class PossibleDeNovo extends InfoFieldAnnotation implements RodRequiringA
final double AFcutoff = Math.max(flatNumberOfSamplesCutoff,percentNumberOfSamplesCutoff);
final int deNovoAlleleCount = vc.getCalledChrCount(vc.getAlternateAllele(0)); //we assume we're biallelic above so use the first alt
if ( isHighConfDeNovo && deNovoAlleleCount < AFcutoff )
attributeMap.put(HI_CONF_DENOVO_KEY,highConfDeNovoChildren);
attributeMap.put(GATKVCFConstants.HI_CONF_DENOVO_KEY,highConfDeNovoChildren);
if ( isLowConfDeNovo && deNovoAlleleCount < AFcutoff )
attributeMap.put(LO_CONF_DENOVO_KEY,lowConfDeNovoChildren);
attributeMap.put(GATKVCFConstants.LO_CONF_DENOVO_KEY,lowConfDeNovoChildren);
return attributeMap;
}
// return the descriptions used for the VCF INFO meta field
public List<String> getKeyNames() { return Arrays.asList(HI_CONF_DENOVO_KEY,LO_CONF_DENOVO_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(HI_CONF_DENOVO_KEY, 1, VCFHeaderLineType.String, "High confidence possible de novo mutation (GQ >= "+hi_GQ_threshold+" for all trio members)=[comma-delimited list of child samples]"),
new VCFInfoHeaderLine(LO_CONF_DENOVO_KEY, 1, VCFHeaderLineType.String, "Low confidence possible de novo mutation (GQ >= "+lo_GQ_threshold+" for child, GQ > 0 for parents)=[comma-delimited list of child samples]")); }
@Override
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.HI_CONF_DENOVO_KEY, GATKVCFConstants.LO_CONF_DENOVO_KEY); }
private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) {
for ( String sample : Arrays.asList(trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()) ) {

View File

@ -51,18 +51,20 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
@ -78,7 +80,7 @@ import java.util.*;
* <h3>Statistical notes</h3>
* <p>The calculation only takes into account coverage from samples genotyped as having the variant allele(s). This removes the influence of any homozygous-reference samples that might be present in the same cohort, which would otherwise penalize the call unfairly.</p>
*
* <h3>Caveats</h3>
* <h3>Caveat</h3>
* <p>This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.</p>
*
* <h3>Related annotations</h3>
@ -151,9 +153,9 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
return null;
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
// Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a
// full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper)
double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, ref != null));
// Hack: UnifiedGenotyper (but not HaplotypeCaller or GenotypeGVCFs) over-estimates the quality of long indels
// Penalize the QD calculation for UG indels to compensate for this
double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, walker instanceof UnifiedGenotyper));
// Hack: see note in the fixTooHighQD method below
QD = fixTooHighQD(QD);
@ -189,7 +191,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
if ( QD < MAX_QD_BEFORE_FIXING ) {
return QD;
} else {
return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
return IDEAL_HIGH_QD + Utils.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
}
}
@ -197,10 +199,10 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
private final static double IDEAL_HIGH_QD = 30;
private final static double JITTER_SIGMA = 3;
public List<String> getKeyNames() { return Arrays.asList("QD"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.QUAL_BY_DEPTH_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth"));
return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
}

View File

@ -51,9 +51,9 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;

View File

@ -52,9 +52,9 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;

View File

@ -57,17 +57,18 @@ import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.ReadUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
/**
* Rank Sum Test for relative positioning of REF vs. ALT alleles within reads
* Rank Sum Test for relative positioning of REF versus ALT alleles within reads
*
* <p>This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and alternate alleles. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.</p>
*
@ -85,11 +86,11 @@ import java.util.*;
public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation {
@Override
public List<String> getKeyNames() { return Arrays.asList("ReadPosRankSum"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.READ_POS_RANK_SUM_KEY); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"));
return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
}
@Override

View File

@ -51,17 +51,17 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.Arrays;
import java.util.HashMap;
@ -69,9 +69,9 @@ import java.util.List;
import java.util.Map;
/**
* List of samples that are polymorphic at a given site
* List samples that are non-reference at a given site
*
* <p>The output is a list of the samples that are genotyped as having one or more variant alleles. This allows you to easily determine which samples are polymorphic and compare them to samples that are homozygous-reference.</p>
* <p>The output is a list of the samples that are genotyped as having one or more variant alleles. This allows you to easily determine which samples are non-reference (heterozygous or homozygous-variant) and compare them to samples that are homozygous-reference.</p>
*/
public class SampleList extends InfoFieldAnnotation {
@ -84,7 +84,7 @@ public class SampleList extends InfoFieldAnnotation {
if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() )
return null;
StringBuffer samples = new StringBuffer();
final StringBuilder samples = new StringBuilder();
for ( Genotype genotype : vc.getGenotypesOrderedByName() ) {
if ( genotype.isCalled() && !genotype.isHomRef() ){
if ( samples.length() > 0 )
@ -97,11 +97,11 @@ public class SampleList extends InfoFieldAnnotation {
return null;
Map<String, Object> map = new HashMap<String, Object>();
map.put("Samples", samples.toString());
map.put(getKeyNames().get(0), samples.toString());
return map;
}
public List<String> getKeyNames() { return Arrays.asList("Samples"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.SAMPLE_LIST_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Samples", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "List of polymorphic samples")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
}

View File

@ -51,22 +51,22 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardUGAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
@ -76,20 +76,36 @@ import java.util.Map;
*
* <h3>Caveats</h3>
* <ul>
* <li>This annotation is not compatible with HaplotypeCaller; its purpose is to compensate for the UnifiedGenotyper's inability to integrate SNPs and indels in the same model (unlike HaplotypeCaller)</li>
* <li>In its current form, this annotation is not compatible with HaplotypeCaller. It is only meant to be used with UnifiedGenotyper, as its purpose is to compensate for the UnifiedGenotyper's inability to integrate SNPs and indels in the same model (unlike HaplotypeCaller).</li>
* <li>By default, the UnifiedGenotyper will not call variants where the fraction of spanning deletions is above a certain threshold. This threshold can be adjusted using the `--max_deletion_fraction` argument.</li>
* </ul>
*
*/
public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation {
public class SpanningDeletions extends InfoFieldAnnotation implements StandardUGAnnotation {
private final static Logger logger = Logger.getLogger(SpanningDeletions.class);
private boolean walkerIdentityCheckWarningLogged = false;
@Override
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
if ( stratifiedContexts.size() == 0 )
// Can only call from UnifiedGenotyper
if ( !(walker instanceof UnifiedGenotyper) ) {
if ( !walkerIdentityCheckWarningLogged ) {
if ( walker != null )
logger.warn("Annotation will not be calculated, must be called from UnifiedGenotyper, not " + walker.getClass().getName());
else
logger.warn("Annotation will not be calculated, must be called from UnifiedGenotyper");
walkerIdentityCheckWarningLogged = true;
}
return null;
}
if ( stratifiedContexts.isEmpty() )
return null;
// not meaningful when we're at an indel location: deletions that start at location N are by definition called at the position N-1, and at position N-1
@ -106,12 +122,14 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn
deletions++;
}
}
Map<String, Object> map = new HashMap<String, Object>();
Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth));
return map;
}
public List<String> getKeyNames() { return Arrays.asList("Dels"); }
@Override
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.SPANNING_DELETIONS_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Dels", 1, VCFHeaderLineType.Float, "Fraction of Reads Containing Spanning Deletions")); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
}

View File

@ -0,0 +1,169 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
* Number of forward and reverse reads that support each allele
*
* <p>The StrandAlleleCountsBySample annotation produces read counts per allele (including (REF) and per strand. Note that, as with the <a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">AD</a> annotation, the allele counts here should not be used to make assumptions about the called genotype.</p>
*
* <p>This annotation produces 2 values per allele at each site, corresponding to the number of reads that support the following (in that order):</p>
* <ul>
* <li>the reference allele on the forward strand</li>
* <li>the reference allele on the reverse strand</li>
* <li>the first alternate allele on the forward strand</li>
* <li>the first alternate allele on the reverse strand</li>
* <li>the second alternate allele on the forward strand</li>
* <li><i>...etc</i></li>
* </ul>
*
* <h3>Example</h3>
* <pre>GT:AD:GQ:PL:SB:SAC 1/2:1,18,12:99:1022,326,382,537,0,487:1,0,4,8:1,0,3,15,4,8</pre>
* <p>In this example, the reference allele is supported by 1 read on the forward strand, the first alternate allele is supported by 3 forward and 15 reverse reads, and the second alternate allele is supported by 4 forward and 8 reverse reads.</p>
*
* <h3>Caveats</h3>
* <ul>
* <li>This annotation can only be generated by HaplotypeCaller (it will not work when called from VariantAnnotator).</li>
* </ul>
*
* <h3>Related annotations</h3>
* <ul>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> displays the number of reads supporting each allele, without stratifying by strand.</li>
* </ul>
*/
public class StrandAlleleCountsBySample extends GenotypeAnnotation {
private final static Logger logger = Logger.getLogger(StrandAlleleCountsBySample.class);
boolean[] warningsLogged = new boolean[4];
@Override
public void annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final AlignmentContext stratifiedContext,
final VariantContext vc,
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( !AnnotationUtils.isAppropriateInput(walker, alleleLikelihoodMap, g, warningsLogged, logger) ) {
return;
}
gb.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, getStrandCounts(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc));
}
@Override
public List<String> getKeyNames() { return Collections.singletonList(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY); }
@Override
public List<VCFFormatHeaderLine> getDescriptions() {
return Collections.singletonList(GATKVCFHeaderLines.getFormatLine(getKeyNames().get(0)));
}
/**
* This method was inspired by (copied from) StrandBiasTest.getContingencyTable(). Unlike getContingencyTable, it
* returns values for all alleles rather than only reference and the most likely allele. Since this is not useful
* for StrandBias calculations, it's here and it skips the Nx2 table format used in that method
*/
private int[] getStrandCounts( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); }
if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); }
final int[] table = new int[vc.getNAlleles()*2];
for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
for (final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
final GATKSAMRecord read = el.getKey();
if (mostLikelyAllele.isInformative())
updateTable(table, vc.getAlleleIndex(mostLikelyAllele.getAlleleIfInformative()), read);
}
}
return table;
}
private void updateTable(final int[] table, final int alleleIndex, final GATKSAMRecord read) {
if (alleleIndex < 0 || (alleleIndex+1)*2 > table.length) return;
final int offset = alleleIndex * 2;
//Unstranded reads are not meaningful for this annotation, they can be found in the AD annotation
if (!read.isStrandless()) {
final boolean isFW = !read.getReadNegativeStrandFlag();
table[offset + (isFW ? 0 : 1)]++;
}
}
}

View File

@ -51,9 +51,10 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
@ -61,14 +62,15 @@ import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineType;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
/**
* Number of forward and reverse reads that support REF and ALT alleles
*
* <p>Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandBiasBySample annotation is produces read counts per allele and per strand that are used by other annotation modules (FisherStrand and StrandOddsRatio) to estimate strand bias using statistical approaches.
* <p>Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandBiasBySample annotation produces read counts per allele and per strand that are used by other annotation modules (FisherStrand and StrandOddsRatio) to estimate strand bias using statistical approaches.
*
* <p>This annotation produces 4 values, corresponding to the number of reads that support the following (in that order):</p>
* <ul>
@ -82,6 +84,11 @@ import java.util.*;
* <pre>GT:AD:GQ:PL:SB 0/1:53,51:99:1758,0,1835:23,30,33,18</pre>
* <p>In this example, the reference allele is supported by 23 forward reads and 30 reverse reads, the alternate allele is supported by 33 forward reads and 18 reverse reads.</p>
*
* <h3>Caveats</h3>
* <ul>
* <li>This annotation can only be generated by HaplotypeCaller (it will not work when called from VariantAnnotator).</li>
* </ul>
*
* <h3>Related annotations</h3>
* <ul>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_FisherStrand.php">FisherStrand</a></b> uses Fisher's Exact Test to evaluate strand bias.</li>
@ -91,8 +98,8 @@ import java.util.*;
public class StrandBiasBySample extends GenotypeAnnotation {
public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB";
private final static Logger logger = Logger.getLogger(StrandBiasBySample.class);
boolean[] warningsLogged = new boolean[4];
@Override
public void annotate(final RefMetaDataTracker tracker,
@ -103,21 +110,22 @@ public class StrandBiasBySample extends GenotypeAnnotation {
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( ! isAppropriateInput(alleleLikelihoodMap, g) )
if (!AnnotationUtils.isAppropriateInput(walker, alleleLikelihoodMap, g, warningsLogged, logger)) {
return;
}
final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc, 0);
gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table));
gb.attribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY, FisherStrand.getContingencyArray(table));
}
@Override
public List<String> getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); }
public List<String> getKeyNames() {
return Collections.singletonList(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY);
}
@Override
public List<VCFFormatHeaderLine> getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); }
private boolean isAppropriateInput(final PerReadAlleleLikelihoodMap map, final Genotype g) {
return ! (map == null || g == null || !g.isCalled());
}
public List<VCFFormatHeaderLine> getDescriptions() {
return Collections.singletonList(GATKVCFHeaderLines.getFormatLine(getKeyNames().get(0))); }
}

View File

@ -53,14 +53,14 @@ package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import htsjdk.variant.variantcontext.Genotype;
@ -70,6 +70,7 @@ import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import java.util.*;
@ -78,34 +79,33 @@ import java.util.*;
*/
public abstract class StrandBiasTest extends InfoFieldAnnotation {
private final static Logger logger = Logger.getLogger(StrandBiasTest.class);
private static boolean stratifiedPerReadAlleleLikelihoodMapWarningLogged = false;
private static boolean inputVariantContextWarningLogged = false;
private static boolean getTableFromSamplesWarningLogged = false;
private static boolean decodeSBBSWarningLogged = false;
protected static final int ARRAY_DIM = 2;
protected static final int ARRAY_SIZE = ARRAY_DIM * ARRAY_DIM;
@Override
public void initialize(final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set<VCFHeaderLine> headerLines) {
boolean hasSBBSannotation = false;
// Does the VCF header contain strand bias (SB) by sample annotation?
for ( final VCFHeaderLine line : headerLines) {
if ( line instanceof VCFFormatHeaderLine) {
final VCFFormatHeaderLine formatline = (VCFFormatHeaderLine)line;
if ( formatline.getID().equals(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) {
hasSBBSannotation = true;
break;
if ( formatline.getID().equals(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY) ) {
logger.warn("StrandBiasBySample annotation exists in input VCF header. Attempting to use StrandBiasBySample " +
"values to calculate strand bias annotation values. If no sample has the SB genotype annotation, annotation may still fail.");
return;
}
}
}
if (hasSBBSannotation) {
logger.info("StrandBiasBySample annotation exists in input VCF header. Attempting to use StrandBiasBySample " +
"values to calculate strand bias annotation values. If no sample has the SB genotype annotation, annotation may still fail.");
return;
}
boolean hasReads = toolkit.getReadsDataSource().getReaderIDs().size() > 0;
if (hasReads) {
// Are there reads from a SAM/BAM file?
if (toolkit.getReadsDataSource().getReaderIDs().isEmpty())
logger.warn("No StrandBiasBySample annotation or read data was found. Strand bias annotations will not be output.");
else
logger.info("SAM/BAM data was found. Attempting to use read data to calculate strand bias annotations values.");
return;
}
logger.info(new String("No StrandBiasBySample annotation or read data was found. Strand bias annotations will not be output."));
}
@Override
@ -116,34 +116,37 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
final Map<String,AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
// do not process if not a variant
if ( !vc.isVariant() )
return null;
// if the genotype and strand bias are provided, calculate the annotation from the Genotype (GT) field
if ( vc.hasGenotypes() ) {
boolean hasSB = false;
for (final Genotype g : vc.getGenotypes()) {
if (g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME)) {
hasSB = true;
break;
if (g.hasAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)) {
return calculateAnnotationFromGTfield(vc.getGenotypes());
}
}
if (hasSB)
return calculateAnnotationFromGTfield(vc.getGenotypes());
}
//stratifiedContexts can come come from VariantAnnotator, but will be size 0 if no reads were provided
if (vc.isSNP() && stratifiedContexts != null && stratifiedContexts.size() > 0) {
// if a the variant is a snp and has stratified contexts, calculate the annotation from the stratified contexts
//stratifiedContexts can come come from VariantAnnotator, but will be empty if no reads were provided
if (vc.isSNP() && stratifiedContexts != null && !stratifiedContexts.isEmpty()) {
return calculateAnnotationFromStratifiedContexts(stratifiedContexts, vc);
}
//stratifiedPerReadAllelelikelihoodMap can come from HaplotypeCaller call to VariantAnnotatorEngine
// calculate the annotation from the stratified per read likelihood map
// stratifiedPerReadAllelelikelihoodMap can come from HaplotypeCaller call to VariantAnnotatorEngine
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
return calculateAnnotationFromLikelihoodMap(stratifiedPerReadAlleleLikelihoodMap, vc);
}
else
// for non-snp variants, we need per-read likelihoods.
else {
// for non-snp variants, we need per-read likelihoods.
// for snps, we can get same result from simple pileup
// for indels that do not have a computed strand bias (SB) or strand bias by sample (SBBS)
return null;
}
}
protected abstract Map<String, Object> calculateAnnotationFromGTfield(final GenotypesContext genotypes);
@ -162,17 +165,23 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
* @return the table used for several strand bias tests, will be null if none of the genotypes contain the per-sample SB annotation
*/
protected int[][] getTableFromSamples( final GenotypesContext genotypes, final int minCount ) {
if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); }
if( genotypes == null ) {
if ( !getTableFromSamplesWarningLogged ) {
logger.warn("Genotypes cannot be null.");
getTableFromSamplesWarningLogged = true;
}
return null;
}
final int[] sbArray = {0,0,0,0}; // reference-forward-reverse -by- alternate-forward-reverse
boolean foundData = false;
for( final Genotype g : genotypes ) {
if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) )
if( g.isNoCall() || ! g.hasAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY) )
continue;
foundData = true;
final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME);
final String sbbsString = (String) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY);
final int[] data = encodeSBBS(sbbsString);
if ( passesMinimumThreshold(data, minCount) ) {
for( int index = 0; index < sbArray.length; index++ ) {
@ -193,13 +202,13 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
*/
protected static int[][] getSNPContingencyTable(final Map<String, AlignmentContext> stratifiedContexts,
final Allele ref,
final Allele alt,
final List<Allele> allAlts,
final int minQScoreToConsider,
final int minCount ) {
int[][] table = new int[2][2];
int[][] table = new int[ARRAY_DIM][ARRAY_DIM];
for (final Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
final int[] myTable = new int[4];
final int[] myTable = new int[ARRAY_SIZE];
for (final PileupElement p : sample.getValue().getBasePileup()) {
if ( ! isUsableBase(p) ) // ignore deletions and bad MQ
@ -208,11 +217,13 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
continue;
updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt);
updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, allAlts);
}
if ( passesMinimumThreshold( myTable, minCount ) )
if ( passesMinimumThreshold( myTable, minCount ) ) {
copyToMainTable(myTable, table);
}
}
return table;
@ -228,19 +239,32 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
public static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap,
final VariantContext vc,
final int minCount) {
if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); }
if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); }
if( stratifiedPerReadAlleleLikelihoodMap == null ) {
if ( !stratifiedPerReadAlleleLikelihoodMapWarningLogged ) {
logger.warn("stratifiedPerReadAlleleLikelihoodMap cannot be null");
stratifiedPerReadAlleleLikelihoodMapWarningLogged = true;
}
return null;
}
if( vc == null ) {
if ( !inputVariantContextWarningLogged ) {
logger.warn("input vc cannot be null");
inputVariantContextWarningLogged = true;
}
return null;
}
final Allele ref = vc.getReference();
final Allele alt = vc.getAltAlleleWithHighestAlleleCount();
final int[][] table = new int[2][2];
final List<Allele> allAlts = vc.getAlternateAlleles();
final int[][] table = new int[ARRAY_DIM][ARRAY_DIM];
for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
final int[] myTable = new int[4];
final int[] myTable = new int[ARRAY_SIZE];
for (final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
final GATKSAMRecord read = el.getKey();
updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt);
updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, allAlts);
}
if ( passesMinimumThreshold(myTable, minCount) )
copyToMainTable(myTable, table);
@ -277,13 +301,14 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE);
}
private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt) {
private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final List<Allele> allAlts) {
final boolean matchesRef = allele.equals(ref, true);
final boolean matchesAlt = allele.equals(alt, true);
final boolean matchesAlt = allele.equals(allAlts.get(0), true);
final boolean matchesAnyAlt = allAlts.contains(allele);
if ( matchesRef || matchesAlt ) {
final int offset = matchesRef ? 0 : 2;
if ( matchesRef || matchesAnyAlt ) {
final int offset = matchesRef ? 0 : ARRAY_DIM;
if ( read.isStrandless() ) {
// a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1
@ -303,7 +328,7 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
* Does this strand data array pass the minimum threshold for inclusion?
*
* @param data the array
* @minCount The minimum threshold of counts in the array
* @param minCount The minimum threshold of counts in the array
* @return true if it passes the minimum threshold, false otherwise
*/
protected static boolean passesMinimumThreshold(final int[] data, final int minCount) {
@ -317,9 +342,9 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
* @return the array used by the per-sample Strand Bias annotation
*/
private static int[] encodeSBBS( final String string ) {
final int[] array = new int[4];
final int[] array = new int[ARRAY_SIZE];
final StringTokenizer tokenizer = new StringTokenizer(string, ",", false);
for( int index = 0; index < 4; index++ ) {
for( int index = 0; index < ARRAY_SIZE; index++ ) {
array[index] = Integer.parseInt(tokenizer.nextToken());
}
return array;
@ -331,8 +356,14 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
* @return the table used by the StrandOddsRatio annotation
*/
private static int[][] decodeSBBS( final int[] array ) {
if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); }
final int[][] table = new int[2][2];
if(array.length != ARRAY_SIZE) {
if ( !decodeSBBSWarningLogged ) {
logger.warn("Expecting a length = " + ARRAY_SIZE + " strand bias array.");
decodeSBBSWarningLogged = true;
}
return null;
}
final int[][] table = new int[ARRAY_DIM][ARRAY_DIM];
table[0][0] = array[0];
table[0][1] = array[1];
table[1][0] = array[2];

View File

@ -51,18 +51,15 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
@ -72,11 +69,11 @@ import java.util.*;
* <p>Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandOddsRatio annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.</p>
*
* <h3>Statistical notes</h3>
* <p> Odds Ratios in the 2x2 contingency table below are
* <p> Odds Ratios in the 2x2 contingency table below are</p>
*
* $$ R = \frac{X[0][0] * X[1][1]}{X[0][1] * X[1][0]} $$
*
* and its inverse:
* <p>and its inverse:</p>
*
* <table>
* <tr><td>&nbsp;</td><td>+ strand </td><td>- strand</td></tr>
@ -84,12 +81,16 @@ import java.util.*;
* <tr><td>ALT;</td><td>X[1][0]</td><td>X[1][1]</td></tr>
* </table>
*
* The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where
* <p>The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where</p>
*
* $$ refRatio = \frac{max(X[0][0], X[0][1])}{min(X[0][0], X[0][1} $$
* and
*
* <p>and </p>
*
* $$ altRatio = \frac{max(X[1][0], X[1][1])}{min(X[1][0], X[1][1]} $$
* ensures that the annotation value is large only.
* </p>
*
* <p>ensures that the annotation value is large only. </p>
*
* <p>See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this statistical test.</p>
*
* <h3>Related annotations</h3>
@ -103,8 +104,6 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio
private final static double AUGMENTATION_CONSTANT = 1.0;
private static final int MIN_COUNT = 0;
private static final String SOR = "SOR";
@Override
protected Map<String, Object> calculateAnnotationFromGTfield(GenotypesContext genotypes){
final int[][] tableFromPerSampleAnnotations = getTableFromSamples( genotypes, MIN_COUNT );
@ -118,7 +117,7 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio
@Override
protected Map<String, Object> calculateAnnotationFromStratifiedContexts(Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc){
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1, MIN_COUNT);
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), -1, MIN_COUNT);
final double ratio = calculateSOR(tableNoFiltering);
return annotationForOneTable(ratio);
}
@ -166,9 +165,9 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio
* @return the augmented table
*/
private static double[][] augmentContingencyTable(final int[][] table) {
double[][] augmentedTable = new double[2][2];
for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2; j++ )
double[][] augmentedTable = new double[ARRAY_DIM][ARRAY_DIM];
for ( int i = 0; i < ARRAY_DIM; i++ ) {
for ( int j = 0; j < ARRAY_DIM; j++ )
augmentedTable[i][j] = table[i][j] + AUGMENTATION_CONSTANT;
}
@ -183,16 +182,16 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio
*/
protected Map<String, Object> annotationForOneTable(final double ratio) {
final Object value = String.format("%.3f", ratio);
return Collections.singletonMap(SOR, value);
return Collections.singletonMap(getKeyNames().get(0), value);
}
@Override
public List<VCFInfoHeaderLine> getDescriptions() {
return Collections.singletonList(new VCFInfoHeaderLine(SOR, 1, VCFHeaderLineType.Float, "Symmetric Odds Ratio of 2x2 contingency table to detect strand bias"));
return Collections.singletonList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
}
@Override
public List<String> getKeyNames() {
return Collections.singletonList(SOR);
return Collections.singletonList(GATKVCFConstants.STRAND_ODDS_RATIO_KEY);
}
}

View File

@ -51,24 +51,23 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardUGAnnotation;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.gatk.utils.collections.Pair;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
* Tandem repeat unit composition and counts per allele
@ -77,50 +76,48 @@ import java.util.Map;
*
* <p>A tandem repeat unit is composed of one or more nucleotides that are repeated multiple times in series. Repetitive sequences are difficult to map to the reference because they are associated with multiple alignment possibilities. Knowing the number of repeat units in a set of tandem repeats tells you the number of different positions the tandem repeat can be placed in. The observation of many tandem repeat units multiplies the number of possible representations that can be made of the region.
*
* <h3>Caveats</h3>
* <h3>Caveat</h3>
* <ul>
* <li>This annotation is currently not compatible with HaplotypeCaller.</li>
* </ul>
*
*/
public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardAnnotation {
private static final String STR_PRESENT = "STR";
private static final String REPEAT_UNIT_KEY = "RU";
private static final String REPEATS_PER_ALLELE_KEY = "RPA";
public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardUGAnnotation, ActiveRegionBasedAnnotation {
private final static Logger logger = Logger.getLogger(TandemRepeatAnnotator.class);
private boolean walkerIdentityCheckWarningLogged = false;
@Override
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) throws UserException {
if ( !vc.isIndel())
return null;
Pair<List<Integer>,byte[]> result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, ref.getForwardBases());
final Pair<List<Integer>,byte[]> result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, ref.getForwardBases());
if (result == null)
return null;
byte[] repeatUnit = result.second;
List<Integer> numUnits = result.first;
final byte[] repeatUnit = result.second;
final List<Integer> numUnits = result.first;
Map<String, Object> map = new HashMap<String, Object>();
map.put(STR_PRESENT,true);
map.put(REPEAT_UNIT_KEY,new String(repeatUnit));
map.put(REPEATS_PER_ALLELE_KEY, numUnits);
final Map<String, Object> map = new HashMap<>();
map.put(GATKVCFConstants.STR_PRESENT_KEY, true);
map.put(GATKVCFConstants.REPEAT_UNIT_KEY, new String(repeatUnit));
map.put(GATKVCFConstants.REPEATS_PER_ALLELE_KEY, numUnits);
return map;
}
protected static final String[] keyNames = {STR_PRESENT, REPEAT_UNIT_KEY,REPEATS_PER_ALLELE_KEY };
protected static final VCFInfoHeaderLine[] descriptions = {
new VCFInfoHeaderLine(STR_PRESENT, 0, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"),
new VCFInfoHeaderLine(REPEAT_UNIT_KEY, 1, VCFHeaderLineType.String, "Tandem repeat unit (bases)"),
new VCFInfoHeaderLine(REPEATS_PER_ALLELE_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") };
@Override
public List<String> getKeyNames() {
return Arrays.asList(keyNames);
return Arrays.asList(
GATKVCFConstants.STR_PRESENT_KEY,
GATKVCFConstants.REPEAT_UNIT_KEY,
GATKVCFConstants.REPEATS_PER_ALLELE_KEY);
}
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(descriptions); }
}

View File

@ -51,21 +51,20 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.samples.Sample;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.MathUtils;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
@ -77,8 +76,6 @@ import java.util.*;
* <h3>Statistical notes</h3>
* <p>The calculation is based on the derivation described in <a href="http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT">http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT</a>.</p>
*
* <p>Note that this annotation requires a valid ped file.</p>
*
* <h3>Caveat</h3>
* <ul>
* <li>This annotation requires a valid pedigree file.</li>
@ -88,26 +85,46 @@ import java.util.*;
*/
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements RodRequiringAnnotation {
private final static Logger logger = Logger.getLogger(TransmissionDisequilibriumTest.class);
private Set<Sample> trios = null;
private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information
private boolean walkerIdentityCheckWarningLogged = false;
private boolean pedigreeCheckWarningLogged = false;
@Override
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap){
// Can only be called from VariantAnnotator
if ( !(walker instanceof VariantAnnotator) ) {
if ( !walkerIdentityCheckWarningLogged ) {
if ( walker != null )
logger.warn("Annotation will not be calculated, must be called from VariantAnnotator, not " + walker.getClass().getName());
else
logger.warn("Annotation will not be calculated, must be called from VariantAnnotator");
walkerIdentityCheckWarningLogged = true;
}
return null;
}
// Get trios from the input pedigree file.
if ( trios == null ) {
if ( walker instanceof VariantAnnotator ) {
trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents();
} else {
throw new UserException("Transmission disequilibrium test annotation can only be used from the Variant Annotator and requires a valid ped file be passed in.");
trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents();
if (trios == null || trios.isEmpty()) {
if ( !pedigreeCheckWarningLogged ) {
logger.warn("Transmission disequilibrium test annotation requires a valid ped file be passed in.");
pedigreeCheckWarningLogged = true;
}
return null;
}
}
final Map<String, Object> toRet = new HashMap<String, Object>(1);
final HashSet<Sample> triosToTest = new HashSet<Sample>();
final Map<String, Object> toRet = new HashMap<>(1);
final HashSet<Sample> triosToTest = new HashSet<>();
for( final Sample child : trios ) {
final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() &&
@ -126,14 +143,16 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
}
// return the descriptions used for the VCF INFO meta field
public List<String> getKeyNames() { return Arrays.asList("TDT"); }
@Override
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.TRANSMISSION_DISEQUILIBRIUM_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Test statistic from Wittkowski transmission disequilibrium test.")); }
@Override
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
// Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
private List<Double> calculateTDT( final VariantContext vc, final Set<Sample> triosToTest ) {
List<Double> pairwiseTDTs = new ArrayList<Double>(10);
List<Double> pairwiseTDTs = new ArrayList<>(10);
final int HomRefIndex = 0;
// for each pair of alleles, add the likelihoods

View File

@ -51,16 +51,17 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.IndelUtils;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
@ -100,13 +101,13 @@ public class VariantType extends InfoFieldAnnotation {
}
}
Map<String, Object> map = new HashMap<String, Object>();
Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), String.format("%s", type));
return map;
}
public List<String> getKeyNames() { return Arrays.asList("VariantType"); }
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.VARIANT_TYPE_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("VariantType", 1, VCFHeaderLineType.String, "Variant type description")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); }
}

View File

@ -56,17 +56,17 @@ import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.Input;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.recalibration.RecalUtils;
import org.broadinstitute.gatk.utils.recalibration.RecalibrationReport;
import org.broadinstitute.gatk.utils.recalibration.BaseRecalibration;
import org.broadinstitute.gatk.engine.recalibration.RecalUtils;
import org.broadinstitute.gatk.engine.recalibration.RecalibrationReport;
import org.broadinstitute.gatk.engine.recalibration.BaseRecalibration;
import java.io.File;
import java.io.FileNotFoundException;
@ -76,28 +76,31 @@ import java.util.Map;
/**
* Tool to analyze and evaluate base recalibration tables.
* Create plots to visualize base recalibration results
*
* <p/>
* It generates plots to assess the quality of a recalibration run.
* This tool generates plots for visualizing the quality of a recalibration run.
* </p>
*
* <h3>Input</h3>
*
* The tool can take up to three different sets of recalibration tables.
* <p>The tool can take up to three different sets of recalibration tables.
* The resulting plots will be overlaid on top of each other to make
* comparisons easy.
* comparisons easy.</p>
*
* <br/>
* <table style="text-align: left">
* <thead>
* <tr><th>Set</th><th>Argument</th><th>Label</th><th>Color</th><th>Description</th></tr>
* </thead>
* <tbody>
* <tr><td>Original</td><td>-before</td><td>BEFORE</td><td style="color: #ff34b3">Maroon1</td>
* <tr><td>Original</td><td>-before</td><td>BEFORE</td><td style="color: #ff34b3">Pink</td>
* <td>First pass recalibration
* tables obtained from applying {@link BaseRecalibration}
* tables obtained from applying BaseRecalibration
* on the original alignment.</td></tr>
* <tr><td>Recalibrated</td><td>-after</td><td>AFTER</td><td style="color: #0000ff">Blue</td>
* <td>Second pass recalibration tables
* results from the application of {@link BaseRecalibration}
* results from the application of BaseRecalibration
* on the alignment recalibrated using the first pass tables</td></tr>
* <tr><td>Input</td><td>-BQSR</td><td>BQSR</td><td style="color: #000000">Black</td>
* <td>Any recalibration table without a specific role</td></tr>
@ -105,32 +108,19 @@ import java.util.Map;
* </table>
* <br/>
*
* You need to specify one set at least. Multiple sets need to have the same values for the following parameters:
* <p>You need to specify at least one set. Multiple sets need to have the same values for the following parameters:
* <br/></br>
* <i>covariate (order is not important), no_standard_covs, run_without_dbsnp, solid_recal_mode,
* solid_nocall_strategy, mismatches_context_size, mismatches_default_quality, deletions_default_quality,
* insertions_default_quality, maximum_cycle_value, low_quality_tail, default_platform, force_platform,
* quantizing_levels</i> and <i>binary_tag_name</i>
* </p>
*
* <h3>Output</h3>
*
* Currently this tool generates two outputs:
* <p>A pdf document with plots that show the quality of the recalibration, and an optional csv file that contains a table with all the data required to generate those plots.</p>
*
* <dl>
* <dt style="font-weight: normal">-plots <i>my-report.pdf</i></dt>
* <dd>A pdf document that encloses plots to assess the quality of the recalibration.</dd>
* <dt style="font-weight: normal">-csv <i>my-report.csv</i></dt>
* <dd>A csv file that contains a table with all the data required to generate those plots.</dd>
* </dl>
*
* You need to specify at least one of them.
*
* <h3>Other Arguments</h3>
*
* <h4>-ignoreLMT, --ignoreLastModificationTimes</h4>
*
* when set, no warning message will be displayed in the -before recalibration table file is older than the -after one.
*
* <h3>Examples</h3>
* <h3>Usage examples</h3>
*
*
* <h4>Plot a single recalibration table</h4>
@ -142,7 +132,7 @@ import java.util.Map;
* -plots BQSR.pdf
* </pre>
*
* <h4>Plot before (first pass) and after (second pass) recalibration table to compare them</h4>
* <h4>Plot before (first pass) and after (second pass) recalibration tables to compare them</h4>
*
* <pre>
* java -jar GenomeAnalysisTK.jar \
@ -157,8 +147,8 @@ import java.util.Map;
*
* <pre>
*
* # You can ignore the before/after semantics completely if you like (if you do add -ignoreLMT
* # to avoid a possible warning), but all tables should have been generated using the same parameters.
* # You can ignore the before/after semantics completely if you like (if you do, add -ignoreLMT
* # to avoid a possible warning), but all tables must have been generated using the same parameters.
*
* java -jar GenomeAnalysisTK.jar \
* -T AnalyzeCovariates \
@ -173,31 +163,29 @@ import java.util.Map;
* <h4>Full BQSR quality assessment pipeline</h4>
*
* <pre>
* # Generate the first pass recalibration table file.
* # Generate the first pass recalibration table file
* java -jar GenomeAnalysisTK.jar \
* -T BaseRecalibrator \
* -R myreference.fasta \
* -R reference.fasta \
* -I myinput.bam \
* -knownSites bundle/my-trusted-snps.vcf \ # optional but recommendable
* -knownSites bundle/my-trusted-indels.vcf \ # optional but recommendable
* ... other options
* -knownSites bundle/my-trusted-snps.vcf \ # optional but recommended
* -knownSites bundle/my-trusted-indels.vcf \ # optional but recommended
* -o firstpass.table
*
* # Generate the second pass recalibration table file.
* # Generate the second pass recalibration table file
* java -jar GenomeAnalysisTK.jar \
* -T BaseRecalibrator \
* -BQSR firstpass.table \
* -R myreference.fasta \
* -R reference.fasta \
* -I myinput.bam \
* -knownSites bundle/my-trusted-snps.vcf \
* -knownSites bundle/my-trusted-indels.vcf \
* ... other options \
* -BQSR firstpass.table \
* -o secondpass.table
*
* # Finally generate the plots and also keep a copy of the csv (optional).
* # Finally generate the plots and also keep a copy of the csv (optional)
* java -jar GenomeAnalysisTK.jar \
* -T AnalyzeCovariates \
* -R myrefernce.fasta \
* -R reference.fasta \
* -before firstpass.table \
* -after secondpass.table \
* -csv BQSR.csv \ # optional
@ -251,14 +239,14 @@ public final class AnalyzeCovariates extends RodWalker<AnalyzeCovariates.None,An
/**
* Convenience reference to the RECAL_BQSR_FILE argument value.
* <p/>
*
* This field value is resolved by {@link #initialize()}.
*/
protected File bqsrFile = null;
/**
* Checks inputs and argument values.
* <p/>
*
* Notice that this routine will not validate the content of files. It may have some minor side effects as
* the output of warning messages back to the user.
*
@ -370,7 +358,6 @@ public final class AnalyzeCovariates extends RodWalker<AnalyzeCovariates.None,An
/**
* Generates the plots using the external R script.
*
* <p/>
* If <code>plotsFile</code> is <code>null</code>, it does not perform any plotting.
*
* @param csvFile the intermediary csv file.
@ -453,9 +440,9 @@ public final class AnalyzeCovariates extends RodWalker<AnalyzeCovariates.None,An
/**
* Creates a map with all input recalibration files indexed by their "role".
* <p/>
*
* The key is the role and the value the corresponding report file.
* <p/>
*
* Roles: "Before" (recalibration), "After" (recalibration), "BQSR" (the tool standard argument recalibration file)
*
* @return never <code>null</code>
@ -523,7 +510,7 @@ public final class AnalyzeCovariates extends RodWalker<AnalyzeCovariates.None,An
/**
* Returns the csv file to use.
* <p/>
*
* This is the the one specified by the user if any or a temporary file
* that will be deleted as soon as the VM exists by default.
*

View File

@ -1,139 +0,0 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.bqsr;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.commandline.Gatherer;
import org.broadinstitute.gatk.engine.report.GATKReport;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.recalibration.RecalibrationReport;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
* User: carneiro
* Date: 3/29/11
*/
public class BQSRGatherer extends Gatherer {
private static final Logger logger = Logger.getLogger(BQSRGatherer.class);
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file";
private static final String MISSING_OUTPUT_FILE = "missing output file name";
private static final String MISSING_READ_GROUPS = "Missing read group(s)";
@Override
public void gather(final List<File> inputs, final File output) {
final PrintStream outputFile;
try {
outputFile = new PrintStream(output);
} catch(FileNotFoundException e) {
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
}
final GATKReport report = gatherReport(inputs);
report.print(outputFile);
}
/**
* Gathers the input recalibration reports into a single report.
*
* @param inputs Input recalibration GATK reports
* @return gathered recalibration GATK report
*/
public static GATKReport gatherReport(final List<File> inputs) {
final SortedSet<String> allReadGroups = new TreeSet<String>();
final LinkedHashMap<File, Set<String>> inputReadGroups = new LinkedHashMap<File, Set<String>>();
// Get the read groups from each input report
for (final File input : inputs) {
final Set<String> readGroups = RecalibrationReport.getReadGroups(input);
inputReadGroups.put(input, readGroups);
allReadGroups.addAll(readGroups);
}
// Log the read groups that are missing from specific inputs
for (Map.Entry<File, Set<String>> entry: inputReadGroups.entrySet()) {
final File input = entry.getKey();
final Set<String> readGroups = entry.getValue();
if (allReadGroups.size() != readGroups.size()) {
// Since this is not completely unexpected, more than debug, but less than a proper warning.
logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath());
for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) {
logger.info(" " + readGroup);
}
}
}
RecalibrationReport generalReport = null;
for (File input : inputs) {
final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups);
if( inputReport.isEmpty() ) { continue; }
if (generalReport == null)
generalReport = inputReport;
else
generalReport.combine(inputReport);
}
if (generalReport == null)
throw new ReviewedGATKException(EMPTY_INPUT_LIST);
generalReport.calculateQuantizedQualities();
return generalReport.createGATKReport();
}
}

View File

@ -55,15 +55,16 @@ import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.engine.recalibration.*;
import org.broadinstitute.gatk.engine.walkers.*;
import org.broadinstitute.gatk.utils.commandline.Advanced;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.filters.*;
import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.baq.BAQ;
@ -74,7 +75,7 @@ import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.recalibration.*;
import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate;
import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.ReadUtils;
@ -85,49 +86,49 @@ import java.util.Arrays;
import java.util.List;
/**
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
* Generate base recalibration table to compensate for systematic errors
*
* <p>
* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
* This tool is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
* only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
* of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
* reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical
* of poor base quality. This tool generates tables based on various user-specified covariates (such as read group,
* reported quality score, cycle, and context). Since there is a large amount of data, one can then calculate an empirical
* probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
* The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score).
* </p>
* <p>
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
*
* <p>
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added regardless of whether
* or not they were specified.
* </p>
*
* <h3>Input</h3>
* <p>
* The input read data whose base quality scores need to be assessed.
* A BAM file containing data that needs to be recalibrated.
* <p>
* A database of known polymorphic sites to skip over.
* A database of known polymorphic sites to mask out.
* </p>
*
* <h3>Output</h3>
* <p>
* A GATK Report file with many tables:
* <ol>
* <p>A GATKReport file with many tables:</p>
* <ul>
* <li>The list of arguments</li>
* <li>The quantized qualities table</li>
* <li>The recalibration table by read group</li>
* <li>The recalibration table by quality score</li>
* <li>The recalibration table for all the optional covariates</li>
* </ol>
*
* The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table.
* </ul>
*<p>
* The GATKReport table format is intended to be easy to read by both humans and computer languages (especially R).
* Check out the documentation of the GATKReport (in the FAQs) to learn how to manipulate this table.
* </p>
*
* <h3>Examples</h3>
* <h3>Usage example</h3>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* java -jar GenomeAnalysisTK.jar \
* -T BaseRecalibrator \
* -R reference.fasta \
* -I my_reads.bam \
* -R resources/Homo_sapiens_assembly18.fasta \
* -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
* -knownSites another/optional/setOfSitesToMask.vcf \
* -knownSites latest_dbsnp.vcf \
* -o recal_data.table
* </pre>
*/
@ -138,16 +139,16 @@ import java.util.List;
@PartitionBy(PartitionType.READ)
public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSchedulable {
/**
* all the command line arguments for BQSR and it's covariates
* all the command line arguments for BQSR and its covariates
*/
@ArgumentCollection
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
/**
* When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency
* When you use nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency
* purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag
* to safely access only one table. There may be some CPU cost, but as long as the table is really big
* there should be relatively little CPU costs.
* the cost should be relatively reasonable.
*/
@Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false)
public boolean lowMemoryMode = false;
@ -170,7 +171,7 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
private int minimumQToUse;
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.";
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to mask out known variant sites. Please provide a VCF file containing known sites of genetic variation.";
private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
@ -312,6 +313,12 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
final boolean[] knownSites = new boolean[readLength];
Arrays.fill(knownSites, false);
for( final Feature f : features ) {
if ((f.getStart() < read.getSoftStart() && f.getEnd() < read.getSoftStart()) ||
(f.getStart() > read.getSoftEnd() && f.getEnd() > read.getSoftEnd())) {
// feature is outside clipping window for the read, ignore
continue;
}
int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
featureStartOnRead = 0;

View File

@ -55,7 +55,7 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.recalibration.EventType;
import org.broadinstitute.gatk.utils.recalibration.ReadCovariates;
import org.broadinstitute.gatk.engine.recalibration.ReadCovariates;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
/**

View File

@ -1,420 +0,0 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.bqsr;
import com.google.java.contract.Requires;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.report.GATKReportTable;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import org.broadinstitute.gatk.utils.recalibration.RecalUtils;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 27, 2009
*
* A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker.
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
*/
public class RecalibrationArgumentCollection implements Cloneable {
/**
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
* so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
* Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
*/
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
/**
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out.
*/
@Gather(BQSRGatherer.class)
@Output(doc = "The output recalibration table file to create", required = true)
public File RECAL_TABLE_FILE = null;
public PrintStream RECAL_TABLE;
/**
* Note that the --list argument requires a fully resolved and correct command-line to work.
*/
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
public boolean LIST_ONLY = false;
/**
* Note that the ReadGroup and QualityScore covariates are required and do not need to be specified.
* Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default.
* Use the --list argument to see the available covariates.
*/
@Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false)
public String[] COVARIATES = null;
/*
* The Cycle and Context covariates are standard and are included by default unless this argument is provided.
* Note that the ReadGroup and QualityScore covariates are required and cannot be excluded.
*/
@Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false)
public boolean DO_NOT_USE_STANDARD_COVARIATES = false;
/**
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
*/
@Advanced
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
public boolean RUN_WITHOUT_DBSNP = false;
/**
* BaseRecalibrator accepts a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
*/
@Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO;
/**
* BaseRecalibrator accepts a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
* no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
* their color space tag can not be recalibrated.
*/
@Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
/**
* The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
*/
@Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false)
public int MISMATCHES_CONTEXT_SIZE = 2;
/**
* The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
*/
@Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false)
public int INDELS_CONTEXT_SIZE = 3;
/**
* The cycle covariate will generate an error if it encounters a cycle greater than this value.
* This argument is ignored if the Cycle covariate is not used.
*/
@Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false)
public int MAXIMUM_CYCLE_VALUE = 500;
/**
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is off]
*/
@Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
public byte MISMATCHES_DEFAULT_QUALITY = -1;
/**
* A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on]
*/
@Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
public byte INSERTIONS_DEFAULT_QUALITY = 45;
/**
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is on]
*/
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
public byte DELETIONS_DEFAULT_QUALITY = 45;
/**
* Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality
*/
@Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false)
public byte LOW_QUAL_TAIL = 2;
/**
* BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options.
* This parameter tells BQSR the number of levels of quantization to use to build the quantization table.
*/
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
public int QUANTIZING_LEVELS = 16;
/**
* The tag name for the binary tag covariate (if using it)
*/
@Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it")
public String BINARY_TAG_NAME = null;
/*
* whether GATK report tables should have rows in sorted order, starting from leftmost column
*/
@Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false)
public Boolean SORT_BY_ALL_COLUMNS = false;
/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
@Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null;
@Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Argument(fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.")
public String FORCE_READGROUP = null;
@Hidden
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false)
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
/**
* The repeat covariate will use a context of this size to calculate it's covariate value for base insertions and deletions
*/
@Hidden
@Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false)
public int MAX_STR_UNIT_LENGTH = 8;
@Hidden
@Argument(fullName = "max_repeat_length", shortName = "maxrep", doc = "Max number of repetitions to be used for repeat covariates", required = false)
public int MAX_REPEAT_LENGTH = 20;
public File existingRecalibrationReport = null;
public GATKReportTable generateReportTable(final String covariateNames) {
GATKReportTable argumentsTable;
if(SORT_BY_ALL_COLUMNS) {
argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN);
} else {
argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2);
}
argumentsTable.addColumn("Argument");
argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
argumentsTable.addRowID("covariate", true);
argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames);
argumentsTable.addRowID("no_standard_covs", true);
argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES);
argumentsTable.addRowID("run_without_dbsnp", true);
argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP);
argumentsTable.addRowID("solid_recal_mode", true);
argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE);
argumentsTable.addRowID("solid_nocall_strategy", true);
argumentsTable.set("solid_nocall_strategy", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY);
argumentsTable.addRowID("mismatches_context_size", true);
argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE);
argumentsTable.addRowID("indels_context_size", true);
argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE);
argumentsTable.addRowID("mismatches_default_quality", true);
argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY);
argumentsTable.addRowID("deletions_default_quality", true);
argumentsTable.set("deletions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_DEFAULT_QUALITY);
argumentsTable.addRowID("insertions_default_quality", true);
argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY);
argumentsTable.addRowID("maximum_cycle_value", true);
argumentsTable.set("maximum_cycle_value", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MAXIMUM_CYCLE_VALUE);
argumentsTable.addRowID("low_quality_tail", true);
argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL);
argumentsTable.addRowID("default_platform", true);
argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM);
argumentsTable.addRowID("force_platform", true);
argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM);
argumentsTable.addRowID("quantizing_levels", true);
argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
argumentsTable.addRowID("recalibration_report", true);
argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath());
argumentsTable.addRowID("binary_tag_name", true);
argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME);
return argumentsTable;
}
/**
* Returns a map with the arguments that differ between this an
* another {@link RecalibrationArgumentCollection} instance.
* <p/>
* The key is the name of that argument in the report file. The value is a message
* that explains the difference to the end user.
* <p/>
* Thus, a empty map indicates that there is no differences between both argument collection that
* is relevant to report comparison.
* <p/>
* This method should not throw any exception.
*
* @param other the argument-collection to compare against.
* @param thisRole the name used to refer to this RAC report that makes sense to the end user.
* @param otherRole the name used to refer to the other RAC report that makes sense to the end user.
*
* @return never <code>null</code>, but a zero-size collection if there are no differences.
*/
@Requires("other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)")
Map<String,? extends CharSequence> compareReportArguments(final RecalibrationArgumentCollection other,final String thisRole, final String otherRole) {
final Map<String,String> result = new LinkedHashMap<>(15);
compareRequestedCovariates(result, other, thisRole, otherRole);
compareSimpleReportArgument(result,"no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES, other.DO_NOT_USE_STANDARD_COVARIATES, thisRole, otherRole);
compareSimpleReportArgument(result,"run_without_dbsnp",RUN_WITHOUT_DBSNP,other.RUN_WITHOUT_DBSNP,thisRole,otherRole);
compareSimpleReportArgument(result,"solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE,thisRole,otherRole);
compareSimpleReportArgument(result,"solid_nocall_strategy", SOLID_NOCALL_STRATEGY, other.SOLID_NOCALL_STRATEGY,thisRole,otherRole);
compareSimpleReportArgument(result,"mismatches_context_size", MISMATCHES_CONTEXT_SIZE,other.MISMATCHES_CONTEXT_SIZE,thisRole,otherRole);
compareSimpleReportArgument(result,"mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY, other.MISMATCHES_DEFAULT_QUALITY,thisRole,otherRole);
compareSimpleReportArgument(result,"deletions_default_quality", DELETIONS_DEFAULT_QUALITY, other.DELETIONS_DEFAULT_QUALITY,thisRole,otherRole);
compareSimpleReportArgument(result,"insertions_default_quality", INSERTIONS_DEFAULT_QUALITY, other.INSERTIONS_DEFAULT_QUALITY,thisRole,otherRole);
compareSimpleReportArgument(result,"maximum_cycle_value", MAXIMUM_CYCLE_VALUE, other.MAXIMUM_CYCLE_VALUE,thisRole,otherRole);
compareSimpleReportArgument(result,"low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL,thisRole,otherRole);
compareSimpleReportArgument(result,"default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM,thisRole,otherRole);
compareSimpleReportArgument(result,"force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM,thisRole,otherRole);
compareSimpleReportArgument(result,"quantizing_levels", QUANTIZING_LEVELS, other.QUANTIZING_LEVELS,thisRole,otherRole);
compareSimpleReportArgument(result,"binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME,thisRole,otherRole);
return result;
}
/**
* Compares the covariate report lists.
*
* @param diffs map where to annotate the difference.
* @param other the argument collection to compare against.
* @param thisRole the name for this argument collection that makes sense to the user.
* @param otherRole the name for the other argument collection that makes sense to the end user.
*
* @return <code>true</code> if a difference was found.
*/
@Requires("diffs != null && other != null && thisRole != null && otherRole != null")
private boolean compareRequestedCovariates(final Map<String,String> diffs,
final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) {
final Set<String> beforeNames = new HashSet<>(this.COVARIATES.length);
final Set<String> afterNames = new HashSet<>(other.COVARIATES.length);
Utils.addAll(beforeNames, this.COVARIATES);
Utils.addAll(afterNames,other.COVARIATES);
final Set<String> intersect = new HashSet<>(Math.min(beforeNames.size(),afterNames.size()));
intersect.addAll(beforeNames);
intersect.retainAll(afterNames);
String diffMessage = null;
if (intersect.size() == 0) { // In practice this is not possible due to required covariates but...
diffMessage = String.format("There are no common covariates between '%s' and '%s'"
+ " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.",thisRole,otherRole,
thisRole,Utils.join(", ",this.COVARIATES),
otherRole,Utils.join(",",other.COVARIATES));
} else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) {
beforeNames.removeAll(intersect);
afterNames.removeAll(intersect);
diffMessage = String.format("There are differences in the set of covariates requested in the"
+ " '%s' and '%s' recalibrator reports. "
+ " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",thisRole,otherRole,
thisRole,Utils.join(", ",beforeNames),
otherRole,Utils.join(", ",afterNames));
}
if (diffMessage != null) {
diffs.put("covariate",diffMessage);
return true;
} else {
return false;
}
}
/**
* Annotates a map with any difference encountered in a simple value report argument that differs between this an
* another {@link RecalibrationArgumentCollection} instance.
* <p/>
* The key of the new entry would be the name of that argument in the report file. The value is a message
* that explains the difference to the end user.
* <p/>
*
* <p/>
* This method should not return any exception.
*
* @param diffs where to annotate the differences.
* @param name the name of the report argument to compare.
* @param thisValue this argument collection value for that argument.
* @param otherValue the other collection value for that argument.
* @param thisRole the name used to refer to this RAC report that makes sense to the end user.
* @param otherRole the name used to refer to the other RAC report that makes sense to the end user.
*
* @type T the argument Object value type.
*
* @return <code>true</code> if a difference has been spotted, thus <code>diff</code> has been modified.
*/
private <T> boolean compareSimpleReportArgument(final Map<String,String> diffs,
final String name, final T thisValue, final T otherValue, final String thisRole, final String otherRole) {
if (thisValue == null && otherValue == null) {
return false;
} else if (thisValue != null && thisValue.equals(otherValue)) {
return false;
} else {
diffs.put(name,
String.format("differences between '%s' {%s} and '%s' {%s}.",
thisRole,thisValue == null ? "" : thisValue,
otherRole,otherValue == null ? "" : otherValue));
return true;
}
}
/**
* Create a shallow copy of this argument collection.
*
* @return never <code>null</code>.
*/
@Override
public RecalibrationArgumentCollection clone() {
try {
return (RecalibrationArgumentCollection) super.clone();
} catch (CloneNotSupportedException e) {
throw new GATKException("Unreachable code clone not supported thrown when the class "
+ this.getClass().getName() + " is cloneable ",e);
}
}
}

View File

@ -52,9 +52,13 @@
package org.broadinstitute.gatk.tools.walkers.bqsr;
import com.google.java.contract.Requires;
import org.broadinstitute.gatk.engine.recalibration.ReadCovariates;
import org.broadinstitute.gatk.engine.recalibration.RecalDatum;
import org.broadinstitute.gatk.engine.recalibration.RecalUtils;
import org.broadinstitute.gatk.engine.recalibration.RecalibrationTables;
import org.broadinstitute.gatk.utils.collections.NestedIntegerArray;
import org.broadinstitute.gatk.utils.recalibration.*;
import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate;
import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import java.io.PrintStream;

View File

@ -54,10 +54,10 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.report.GATKReport;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.report.GATKReport;
import org.broadinstitute.gatk.engine.walkers.LocusWalker;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@ -71,17 +71,16 @@ import java.util.LinkedList;
import java.util.Map;
/**
* Simple walker to plot the coverage distribution per base
* Evaluate coverage distribution per base
*
* <p>
* Features of this walker:
* <li>includes a smart counting of uncovered bases without visiting the uncovered loci</li>
* <li>includes reads with deletions in the loci (optionally can be turned off)</li>
* This tool reports the distribution of coverage per base. It includes reads with deletions in the counts unless
* otherwise specified. Quality filters can be applied before the coverage is calculated.
* </p>
*
* <h3>Input</h3>
* <p>
* The BAM file and an optional interval list (works for WGS as well)
* The BAM file and an optional interval list
* </p>
*
* <h3>Output</h3>
@ -89,13 +88,13 @@ import java.util.Map;
* A GATK Report with the coverage distribution per base
*
* <p/>
* <h3>Examples</h3>
* <h3>Usage example</h3>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* java -jar GenomeAnalysisTK.jar \
* -R reference.fasta \
* -T BaseCoverageDistribution \
* -I myData.bam \
* -L interesting.intervals \
* -L intervals.list \
* -fd \
* -o report.grp
* </pre>
@ -106,34 +105,34 @@ import java.util.Map;
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
public class BaseCoverageDistribution extends LocusWalker<ArrayList<Integer>, Map<Integer, ArrayList<Long>>> {
/**
* The output GATK Report table
* The name of the file to output the GATK Report table. See the FAQs for more information on the GATK Report format.
*/
@Output(doc = "The output GATK Report table")
@Output(doc = "Output filename")
private PrintStream out;
/**
* Whether or not a deletion should be counted towards the coverage of a site
*/
@Argument(required = false, shortName="del", fullName = "include_deletions", doc ="whether or not to include reads with deletions on the loci in the pileup")
@Argument(required = false, shortName="del", fullName = "include_deletions", doc ="Include reads with deletions")
private boolean includeDeletions = true;
/**
* Whether or not to calculate and output a filtered coverage distribution. Bases will be filtered according to the
* Whether or not to apply quality filters before calculating coverage distribution. Filtering will use the
* minimum_mapping_quality and minimum_base_quality parameters below.
*/
@Argument(required = false, shortName="fd", fullName = "filtered_distribution", doc ="calculate and report the filtered coverage distribution of bases")
@Argument(required = false, shortName="fd", fullName = "filtered_distribution", doc ="Apply quality filters")
private boolean calculateFilteredDistribution = false;
/**
* The minimum mapping quality a read must have to be counted towards the filtered coverage of a site
*/
@Argument(required = false, shortName="mmq", fullName = "minimum_mapping_quality", doc ="minimum mapping quality of a read to include it in the filtered coverage distribution")
@Argument(required = false, shortName="mmq", fullName = "minimum_mapping_quality", doc ="Minimum read mapping quality of a read to pass filters")
private byte minMappingQuality = 20;
/**
* The minimum base quality a base must have to be counted towards the filtered coverage of a site
*/
@Argument(required = false, shortName="mbq", fullName = "minimum_base_quality", doc ="minimum base quality of a base to include it in the filtered coverage distribution")
@Argument(required = false, shortName="mbq", fullName = "minimum_base_quality", doc ="Minimum base quality to pass filters")
private byte minBaseQuality = 17;
private GenomeLoc previousLocus = null;

View File

@ -54,9 +54,9 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.ActiveRegionTraversalParameters;
import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker;
import org.broadinstitute.gatk.engine.walkers.PartitionBy;
@ -69,9 +69,10 @@ import org.broadinstitute.gatk.utils.help.HelpConstants;
import java.io.PrintStream;
/**
* Outputs a list of intervals that are covered above a given threshold.
* Outputs a list of intervals that are covered above a given threshold
*
* <p>The list can be used as an interval list for other walkers. Note that if the -uncovered argument is given, the tool will instead output intervals that fail the coverage threshold.</p>
* <p>The output list can be used as an interval list for other tools. Note that if the -uncovered argument is given, the
* logic will be inverted and the tool will instead output intervals that fail the coverage threshold.</p>
*
* <h3>Input</h3>
* <p>
@ -85,16 +86,16 @@ import java.io.PrintStream;
*
* <h3>Example</h3>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* java -jar GenomeAnalysisTK.jar \
* -T FindCoveredIntervals \
* -R ref.fasta \
* -R reference.fasta \
* -I my_file.bam \
* -o output.list
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.CONTIG)
@PartitionBy(value = PartitionType.CONTIG)
@ActiveRegionTraversalParameters(extension = 0, maxRegion = 50000)
public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
@Output

View File

@ -56,12 +56,11 @@ import org.broadinstitute.gatk.engine.walkers.*;
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.downsampling.DownsampleType;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.downsampling.DownsampleType;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.classloader.PluginManager;
import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
@ -70,49 +69,44 @@ import org.broadinstitute.gatk.utils.help.HelpConstants;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import org.broadinstitute.gatk.utils.sam.ReadUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.io.PrintStream;
import java.util.*;
/**
* Analyzes coverage distribution and validates read mates for a given interval and sample.
* <p/>
* Analyze coverage distribution and validate read mates per interval and per sample
*
* <p>
* Used to diagnose regions with bad coverage, mapping, or read mating. Analyzes each sample independently in addition
* to interval wide analysis.
* This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It analyzes each sample
* independently and aggregates results over intervals of interest.
* </p>
* <p/>
* <p/>
* <h3>Input</h3>
* <p>
* <ul>
* <li>A reference file</li>
* <li>one or more input BAMs</li>
* <li>One or more intervals</li>
* </ul>
* </p>
* <p/>
* <h3>Output</h3>
* <p>
* A modified VCF detailing each interval by sample and information for each interval according to the thresholds used.
* Interval information includes GC Content, average interval depth, callable status among others.
*
* If you use the --missing option, you can get as a second output a intervals file with the loci that have missing data.
* Interval information includes GC Content, average interval depth, callable status among others. If you use the
* --missing option, you can get as a second output a intervals file with the loci that have missing data.
* This file can then be used as input to QualifyMissingIntervals for full qualification and interpretation of why
* the data is missing.
* </p>
* <p/>
* <h3>Examples</h3>
* <h3>Usage example</h3>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* java -jar GenomeAnalysisTK.jar
* -T DiagnoseTargets \
* -R reference.fasta \
* -o output.vcf \
* -I sample1.bam \
* -I sample2.bam \
* -I sample3.bam \
* -L intervals.interval_list
* -L intervals.interval_list \
* -o output.vcf
* </pre>
*
* @author Mauricio Carneiro, Roger Zurawicki
@ -124,12 +118,6 @@ import java.util.*;
@Downsample(by = DownsampleType.NONE)
public class DiagnoseTargets extends LocusWalker<Long, Long> {
private static final String AVG_INTERVAL_DP_KEY = "IDP";
private static final String LOW_COVERAGE_LOCI = "LL";
private static final String ZERO_COVERAGE_LOCI = "ZL";
private static final String GC_CONTENT_KEY = "GC";
@Output(doc = "File to which interval statistics should be written")
private VariantContextWriter vcfWriter = null;
@ -150,11 +138,11 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");
intervalMap = new LinkedHashMap<GenomeLoc, IntervalStratification>(INITIAL_HASH_SIZE);
intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
intervalMap = new LinkedHashMap<>(INITIAL_HASH_SIZE);
intervalListIterator = new PeekableIterator<>(getToolkit().getIntervals().iterator());
// get all of the unique sample names for the VCF Header
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples));
// pre load all the statistics classes because it is costly to operate on the JVM and we only want to do it once.
@ -224,7 +212,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
*/
private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
// output any intervals that were finished
final List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>();
final List<GenomeLoc> toRemove = new LinkedList<>();
for (GenomeLoc key : intervalMap.keySet()) {
if (key.isBefore(refLocus)) {
final IntervalStratification intervalStats = intervalMap.get(key);
@ -263,17 +251,17 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
GenomeLoc interval = stats.getInterval();
final List<Allele> alleles = new ArrayList<Allele>();
final Map<String, Object> attributes = new HashMap<String, Object>();
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
final List<Allele> alleles = new ArrayList<>();
final Map<String, Object> attributes = new HashMap<>();
final ArrayList<Genotype> genotypes = new ArrayList<>();
for (String sample : samples) {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
SampleStratification sampleStat = stats.getSampleStatistics(sample);
gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
gb.attribute(LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
gb.attribute(ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
gb.attribute(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY, sampleStat.averageCoverage(interval.size()));
gb.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
gb.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
genotypes.add(gb.make());
@ -283,11 +271,11 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles);
vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR);
vcb.filters(new LinkedHashSet<String>(statusToStrings(stats.callableStatuses(), true)));
vcb.filters(new LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true)));
attributes.put(VCFConstants.END_KEY, interval.getStop());
attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
attributes.put(GC_CONTENT_KEY, stats.gcContent());
attributes.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
attributes.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent());
vcb = vcb.attributes(attributes);
vcb = vcb.genotypes(genotypes);
@ -347,7 +335,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
* @return a matching set of strings
*/
private List<String> statusToStrings(Iterable<CallableStatus> statuses, final boolean isInfoField) {
List<String> output = new LinkedList<String>();
List<String> output = new LinkedList<>();
for (CallableStatus status : statuses)
if ( isInfoField || status != CallableStatus.PASS )
@ -398,19 +386,19 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
* @return A set of VCF header lines
*/
private static Set<VCFHeaderLine> getHeaderInfo() {
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
Set<VCFHeaderLine> headerLines = new HashSet<>();
// INFO fields for overall data
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size."));
headerLines.add(new VCFInfoHeaderLine(GC_CONTENT_KEY, 1, VCFHeaderLineType.Float, "GC Content of the interval"));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AVG_INTERVAL_DP_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY));
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
// FORMAT fields for each genotype
headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size."));
headerLines.add(new VCFFormatHeaderLine(LOW_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with low coverage (below the minimum coverage) but not zero."));
headerLines.add(new VCFFormatHeaderLine(ZERO_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with zero coverage."));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.LOW_COVERAGE_LOCI));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ZERO_COVERAGE_LOCI));
// FILTER fields
for (CallableStatus stat : CallableStatus.values())

View File

@ -51,8 +51,8 @@
package org.broadinstitute.gatk.tools.walkers.diagnostics.diagnosetargets;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;

View File

@ -56,11 +56,11 @@ import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.Gather;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.report.GATKReport;
import org.broadinstitute.gatk.engine.report.GATKReportGatherer;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.report.GATKReport;
import org.broadinstitute.gatk.utils.report.GATKReportGatherer;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.GenomeLocSortedSet;
@ -74,9 +74,9 @@ import java.io.PrintStream;
import java.util.List;
/**
* Walks along reference and calculates a few metrics for each interval.
* Collect quality metrics for a set of intervals
*
* Metrics:
* <p>This tool collects the following metrics:</p>
* <ul>
* <li>Average Base Quality</li>
* <li>Average Mapping Quality</li>
@ -88,9 +88,11 @@ import java.util.List;
* <li>Length of the uncovered interval</li>
* </ul>
*
* <p>It is meant to be run on a set of intervals that have been identified as problematic in earlier stages of quality control and are considered "missing" from the sequence dataset.</p>
*
* <h3>Input</h3>
* <p>
* A reference file (for GC content), the input bam file (for base and mapping quality calculation), the missing intervals (in the -L), the baits/targets used to sequence (in the -targets) and a bed file with the coding sequence intervals of the genome (in the -cds)
* A reference file (for GC content), the input bam file (for base and mapping quality calculation), the missing intervals (in the -L), the baits/targets used to sequence (in the -targets) and a bed file with the coding sequence intervals of the genome (in the -cds).
* </p>
*
* <h3>Output</h3>
@ -98,11 +100,11 @@ import java.util.List;
* GC content, distance from the end of the target, coding sequence intersection, mapping and base quality averages and average depth per "missing" interval.
* </p>
*
* <h3>Example</h3>
* <h3>Usage example</h3>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* java -jar GenomeAnalysisTK.jar \
* -T QualifyMissingIntervals \
* -R ref.fasta \
* -R reference.fasta \
* -I input.bam \
* -o output.grp \
* -L input.intervals \

View File

@ -52,6 +52,7 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.genotyper.AlleleListPermutation;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;

View File

@ -52,11 +52,10 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.clipping.ReadClipper;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.gatk.utils.collections.Pair;

View File

@ -52,7 +52,7 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import com.google.java.contract.Requires;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.MathUtils;

View File

@ -54,7 +54,6 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.samtools.SAMUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACcounts;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.gatk.utils.MathUtils;
@ -62,6 +61,7 @@ import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import java.util.*;
@ -319,7 +319,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
iterator.next();
}
if (VERBOSE) {
System.out.println(VCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd));
System.out.println(GATKVCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd));
}
return new Pair<int[], Double>(mlInd,maxVal);
}

View File

@ -52,19 +52,19 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
@ -287,7 +287,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
final HashMap<String, Object> attributes = new HashMap<String, Object>();
if (UAC.referenceSampleName != null && perLaneErrorModels != null)
attributes.put(VCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels));
attributes.put(GATKVCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels));
builder.attributes(attributes);
// create the genotypes; no-call everyone for now

View File

@ -51,7 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;

View File

@ -52,10 +52,10 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;

View File

@ -77,10 +77,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;

View File

@ -52,10 +52,10 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;

View File

@ -52,7 +52,10 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.genotyper.AlleleList;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.genotyper.SampleListUtils;
/**
* Encapsulates the data use to make the genotype calls.

View File

@ -54,15 +54,13 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult;
@ -74,6 +72,8 @@ import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
@ -85,10 +85,6 @@ import java.util.*;
*/
public abstract class GenotypingEngine<Config extends StandardCallerArgumentCollection> {
public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA";
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
protected final AFCalculatorProvider afCalculatorProvider ;
protected Logger logger;
@ -157,7 +153,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
public Set<VCFInfoHeaderLine> getAppropriateVCFInfoHeaders() {
Set<VCFInfoHeaderLine> headerInfo = new HashSet<>();
if ( configuration.genotypeArgs.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED )
headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site"));
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY));
return headerInfo;
}
@ -261,7 +257,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
//TODO and change the code below accordingly.
builder.log10PError(log10Confidence == 0.0 ? -0.0 : log10Confidence);
if ( ! passesCallThreshold(phredScaledConfidence) )
builder.filter(LOW_QUAL_FILTER_NAME);
builder.filter(GATKVCFConstants.LOW_QUAL_FILTER_NAME);
// create the genotypes
@ -570,10 +566,6 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
*/
private static AFPriorProvider composeAlleleFrequencyPriorProvider(final int N, final double heterozygosity, final List<Double> inputPriors) {
final double[] priors = new double[N + 1];
double sum = 0.0;
final AFPriorProvider result;
if (!inputPriors.isEmpty()) {
// user-specified priors
if (inputPriors.size() != N)
@ -651,17 +643,17 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
attributes.putAll(vc.getAttributes());
// if the site was down-sampled, record that fact
if ( !limitedContext && rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
attributes.put(GATKVCFConstants.DOWNSAMPLED_KEY, true);
// add the MLE AC and AF annotations
if ( alleleCountsofMLE.size() > 0 ) {
attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
attributes.put(GATKVCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
final ArrayList<Double> MLEfrequencies = calculateMLEAlleleFrequencies(alleleCountsofMLE, genotypes);
attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
attributes.put(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
}
if ( configuration.genotypeArgs.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED )
attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());
attributes.put(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());
return attributes;
@ -673,7 +665,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
for (final Allele a : g.getAlleles())
if (!a.isNoCall()) AN++;
final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size());
final ArrayList<Double> MLEfrequencies = new ArrayList<>(alleleCountsofMLE.size());
// the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1)
for (final int AC : alleleCountsofMLE )
MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN));

View File

@ -53,6 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.gatk.utils.genotyper.AlleleList;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import java.util.List;

View File

@ -52,6 +52,7 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.genotyper.AlleleList;
/**
* Common interface for genotyping models.

View File

@ -51,6 +51,8 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
/**
* {@link PloidyModel} implementation tailored to work with a homogeneous constant ploidy
* across samples and positions.

View File

@ -52,10 +52,10 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.GenomeLoc;

View File

@ -53,6 +53,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.gatk.utils.genotyper.AlleleList;
import org.broadinstitute.gatk.utils.genotyper.AlleleListPermutation;
import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import java.util.ArrayList;

View File

@ -51,6 +51,8 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
/**
* Information about the number of chromosome per sample at a given location.
*

View File

@ -52,10 +52,10 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@ -69,6 +69,7 @@ import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
@ -202,7 +203,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
gb.DP(sampleData.depth);
gb.alleles(noCall);
if (UAC.annotateAllSitesWithPLs)
gb.attribute(UnifiedGenotypingEngine.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true)));
gb.attribute(GATKVCFConstants.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true)));
genotypes.add(gb.make());
}

View File

@ -0,0 +1,231 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (PHONE-HOME) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEES user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2014 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorImplementation;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.utils.collections.DefaultHashMap;
import htsjdk.variant.variantcontext.VariantContext;
import java.io.File;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.util.Collections;
import java.util.Map;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 8/20/12
* A collection of arguments that are common to the various callers.
* This is pulled out so that every caller isn't exposed to the arguments from every other caller.
*/
public class StandardCallerArgumentCollection implements Cloneable {
@ArgumentCollection
public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection();
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
public GenotypingOutputMode genotypingOutputMode = GenotypingOutputMode.DISCOVERY;
/**
* When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
*/
@Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
public RodBinding<VariantContext> alleles;
/**
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
* will try to remove (N * contamination fraction) bases for each alternate allele.
*/
@Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0;
/**
* This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples.
* Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION.
**/
@Advanced
@Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"<SampleID><TAB><Contamination>\" (Contamination is double) per line; No header.", required = false)
public File CONTAMINATION_FRACTION_FILE = null;
/**
* Indicates whether there is some sample contamination present.
*/
private boolean sampleContaminationWasLoaded = false;
/**
*
* @return an _Immutable_ copy of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION
*/
public Map<String,Double> getSampleContamination(){
//make sure that the default value is set up right
sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
if (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0)
sampleContaminationWasLoaded = true;
return Collections.unmodifiableMap(sampleContamination);
}
public void setSampleContamination(DefaultHashMap<String, Double> sampleContamination) {
this.sampleContamination.clear();
this.sampleContaminationWasLoaded = !Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0;
if (!sampleContaminationWasLoaded)
for (final Double d : sampleContamination.values())
if (!Double.isNaN(d) && d > 0.0) {
sampleContaminationWasLoaded = true;
break;
}
this.sampleContamination.putAll(sampleContamination);
this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
}
/**
* Returns true if there is some sample contamination present, false otherwise.
* @return {@code true} iff there is some sample contamination
*/
public boolean isSampleContaminationPresent() {
return (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) || sampleContaminationWasLoaded;
}
//Needs to be here because it uses CONTAMINATION_FRACTION
private DefaultHashMap<String,Double> sampleContamination = new DefaultHashMap<String,Double>(CONTAMINATION_FRACTION);
/**
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
*/
@Hidden
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
public AFCalculatorImplementation requestedAlleleFrequencyCalculationModel;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY;
/**
* Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
* This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
* WARNINGS:
* - This feature will inflate VCF file size considerably.
* - All SNP ALT alleles will be emitted with corresponding 10 PL values.
* - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
*/
@Advanced
@Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)
public boolean annotateAllSitesWithPLs = false;
/**
* Creates a Standard caller argument collection with default values.
*/
public StandardCallerArgumentCollection() { }
/**
* "Casts" a caller argument collection into another type.
*
* <p>Common fields values are copied across</p>
* @param clazz the class of the result.
* @param <T> result argument collection class.
* @return never {@code null}.
*/
public <T extends StandardCallerArgumentCollection> T cloneTo(final Class<T> clazz) {
// short cut: just use regular clone if it happens to be the same class.
if (clazz == getClass())
return (T) clone();
try {
final T result = clazz.newInstance();
for (final Field field : getClass().getFields()) {
// just copy common fields.
if (!field.getDeclaringClass().isAssignableFrom(clazz))
continue;
final int fieldModifiers = field.getModifiers();
if ((fieldModifiers & UNCOPYABLE_MODIFIER_MASK) != 0) continue;
//Use the clone() method if appropriate
if (Cloneable.class.isAssignableFrom(field.getType())) {
Method clone = field.getType().getMethod("clone");
field.set(result, clone.invoke(field.get(this)));
} else
field.set(result,field.get(this));
}
return result;
} catch (final Exception ex) {
throw new IllegalStateException(ex);
}
}
/**
* Creates a copy of this configuration.
* @return never {@code null}.
*/
@Override
public StandardCallerArgumentCollection clone() {
try {
StandardCallerArgumentCollection cloned = (StandardCallerArgumentCollection) super.clone();
cloned.genotypeArgs = genotypeArgs.clone();
return cloned;
} catch (CloneNotSupportedException e) {
throw new IllegalStateException("unreachable code");
}
}
/**
* Holds a modifiers mask that identifies those fields that cannot be copied between
* StandardCallerArgumentCollections.
*/
private final int UNCOPYABLE_MODIFIER_MASK = Modifier.PRIVATE | Modifier.STATIC | Modifier.FINAL;
}

Some files were not shown because too many files have changed in this diff Show More