Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
e5ce5e265a
299
build.xml
299
build.xml
|
|
@ -163,6 +163,14 @@
|
|||
<!-- Remove old versions of ivy jars AFTER the ivy:retrieve has been class loaded. -->
|
||||
<delete file="${ivy.jar.dir}/ivy-2.0.0.jar"/>
|
||||
<delete file="${ivy.jar.dir}/ivy-2.2.0-rc1.jar"/>
|
||||
<!--
|
||||
An old versions of the ivy-1.4.1.xml does not contain /ivy-module/configuration/conf/@name="compile".
|
||||
Easier to upgrade to 1.4.4 than try to deal with xmlproperty and conditional deletion in ant.
|
||||
Just in case we remove explicit 1.4.4 and go back to 1.4.1, try to clean out the file for now.
|
||||
-->
|
||||
<delete file="${ivy.home}/cache/javax.mail/mail/ivy-1.4.1.xml"/>
|
||||
<delete file="${ivy.home}/cache/javax.mail/mail/ivydata-1.4.1.properties"/>
|
||||
<delete file="${ivy.home}/cache/javax.mail/mail/jars/mail-1.4.1.jar"/>
|
||||
</target>
|
||||
|
||||
<target name="init.buildall">
|
||||
|
|
@ -709,53 +717,6 @@
|
|||
</antcall>
|
||||
</target>
|
||||
|
||||
<target name="test.init.compile">
|
||||
<mkdir dir="${java.test.classes}"/>
|
||||
<mkdir dir="${scala.test.classes}"/>
|
||||
<antcall target="resolve">
|
||||
<param name="ivy.conf" value="test"/>
|
||||
</antcall>
|
||||
</target>
|
||||
|
||||
<target name="test.java.compile" depends="init.buildall,dist,test.init.compile">
|
||||
<echo message="Sting: Compiling test cases!"/>
|
||||
<javac fork="true" memoryMaximumSize="512m" destdir="${java.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
|
||||
<src path="${java.public.test.sources}"/>
|
||||
<src path="${java.private.test.sources}"/>
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.classes}"/>
|
||||
<pathelement location="${java.contracts}"/>
|
||||
<pathelement location="${lib.dir}/testng-5.14.1.jar"/>
|
||||
</classpath>
|
||||
<compilerarg value="-proc:none"/>
|
||||
<!--
|
||||
<compilerarg value="-Acom.google.java.contract.debug"/>
|
||||
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
|
||||
-->
|
||||
</javac>
|
||||
</target>
|
||||
|
||||
<target name="test.scala.compile" depends="test.java.compile,scala.compile" if="scala.include">
|
||||
<echo message="Scala: Compiling test cases!"/>
|
||||
<antcall target="resolve">
|
||||
<param name="ivy.conf" value="test"/>
|
||||
</antcall>
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.test.classes}" deprecation="yes" unchecked="yes">
|
||||
<src path="${scala.public.test.sources}" />
|
||||
<src path="${scala.private.test.sources}" />
|
||||
<include name="**/*.scala"/>
|
||||
<classpath>
|
||||
<path refid="scala.dependencies"/>
|
||||
<pathelement location="${scala.test.classes}"/>
|
||||
<pathelement location="${java.test.classes}"/>
|
||||
<pathelement location="${lib.dir}/testng-5.14.1.jar"/>
|
||||
</classpath>
|
||||
</scalac>
|
||||
</target>
|
||||
|
||||
<target name="test.compile" depends="init.usecontracts,test.java.compile,test.scala.compile" />
|
||||
|
||||
<!-- new scala target -->
|
||||
|
||||
<target name="scala" description="build the scala directory">
|
||||
|
|
@ -769,20 +730,113 @@
|
|||
<!-- ***************************************************************************** -->
|
||||
<!-- where to put reports and tests-->
|
||||
<property name="report" value="${build.dir}/report"/>
|
||||
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
|
||||
<property name="test.output" value="${dist.dir}/test"/>
|
||||
<property name="java.public.test.sources" value="public/java/test"/>
|
||||
<property name="java.private.test.sources" value="private/java/test"/>
|
||||
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
|
||||
<property name="java.public.test.classes" value="${java.test.classes}/public"/>
|
||||
<property name="java.private.test.classes" value="${java.test.classes}/private"/>
|
||||
<property name="java.public.test.sources" value="${public.dir}/java/test"/>
|
||||
<property name="java.private.test.sources" value="${private.dir}/java/test"/>
|
||||
<property name="scala.test.classes" value="${build.dir}/scala/testclasses"/>
|
||||
<property name="scala.public.test.sources" value="public/scala/test"/>
|
||||
<property name="scala.private.test.sources" value="private/scala/test"/>
|
||||
<property name="scala.public.test.classes" value="${scala.test.classes}/public"/>
|
||||
<property name="scala.private.test.classes" value="${scala.test.classes}/private"/>
|
||||
<property name="scala.public.test.sources" value="${public.dir}/scala/test"/>
|
||||
<property name="scala.private.test.sources" value="${private.dir}/scala/test"/>
|
||||
<property name="testng.jar" value="${lib.dir}/testng-5.14.1.jar"/>
|
||||
<!-- provide a ceiling on the memory that unit/integration tests can consume. -->
|
||||
<property name="test.maxmemory" value="4g"/>
|
||||
|
||||
<target name="test.init.compile">
|
||||
<mkdir dir="${java.test.classes}"/>
|
||||
<mkdir dir="${scala.test.classes}"/>
|
||||
<antcall target="resolve">
|
||||
<param name="ivy.conf" value="test"/>
|
||||
</antcall>
|
||||
</target>
|
||||
|
||||
<target name="test.java.public.compile" depends="dist,test.init.compile">
|
||||
<mkdir dir="${java.public.test.classes}"/>
|
||||
<echo message="Sting: Compiling public test cases!"/>
|
||||
<javac fork="true" memoryMaximumSize="512m" destdir="${java.public.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
|
||||
<src path="${java.public.test.sources}"/>
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.classes}"/>
|
||||
<pathelement location="${java.contracts}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
<compilerarg value="-proc:none"/>
|
||||
<!--
|
||||
<compilerarg value="-Acom.google.java.contract.debug"/>
|
||||
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
|
||||
-->
|
||||
</javac>
|
||||
</target>
|
||||
|
||||
<target name="test.java.private.compile" depends="dist,test.init.compile,test.java.public.compile" if="include.private">
|
||||
<mkdir dir="${java.private.test.classes}"/>
|
||||
<echo message="Sting: Compiling private test cases!"/>
|
||||
<javac fork="true" memoryMaximumSize="512m" destdir="${java.private.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
|
||||
<src path="${java.private.test.sources}"/>
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.public.test.classes}"/>
|
||||
<pathelement location="${java.classes}"/>
|
||||
<pathelement location="${java.contracts}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
<compilerarg value="-proc:none"/>
|
||||
<!--
|
||||
<compilerarg value="-Acom.google.java.contract.debug"/>
|
||||
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
|
||||
-->
|
||||
</javac>
|
||||
</target>
|
||||
|
||||
<target name="test.java.compile" depends="test.java.public.compile, test.java.private.compile"/>
|
||||
|
||||
<target name="test.scala.public.compile" depends="test.java.compile,scala.compile" if="scala.include">
|
||||
<mkdir dir="${scala.public.test.classes}"/>
|
||||
<echo message="Scala: Compiling public test cases!"/>
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.public.test.classes}" deprecation="yes" unchecked="yes">
|
||||
<src path="${scala.public.test.sources}" />
|
||||
<classpath>
|
||||
<path refid="scala.dependencies"/>
|
||||
<pathelement location="${java.public.test.classes}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
</scalac>
|
||||
</target>
|
||||
|
||||
<target name="test.scala.private.compile" depends="test.java.compile,scala.compile,test.scala.public.compile" if="include.scala.private">
|
||||
<mkdir dir="${scala.private.test.classes}"/>
|
||||
<echo message="Scala: Compiling private test cases!"/>
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.private.test.classes}" deprecation="yes" unchecked="yes">
|
||||
<src path="${scala.private.test.sources}" />
|
||||
<classpath>
|
||||
<path refid="scala.dependencies"/>
|
||||
<pathelement location="${scala.public.test.classes}"/>
|
||||
<pathelement location="${java.public.test.classes}"/>
|
||||
<pathelement location="${java.private.test.classes}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
</scalac>
|
||||
</target>
|
||||
|
||||
<target name="test.scala.compile" depends="test.scala.public.compile,test.scala.private.compile"/>
|
||||
|
||||
<target name="test.compile" depends="init.usecontracts,test.java.compile,test.scala.compile" />
|
||||
|
||||
<!-- TEST -->
|
||||
<macrodef name="run-test">
|
||||
<attribute name="testtype"/>
|
||||
<attribute name="outputdir"/>
|
||||
<attribute name="runfailed"/>
|
||||
|
||||
<sequential>
|
||||
<condition property="run.failed.tests">
|
||||
<equals arg1="@{runfailed}" arg2="true"/>
|
||||
</condition>
|
||||
|
||||
<!-- Get the pipeline run type. Default to dry. -->
|
||||
<condition property="pipeline.run" value="dry" else="${pipeline.run}">
|
||||
<equals arg1="${pipeline.run}" arg2="$${pipeline.run}" />
|
||||
|
|
@ -792,10 +846,10 @@
|
|||
<isset property="include.contracts" />
|
||||
</condition>
|
||||
|
||||
<mkdir dir="${report}/@{testtype}"/>
|
||||
<mkdir dir="@{outputdir}"/>
|
||||
<echo message="Sting: Running @{testtype} test cases!"/>
|
||||
<taskdef resource="testngtasks" classpath="${lib.dir}/testng-5.14.1.jar"/>
|
||||
<testng outputDir="${report}/@{testtype}"
|
||||
<taskdef resource="testngtasks" classpath="${testng.jar}"/>
|
||||
<testng outputDir="@{outputdir}"
|
||||
haltOnFailure="false" failureProperty="test.failure"
|
||||
verbose="2"
|
||||
workingDir="${basedir}"
|
||||
|
|
@ -813,117 +867,108 @@
|
|||
<pathelement location="${java.classes}" />
|
||||
<pathelement location="${scala.classes}" />
|
||||
<pathelement location="${java.contracts}" />
|
||||
<pathelement location="${java.test.classes}" />
|
||||
<pathelement location="${scala.test.classes}" />
|
||||
<pathelement location="${java.public.test.classes}" />
|
||||
<pathelement location="${java.private.test.classes}" />
|
||||
<pathelement location="${scala.public.test.classes}" />
|
||||
<pathelement location="${scala.private.test.classes}" />
|
||||
</classpath>
|
||||
|
||||
<classfileset dir="${java.test.classes}" includes="**/@{testtype}.class"/>
|
||||
<classfileset dir="${scala.test.classes}" includes="**/@{testtype}*.class" />
|
||||
<classfileset dir="${java.public.test.classes}" includes="**/@{testtype}.class"/>
|
||||
<classfileset dir="${java.private.test.classes}" erroronmissingdir="false">
|
||||
<include name="**/@{testtype}.class" if="include.private"/>
|
||||
</classfileset>
|
||||
<classfileset dir="${scala.public.test.classes}" erroronmissingdir="false">
|
||||
<include name="**/@{testtype}*.class" if="scala.include"/>
|
||||
</classfileset>
|
||||
<classfileset dir="${scala.private.test.classes}" erroronmissingdir="false">
|
||||
<include name="**/@{testtype}*.class" if="include.scala.private"/>
|
||||
</classfileset>
|
||||
|
||||
<xmlfileset dir="${basedir}">
|
||||
<include name="@{testtype}" if="run.failed.tests"/>
|
||||
</xmlfileset>
|
||||
</testng>
|
||||
|
||||
<!-- generate a report for Bamboo or Hudson to read in -->
|
||||
<junitreport todir="${report}/@{testtype}">
|
||||
<fileset dir="${report}/@{testtype}">
|
||||
<junitreport todir="@{outputdir}">
|
||||
<fileset dir="@{outputdir}">
|
||||
<include name="*/*.xml"/>
|
||||
</fileset>
|
||||
<report format="noframes" todir="${report}/@{testtype}"/>
|
||||
<report format="noframes" todir="@{outputdir}"/>
|
||||
</junitreport>
|
||||
<fail message="test failed" if="test.failure" />
|
||||
</sequential>
|
||||
</macrodef>
|
||||
|
||||
<!-- FAILED-TEST -->
|
||||
<macrodef name="run-failed-test">
|
||||
<attribute name="xmlfailedtestfile" />
|
||||
<sequential>
|
||||
<!-- Get the pipeline run type. Default to dry. -->
|
||||
<condition property="pipeline.run" value="dry" else="${pipeline.run}">
|
||||
<equals arg1="${pipeline.run}" arg2="$${pipeline.run}" />
|
||||
</condition>
|
||||
|
||||
<condition property="cofoja.jvm.args" value="-javaagent:${cofoja.jar} -Dcom.google.java.contract.log.contract=false" else="">
|
||||
<isset property="include.contracts" />
|
||||
</condition>
|
||||
|
||||
<mkdir dir="${report}/failed_rerun" />
|
||||
<echo message="Sting: Running @{xmlfailedtestfile} test cases!"/>
|
||||
<taskdef resource="testngtasks" classpath="${lib.dir}/testng-5.14.1.jar"/>
|
||||
<testng outputDir="${report}/failed_rerun"
|
||||
haltOnFailure="false" failureProperty="test.failure"
|
||||
verbose="2"
|
||||
workingDir="${basedir}"
|
||||
useDefaultListeners="false"
|
||||
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.StingTextReporter">
|
||||
<jvmarg value="-Xmx${test.maxmemory}" />
|
||||
<jvmarg value="-Djava.awt.headless=true" />
|
||||
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
|
||||
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
|
||||
<jvmarg line="${cofoja.jvm.args}"/>
|
||||
<!-- <jvmarg value="-Xdebug"/> -->
|
||||
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/> -->
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.classes}" />
|
||||
<pathelement location="${scala.classes}" />
|
||||
<pathelement location="${java.contracts}" />
|
||||
<pathelement location="${java.test.classes}" />
|
||||
<pathelement location="${scala.test.classes}" />
|
||||
</classpath>
|
||||
|
||||
<xmlfileset dir="${basedir}" includes="@{xmlfailedtestfile}" />
|
||||
</testng>
|
||||
|
||||
<fail message="test failed" if="test.failure" />
|
||||
</sequential>
|
||||
</macrodef>
|
||||
|
||||
<!-- our three different test conditions: Test, IntegrationTest, PerformanceTest -->
|
||||
<target name="test" depends="test.compile" description="Run unit tests">
|
||||
<target name="alltests">
|
||||
<antcall target="test" inheritAll="false"/>
|
||||
<antcall target="integrationtest" inheritAll="false"/>
|
||||
<antcall target="pipelinetest" inheritAll="false"/>
|
||||
</target>
|
||||
|
||||
<target name="alltests.public">
|
||||
<antcall target="test.public" inheritAll="false"/>
|
||||
<antcall target="integrationtest.public" inheritAll="false"/>
|
||||
<antcall target="pipelinetest.public" inheritAll="false"/>
|
||||
</target>
|
||||
|
||||
<!-- Our four different test conditions: Test, IntegrationTest, PerformanceTest, PipelineTest -->
|
||||
<target name="test" depends="init.buildall,test.compile" description="Run unit tests">
|
||||
<condition property="ttype" value="*UnitTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${ttype}"/>
|
||||
<run-test testtype="${ttype}" outputdir="${report}/${ttype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="integrationtest" depends="test.compile" description="Run integration tests">
|
||||
<target name="test.public" depends="init.buildpublic,test"/>
|
||||
|
||||
<target name="integrationtest" depends="init.buildall,test.compile" description="Run integration tests">
|
||||
<condition property="itype" value="*IntegrationTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${itype}"/>
|
||||
<run-test testtype="${itype}" outputdir="${report}/${itype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="performancetest" depends="test.compile" description="Run performance tests">
|
||||
<target name="integrationtest.public" depends="init.buildpublic,integrationtest"/>
|
||||
|
||||
<target name="performancetest" depends="init.buildall,test.compile" description="Run performance tests">
|
||||
<condition property="ptype" value="*PerformanceTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${ptype}"/>
|
||||
<run-test testtype="${ptype}" outputdir="${report}/${ptype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="pipelinetest" depends="test.compile" description="Run pipeline tests">
|
||||
<target name="performancetest.public" depends="init.buildpublic,performancetest" />
|
||||
|
||||
<target name="pipelinetest" depends="init.buildall,test.compile" description="Run pipeline tests">
|
||||
<condition property="pipetype" value="*PipelineTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${pipetype}"/>
|
||||
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="pipelinetestrun" depends="test.compile" description="Run pipeline tests">
|
||||
<target name="pipelinetest.public" depends="init.buildpublic,pipelinetest" />
|
||||
|
||||
<target name="pipelinetestrun" depends="init.buildall,test.compile" description="Run pipeline tests">
|
||||
<property name="pipeline.run" value="run"/>
|
||||
<condition property="pipetype" value="*PipelineTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${pipetype}"/>
|
||||
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="pipelinetestrun.public" depends="init.buildpublic,pipelinetestrun" />
|
||||
|
||||
<target name="failed-test" depends="init.buildall,test.compile">
|
||||
<run-test testtype="${report}/*UnitTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
|
||||
</target>
|
||||
|
||||
<target name="failed-test" depends="test.compile">
|
||||
<run-failed-test xmlfailedtestfile="${report}/*UnitTest/testng-failed.xml" />
|
||||
<target name="failed-integration" depends="init.buildall,test.compile">
|
||||
<run-test testtype="${report}/*IntegrationTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
|
||||
</target>
|
||||
|
||||
<target name="failed-integration" depends="test.compile">
|
||||
<run-failed-test xmlfailedtestfile="${report}/*IntegrationTest/testng-failed.xml" />
|
||||
<target name="failed-performance" depends="init.buildall,test.compile">
|
||||
<run-test testtype="${report}/*PerformanceTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
|
||||
</target>
|
||||
|
||||
<target name="failed-performance" depends="test.compile">
|
||||
<run-failed-test xmlfailedtestfile="${report}/*PerformanceTest/testng-failed.xml" />
|
||||
</target>
|
||||
|
||||
<target name="failed-pipeline" depends="test.compile">
|
||||
<run-failed-test xmlfailedtestfile="${report}/*PipelineTest/testng-failed.xml" />
|
||||
<target name="failed-pipeline" depends="init.buildall,test.compile">
|
||||
<run-test testtype="${report}/*PipelineTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
|
||||
</target>
|
||||
|
||||
<!-- ******************************************************************************** -->
|
||||
|
|
|
|||
6
ivy.xml
6
ivy.xml
|
|
@ -15,10 +15,8 @@
|
|||
<!-- Tribble -->
|
||||
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
|
||||
|
||||
<dependency org="log4j" name="log4j" rev="1.2.15">
|
||||
<!-- Don't include javax.mail here in default, only used in scala->default by commons-email -->
|
||||
<exclude org="javax.mail" />
|
||||
</dependency>
|
||||
<dependency org="log4j" name="log4j" rev="1.2.15"/>
|
||||
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
|
||||
<dependency org="colt" name="colt" rev="1.2.0"/>
|
||||
<dependency org="jboss" name="javassist" rev="3.7.ga"/>
|
||||
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>
|
||||
|
|
|
|||
|
|
@ -12,14 +12,14 @@ if ( onCMDLine ) {
|
|||
inputFileName = args[1]
|
||||
outputPDF = args[2]
|
||||
} else {
|
||||
#inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
|
||||
inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
|
||||
inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt"
|
||||
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
|
||||
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
|
||||
outputPDF = NA
|
||||
}
|
||||
|
||||
RUNTIME_UNITS = "(sec)"
|
||||
ORIGINAL_UNITS_TO_SECONDS = 1/1000
|
||||
RUNTIME_UNITS = "(hours)"
|
||||
ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60
|
||||
|
||||
#
|
||||
# Helper function to aggregate all of the jobs in the report across all tables
|
||||
|
|
@ -33,7 +33,7 @@ allJobsFromReport <- function(report) {
|
|||
#
|
||||
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
|
||||
#
|
||||
plotJobsGantt <- function(gatkReport, sortOverall) {
|
||||
plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
|
||||
allJobs = allJobsFromReport(gatkReport)
|
||||
if ( sortOverall ) {
|
||||
title = "All jobs, by analysis, by start time"
|
||||
|
|
@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) {
|
|||
}
|
||||
allJobs$index = 1:nrow(allJobs)
|
||||
minTime = min(allJobs$startTime)
|
||||
allJobs$relStartTime = allJobs$startTime - minTime
|
||||
allJobs$relDoneTime = allJobs$doneTime - minTime
|
||||
allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
|
||||
allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
|
||||
allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
|
||||
maxRelTime = max(allJobs$relDoneTime)
|
||||
p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
|
||||
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
|
||||
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
|
||||
p <- p + theme_bw()
|
||||
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
|
||||
if ( includeText )
|
||||
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
|
||||
p <- p + xlim(0, maxRelTime * 1.1)
|
||||
p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
|
||||
p <- p + ylab("Job")
|
||||
p <- p + ylab("Job number")
|
||||
p <- p + opts(title=title)
|
||||
print(p)
|
||||
}
|
||||
|
|
@ -140,6 +142,8 @@ print(paste("Project :", inputFileName))
|
|||
convertUnits <- function(gatkReportData) {
|
||||
convertGroup <- function(g) {
|
||||
g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS
|
||||
g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS
|
||||
g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS
|
||||
g
|
||||
}
|
||||
lapply(gatkReportData, convertGroup)
|
||||
|
|
@ -155,8 +159,8 @@ if ( ! is.na(outputPDF) ) {
|
|||
pdf(outputPDF, height=8.5, width=11)
|
||||
}
|
||||
|
||||
plotJobsGantt(gatkReportData, T)
|
||||
plotJobsGantt(gatkReportData, F)
|
||||
plotJobsGantt(gatkReportData, T, F)
|
||||
plotJobsGantt(gatkReportData, F, F)
|
||||
plotProgressByTime(gatkReportData)
|
||||
for ( group in gatkReportData ) {
|
||||
plotGroup(group)
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
|
|||
private String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
|
||||
private String OUTPUT_DIR = "analyzeCovariates/";
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/software/free/Linux/redhat_5_x86_64/pkgs/r_2.12.0/bin/Rscript", required = false)
|
||||
private String PATH_TO_RSCRIPT = "Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
private String PATH_TO_RESOURCES = "public/R/";
|
||||
|
|
|
|||
|
|
@ -379,7 +379,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
}
|
||||
|
||||
if ( tribbleType == null )
|
||||
if ( ! file.canRead() | !! file.isFile() ) {
|
||||
if ( ! file.canRead() | ! file.isFile() ) {
|
||||
throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
|
||||
} else {
|
||||
throw new UserException.CommandLineException(
|
||||
|
|
|
|||
|
|
@ -929,6 +929,14 @@ public class GenomeAnalysisEngine {
|
|||
return readsDataSource.getHeader(reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the master sequence dictionary for this GATK engine instance
|
||||
* @return a never-null dictionary listing all of the contigs known to this engine instance
|
||||
*/
|
||||
public SAMSequenceDictionary getMasterSequenceDictionary() {
|
||||
return getReferenceDataSource().getReference().getSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns data source object encapsulating all essential info and handlers used to traverse
|
||||
* reads; header merger, individual file readers etc can be accessed through the returned data source object.
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
|
|
@ -84,7 +85,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
if(currentLocus == GenomeLoc.UNMAPPED) {
|
||||
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan());
|
||||
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
|
||||
currentLocus = null;
|
||||
continue;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -215,6 +215,45 @@ public class GATKBAMIndex {
|
|||
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Use to get close to the unmapped reads at the end of a BAM file.
|
||||
* @return The file offset of the first record in the last linear bin, or -1
|
||||
* if there are no elements in linear bins (i.e. no mapped reads).
|
||||
*/
|
||||
public long getStartOfLastLinearBin() {
|
||||
openIndexFile();
|
||||
|
||||
seek(4);
|
||||
|
||||
final int sequenceCount = readInteger();
|
||||
// Because no reads may align to the last sequence in the sequence dictionary,
|
||||
// grab the last element of the linear index for each sequence, and return
|
||||
// the last one from the last sequence that has one.
|
||||
long lastLinearIndexPointer = -1;
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
// System.out.println("# Sequence TID: " + i);
|
||||
final int nBins = readInteger();
|
||||
// System.out.println("# nBins: " + nBins);
|
||||
for (int j1 = 0; j1 < nBins; j1++) {
|
||||
// Skip bin #
|
||||
skipBytes(4);
|
||||
final int nChunks = readInteger();
|
||||
// Skip chunks
|
||||
skipBytes(16 * nChunks);
|
||||
}
|
||||
final int nLinearBins = readInteger();
|
||||
if (nLinearBins > 0) {
|
||||
// Skip to last element of list of linear bins
|
||||
skipBytes(8 * (nLinearBins - 1));
|
||||
lastLinearIndexPointer = readLongs(1)[0];
|
||||
}
|
||||
}
|
||||
|
||||
closeIndexFile();
|
||||
|
||||
return lastLinearIndexPointer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the possible number of bins for a given reference sequence.
|
||||
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
|||
*/
|
||||
public FilePointer next() {
|
||||
FilePointer current = wrappedIterator.next();
|
||||
while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
|
||||
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
|
||||
current = current.combine(parser,wrappedIterator.next());
|
||||
return current;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -134,24 +134,11 @@ public class ReadShardStrategy implements ShardStrategy {
|
|||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
// If the region contains location information (in other words, it is not at
|
||||
// the start of the unmapped region), add the region.
|
||||
if(currentFilePointer.isRegionUnmapped) {
|
||||
// If the region is unmapped and no location data exists, add a null as an indicator to
|
||||
// start at the next unmapped region.
|
||||
if(!isIntoUnmappedRegion) {
|
||||
selectedReaders.put(id,null);
|
||||
isIntoUnmappedRegion = true;
|
||||
}
|
||||
else
|
||||
selectedReaders.put(id,position.get(id));
|
||||
}
|
||||
else {
|
||||
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
|
||||
if(!fileSpan.isEmpty())
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
|
||||
if(!fileSpan.isEmpty())
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
|
||||
if(selectedReaders.size() > 0) {
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.examples;
|
|||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -59,6 +60,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
@Hidden
|
||||
public class GATKDocsExample extends RodWalker<Integer, Integer> {
|
||||
/**
|
||||
* Put detailed documentation about the argument here. No need to duplicate the summary information
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class PlatformFilter extends ReadFilter {
|
||||
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false)
|
||||
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false)
|
||||
protected String[] PLFilterNames;
|
||||
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
|
|||
else if ( stub.getOutputStream() != null ) {
|
||||
this.file = null;
|
||||
this.stream = stub.getOutputStream();
|
||||
writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes());
|
||||
writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes());
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
|
||||
|
|
@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
|
|||
}
|
||||
|
||||
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
|
||||
return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
|
||||
return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.io.stubs;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.gatk.CommandLineExecutable;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
return isCompressed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the master sequence dictionary from the engine associated with this stub
|
||||
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
|
||||
* @return
|
||||
*/
|
||||
public SAMSequenceDictionary getMasterSequenceDictionary() {
|
||||
return engine.getMasterSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we tell the VCF writer not to write genotypes?
|
||||
* @return true if the writer should not write genotypes.
|
||||
|
|
|
|||
|
|
@ -293,15 +293,16 @@ public class GATKRunReport {
|
|||
* That is, postReport() is guarenteed not to fail for any reason.
|
||||
*/
|
||||
private File postReportToLocalDisk(File rootDir) {
|
||||
String filename = getID() + ".report.xml.gz";
|
||||
File file = new File(rootDir, filename);
|
||||
try {
|
||||
String filename = getID() + ".report.xml.gz";
|
||||
File file = new File(rootDir, filename);
|
||||
postReportToFile(file);
|
||||
logger.debug("Wrote report to " + file);
|
||||
return file;
|
||||
} catch ( Exception e ) {
|
||||
// we catch everything, and no matter what eat the error
|
||||
exceptDuringRunReport("Couldn't read report file", e);
|
||||
file.delete();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -312,6 +313,7 @@ public class GATKRunReport {
|
|||
File localFile = postReportToLocalDisk(new File("./"));
|
||||
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
|
||||
if ( localFile != null ) { // we succeeded in creating the local file
|
||||
localFile.deleteOnExit();
|
||||
try {
|
||||
// stop us from printing the annoying, and meaningless, mime types warning
|
||||
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
|
||||
|
|
@ -336,14 +338,13 @@ public class GATKRunReport {
|
|||
//logger.info("Uploading " + localFile + " to AWS bucket");
|
||||
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
|
||||
logger.debug("Uploaded to AWS: " + s3Object);
|
||||
logger.info("Uploaded run statistics report to AWS S3");
|
||||
} catch ( S3ServiceException e ) {
|
||||
exceptDuringRunReport("S3 exception occurred", e);
|
||||
} catch ( NoSuchAlgorithmException e ) {
|
||||
exceptDuringRunReport("Couldn't calculate MD5", e);
|
||||
} catch ( IOException e ) {
|
||||
exceptDuringRunReport("Couldn't read report file", e);
|
||||
} finally {
|
||||
localFile.delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ public class RMDIndexer extends CommandLineProgram {
|
|||
Index index = IndexFactory.createIndex(inputFileSource, codec, approach);
|
||||
|
||||
// add writing of the sequence dictionary, if supplied
|
||||
builder.setIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary(), indexFile, false);
|
||||
builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary());
|
||||
|
||||
// create the output stream, and write the index
|
||||
LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
|
||||
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Utilities for working with Sequence Dictionaries embedded in tribble indices
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class IndexDictionaryUtils {
|
||||
private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class);
|
||||
|
||||
// a constant we use for marking sequence dictionary entries in the Tribble index property list
|
||||
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
|
||||
|
||||
/**
|
||||
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
|
||||
* @param index the index file to use
|
||||
* @return a SAMSequenceDictionary if available, null if unavailable
|
||||
*/
|
||||
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
|
||||
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
|
||||
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
|
||||
Integer.valueOf(entry.getValue())));
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the sequence dictionary with the contig list; a backup approach
|
||||
* @param index the index file to use
|
||||
* @param dict the sequence dictionary to add contigs to
|
||||
* @return the filled-in sequence dictionary
|
||||
*/
|
||||
static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
|
||||
LinkedHashSet<String> seqNames = index.getSequenceNames();
|
||||
if (seqNames == null) {
|
||||
return dict;
|
||||
}
|
||||
for (String name : seqNames) {
|
||||
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
|
||||
dict.addSequence(seq);
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) {
|
||||
for ( SAMSequenceRecord seq : dict.getSequences() ) {
|
||||
final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName();
|
||||
final String length = String.valueOf(seq.getSequenceLength());
|
||||
index.addProperty(contig,length);
|
||||
}
|
||||
}
|
||||
|
||||
public static void validateTrackSequenceDictionary(final String trackName,
|
||||
final SAMSequenceDictionary trackDict,
|
||||
final SAMSequenceDictionary referenceDict,
|
||||
final ValidationExclusion.TYPE validationExclusionType ) {
|
||||
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||
if (trackDict == null || trackDict.size() == 0)
|
||||
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||
else {
|
||||
Set<String> trackSequences = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -25,7 +25,6 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureSource;
|
||||
|
|
@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof;
|
|||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* @author aaron
|
||||
* `
|
||||
* Class RMDTrackBuilder
|
||||
*
|
||||
|
|
@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
|
||||
public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false;
|
||||
|
||||
// a constant we use for marking sequence dictionary entries in the Tribble index property list
|
||||
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
|
||||
|
||||
// private sequence dictionary we use to set our tracks with
|
||||
private SAMSequenceDictionary dict = null;
|
||||
|
||||
|
|
@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); }
|
||||
catch (ReviewedStingException e) { }
|
||||
|
||||
sequenceDictionary = getSequenceDictionaryFromProperties(index);
|
||||
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
|
||||
|
||||
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
|
||||
if (sequenceDictionary.size() == 0 && dict != null) {
|
||||
File indexFile = Tribble.indexFile(inputFile);
|
||||
setIndexSequenceDictionary(inputFile,index,dict,indexFile,true);
|
||||
sequenceDictionary = getSequenceDictionaryFromProperties(index);
|
||||
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
|
||||
try { // re-write the index
|
||||
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
|
||||
} catch (IOException e) {
|
||||
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
|
||||
}
|
||||
|
||||
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
|
||||
}
|
||||
|
||||
if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE )
|
||||
|
|
@ -363,88 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
// this can take a while, let them know what we're doing
|
||||
logger.info("Creating Tribble index in memory for file " + inputFile);
|
||||
Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
|
||||
setIndexSequenceDictionary(inputFile, idx, dict, null, false);
|
||||
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict);
|
||||
return idx;
|
||||
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
// static functions to work with the sequence dictionaries of indexes
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
|
||||
* @param index the index file to use
|
||||
* @return a SAMSequenceDictionary if available, null if unavailable
|
||||
*/
|
||||
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
|
||||
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
|
||||
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
|
||||
Integer.valueOf(entry.getValue())));
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the sequence dictionary with the contig list; a backup approach
|
||||
* @param index the index file to use
|
||||
* @param dict the sequence dictionary to add contigs to
|
||||
* @return the filled-in sequence dictionary
|
||||
*/
|
||||
private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
|
||||
LinkedHashSet<String> seqNames = index.getSequenceNames();
|
||||
if (seqNames == null) {
|
||||
return dict;
|
||||
}
|
||||
for (String name : seqNames) {
|
||||
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
|
||||
dict.addSequence(seq);
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible.
|
||||
* (that each contig in the index is in the sequence dictionary).
|
||||
* @param inputFile for proper error message formatting.
|
||||
* @param dict the sequence dictionary
|
||||
* @param index the index file
|
||||
* @param indexFile the index file
|
||||
* @param rewriteIndex should we rewrite the index when we're done?
|
||||
*
|
||||
*/
|
||||
public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) {
|
||||
if (dict == null) return;
|
||||
|
||||
SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
|
||||
validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict);
|
||||
public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) {
|
||||
if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null");
|
||||
|
||||
// check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set
|
||||
for (SAMSequenceRecord seq : currentDict.getSequences()) {
|
||||
if (dict.getSequence(seq.getSequenceName()) == null)
|
||||
continue;
|
||||
index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength()));
|
||||
}
|
||||
// re-write the index
|
||||
if (rewriteIndex) try {
|
||||
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
|
||||
} catch (IOException e) {
|
||||
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
|
||||
}
|
||||
final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
|
||||
validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict);
|
||||
|
||||
// actually update the dictionary in the index
|
||||
IndexDictionaryUtils.setIndexSequenceDictionary(index, dict);
|
||||
}
|
||||
|
||||
|
||||
public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) {
|
||||
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||
if (trackDict == null || trackDict.size() == 0)
|
||||
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||
else {
|
||||
Set<String> trackSequences = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
|
||||
}
|
||||
public void validateTrackSequenceDictionary(final String trackName,
|
||||
final SAMSequenceDictionary trackDict,
|
||||
final SAMSequenceDictionary referenceDict ) {
|
||||
IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,12 +24,14 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.report;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Tracks a linked list of GATKReportColumn in order by name.
|
||||
*/
|
||||
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> {
|
||||
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> implements Iterable<GATKReportColumn> {
|
||||
private List<String> columnNames = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
|
|
@ -52,4 +54,14 @@ public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> {
|
|||
columnNames.add(key);
|
||||
return super.put(key, value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<GATKReportColumn> iterator() {
|
||||
return new Iterator<GATKReportColumn>() {
|
||||
int offset = 0;
|
||||
public boolean hasNext() { return offset < columnNames.size() ; }
|
||||
public GATKReportColumn next() { return getByIndex(offset++); }
|
||||
public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKReportColumn iterator"); }
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -286,6 +286,10 @@ public class GATKReportTable {
|
|||
}
|
||||
}
|
||||
|
||||
public boolean containsKey(Object primaryKey) {
|
||||
return primaryKeyColumn.contains(primaryKey);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the value for a given position in the table
|
||||
*
|
||||
|
|
|
|||
|
|
@ -358,7 +358,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
public void printOnTraversalDone() {
|
||||
printProgress(null, null, true);
|
||||
|
||||
final double elapsed = timer.getElapsedTime();
|
||||
final double elapsed = timer == null ? 0 : timer.getElapsedTime();
|
||||
|
||||
ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics();
|
||||
|
||||
|
|
|
|||
|
|
@ -26,21 +26,23 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Prints out all of the RODs in the input data set. Data is rendered using the toString() method
|
||||
* of the given ROD.
|
||||
*/
|
||||
public class PrintRODsWalker extends RodWalker<Integer, Integer> {
|
||||
@Input(fullName="input", shortName = "input", doc="The input ROD which should be printed out.", required=true)
|
||||
public RodBinding<Feature> input;
|
||||
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
|
|
@ -62,7 +64,7 @@ public class PrintRODsWalker extends RodWalker<Integer, Integer> {
|
|||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
for ( Feature feature : tracker.getValues(Feature.class) ) {
|
||||
for ( Feature feature : tracker.getValues(Feature.class, context.getLocation()) ) {
|
||||
out.println(feature.toString());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
|||
* -I input1.bam \
|
||||
* -I input2.bam \
|
||||
* --read_filter MappingQualityZero
|
||||
*
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T PrintReads \
|
||||
* -o output.bam \
|
||||
* -I input.bam \
|
||||
* -n 2000
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -77,6 +78,15 @@ public abstract class Walker<MapType, ReduceType> {
|
|||
return toolkit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the master sequence dictionary for this walker
|
||||
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
|
||||
* @return
|
||||
*/
|
||||
protected SAMSequenceDictionary getMasterSequenceDictionary() {
|
||||
return getToolkit().getMasterSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* (conceptual static) method that states whether you want to see reads piling up at a locus
|
||||
* that contain a deletion at the locus.
|
||||
|
|
|
|||
|
|
@ -43,6 +43,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* The allele balance (fraction of ref bases over ref + alt bases) across all bialleleic het-called samples
|
||||
*/
|
||||
public class AlleleBalance extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -16,6 +16,9 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* The allele balance (fraction of ref bases over ref + alt bases) separately for each bialleleic het-called sample
|
||||
*/
|
||||
public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
|
||||
|
|
|
|||
|
|
@ -6,8 +6,9 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|||
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Abstract base class for all annotations that are normalized by depth
|
||||
*/
|
||||
public abstract class AnnotationByDepth extends InfoFieldAnnotation {
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -47,6 +47,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Count of A, C, G, T bases across all samples
|
||||
*/
|
||||
public class BaseCounts extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -13,6 +13,9 @@ import java.util.LinkedHashMap;
|
|||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele)
|
||||
*/
|
||||
public class BaseQualityRankSumTest extends RankSumTest {
|
||||
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }
|
||||
|
||||
|
|
|
|||
|
|
@ -44,6 +44,11 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Allele count in genotypes, for each ALT allele, in the same order as listed;
|
||||
* allele Frequency, for each ALT allele, in the same order as listed; total number
|
||||
* of alleles in called genotypes.
|
||||
*/
|
||||
public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY };
|
||||
|
|
|
|||
|
|
@ -16,7 +16,23 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Total (unfiltered) depth over all samples.
|
||||
*
|
||||
* This and AD are complementary fields that are two important ways of thinking about the depth of the data for this sample
|
||||
* at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal
|
||||
* quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
|
||||
* The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
|
||||
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
|
||||
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
|
||||
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
|
||||
* the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like
|
||||
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
|
||||
* normally be excluded from the statistical calculations going into GQ and QUAL.
|
||||
*
|
||||
* Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with
|
||||
* -dcov D is N * D
|
||||
*/
|
||||
public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -23,6 +23,25 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* The depth of coverage of each VCF allele in this sample.
|
||||
*
|
||||
* This and DP are complementary fields that are two important ways of thinking about the depth of the data for this sample
|
||||
* at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal
|
||||
* quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
|
||||
* The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
|
||||
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
|
||||
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
|
||||
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
|
||||
* the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like
|
||||
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
|
||||
* normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
|
||||
* the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that
|
||||
* are actually present and correctly left-aligned in the alignments themselves). Because of this fact and
|
||||
* because the AD includes reads and bases that were filtered by the Unified Genotyper, <b>one should not base
|
||||
* assumptions about the underlying genotype based on it</b>; instead, the genotype likelihoods (PLs) are what
|
||||
* determine the genotype calls (see below).
|
||||
*/
|
||||
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
|
||||
|
||||
private static String REF_ALLELE = "REF";
|
||||
|
|
|
|||
|
|
@ -43,6 +43,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
|
||||
* being seen on only the forward or only the reverse strand) in the reads? More bias is
|
||||
* indicative of false positive calls.
|
||||
*/
|
||||
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
private static final String FS = "FS";
|
||||
private static final double MIN_PVALUE = 1E-320;
|
||||
|
|
|
|||
|
|
@ -17,6 +17,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* The GC content (# GC bases / # all bases) of the reference within 50 bp +/- this site
|
||||
*/
|
||||
public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -34,12 +34,12 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
|
|
@ -49,6 +49,10 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Consistency of the site with two (and only two) segregating haplotypes. Higher scores
|
||||
* are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls.
|
||||
*/
|
||||
public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
private final static boolean DEBUG = false;
|
||||
private final static int MIN_CONTEXT_WING_SIZE = 10;
|
||||
|
|
|
|||
|
|
@ -19,6 +19,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium
|
||||
*/
|
||||
public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation {
|
||||
|
||||
private static final int MIN_SAMPLES = 10;
|
||||
|
|
|
|||
|
|
@ -16,7 +16,9 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Largest contiguous homopolymer run of the variant allele in either direction on the reference.
|
||||
*/
|
||||
public class HomopolymerRun extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
private boolean ANNOTATE_INDELS = true;
|
||||
|
|
|
|||
|
|
@ -17,14 +17,15 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 5/16/11
|
||||
*/
|
||||
|
||||
// A set of annotations calculated directly from the GLs
|
||||
public class GLstats extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
/**
|
||||
* Likelihood-based (using PL field) test for the inbreeding among samples.
|
||||
*
|
||||
* A continuous generalization of the Hardy-Weinberg test for disequilibrium that works
|
||||
* well with limited coverage per sample. See the 1000 Genomes Phase I release for
|
||||
* more information.
|
||||
*/
|
||||
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
private static final int MIN_SAMPLES = 10;
|
||||
|
||||
|
|
@ -14,11 +14,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: Mar 11, 2011
|
||||
* Time: 11:47:33 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* Rough category of indel type (insertion, deletion, multi-allelic, other)
|
||||
*/
|
||||
public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -17,6 +17,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Triplet annotation: fraction of MAQP == 0, MAPQ < 10, and count of all mapped reads
|
||||
*/
|
||||
public class LowMQ extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -14,6 +14,9 @@ import java.util.LinkedHashMap;
|
|||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
|
||||
*/
|
||||
public class MappingQualityRankSumTest extends RankSumTest {
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("MQRankSum"); }
|
||||
|
|
|
|||
|
|
@ -19,6 +19,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Total count across all samples of mapping quality zero reads
|
||||
*/
|
||||
public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -1,85 +1,81 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Feb 4, 2011
|
||||
* Time: 6:46:25 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class MappingQualityZeroBySample extends GenotypeAnnotation {
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker,
|
||||
AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) {
|
||||
if ( g == null || !g.isCalled() )
|
||||
return null;
|
||||
|
||||
int mq0 = 0;
|
||||
ReadBackedPileup pileup = null;
|
||||
if (vc.isIndel() && context.hasExtendedEventPileup())
|
||||
pileup = context.getExtendedEventPileup();
|
||||
else if (context.hasBasePileup())
|
||||
pileup = context.getBasePileup();
|
||||
else return null;
|
||||
|
||||
if (pileup != null) {
|
||||
for (PileupElement p : pileup ) {
|
||||
if ( p.getMappingQual() == 0 )
|
||||
mq0++;
|
||||
}
|
||||
}
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%d", mq0));
|
||||
return map;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); }
|
||||
|
||||
public List<VCFFormatHeaderLine> getDescriptions() { return Arrays.asList(
|
||||
new VCFFormatHeaderLine(getKeyNames().get(0), 1,
|
||||
VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample")); }
|
||||
|
||||
|
||||
}
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Count for each sample of mapping quality zero reads
|
||||
*/
|
||||
public class MappingQualityZeroBySample extends GenotypeAnnotation {
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker,
|
||||
AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) {
|
||||
if ( g == null || !g.isCalled() )
|
||||
return null;
|
||||
|
||||
int mq0 = 0;
|
||||
ReadBackedPileup pileup = null;
|
||||
if (vc.isIndel() && context.hasExtendedEventPileup())
|
||||
pileup = context.getExtendedEventPileup();
|
||||
else if (context.hasBasePileup())
|
||||
pileup = context.getBasePileup();
|
||||
else return null;
|
||||
|
||||
if (pileup != null) {
|
||||
for (PileupElement p : pileup ) {
|
||||
if ( p.getMappingQual() == 0 )
|
||||
mq0++;
|
||||
}
|
||||
}
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%d", mq0));
|
||||
return map;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); }
|
||||
|
||||
public List<VCFFormatHeaderLine> getDescriptions() { return Arrays.asList(
|
||||
new VCFFormatHeaderLine(getKeyNames().get(0), 1,
|
||||
VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample")); }
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,8 +17,9 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Fraction of all reads across samples that have mapping quality zero
|
||||
*/
|
||||
public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -17,11 +17,8 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 5/16/11
|
||||
* The number of N bases, counting only SOLiD data
|
||||
*/
|
||||
|
||||
public class NBaseCount extends InfoFieldAnnotation {
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if( stratifiedContexts.size() == 0 )
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -15,7 +16,11 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Variant confidence (given as (AB+BB)/AA from the PLs) / unfiltered depth.
|
||||
*
|
||||
* Low scores are indicative of false positive calls and artifacts.
|
||||
*/
|
||||
public class QualByDepth extends AnnotationByDepth implements StandardAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -21,6 +21,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Root Mean Square of the mapping quality of the reads across all samples.
|
||||
*/
|
||||
public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -21,7 +21,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Abstract root for all RankSum based annotations
|
||||
*/
|
||||
public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
static final double INDEL_LIKELIHOOD_THRESH = 0.1;
|
||||
static final boolean DEBUG = false;
|
||||
|
|
|
|||
|
|
@ -1,209 +1,207 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Feb 4, 2011
|
||||
* Time: 3:59:27 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation {
|
||||
|
||||
private static String REF_ALLELE = "REF";
|
||||
|
||||
private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref,
|
||||
AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
|
||||
if ( g == null || !g.isCalled() )
|
||||
return null;
|
||||
|
||||
if ( vc.isSNP() )
|
||||
return annotateSNP(stratifiedContext, vc);
|
||||
if ( vc.isIndel() )
|
||||
return annotateIndel(stratifiedContext, vc);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {
|
||||
|
||||
if ( ! stratifiedContext.hasBasePileup() ) return null;
|
||||
|
||||
HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
|
||||
for ( Allele allele : vc.getAlternateAlleles() )
|
||||
alleleCounts.put(allele.getBases()[0], 0);
|
||||
|
||||
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
|
||||
int totalDepth = pileup.size();
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), totalDepth); // put total depth in right away
|
||||
|
||||
if ( totalDepth == 0 ) return map; // done, can not compute FA at 0 coverage!!
|
||||
|
||||
int mq0 = 0; // number of "ref" reads that are acually mq0
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( p.getMappingQual() == 0 ) {
|
||||
mq0++;
|
||||
continue;
|
||||
}
|
||||
if ( alleleCounts.containsKey(p.getBase()) ) // non-mq0 read and it's an alt
|
||||
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
|
||||
}
|
||||
|
||||
if ( mq0 == totalDepth ) return map; // if all reads are mq0, there is nothing left to do
|
||||
|
||||
// we need to add counts in the correct order
|
||||
String[] fracs = new String[alleleCounts.size()];
|
||||
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
|
||||
fracs[i] = String.format("%.3f", ((float)alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))/(totalDepth-mq0));
|
||||
}
|
||||
|
||||
map.put(getKeyNames().get(1), fracs);
|
||||
return map;
|
||||
}
|
||||
|
||||
private Map<String,Object> annotateIndel(AlignmentContext
|
||||
stratifiedContext, VariantContext
|
||||
vc) {
|
||||
|
||||
if ( ! stratifiedContext.hasExtendedEventPileup() ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup();
|
||||
if ( pileup == null )
|
||||
return null;
|
||||
int totalDepth = pileup.size();
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), totalDepth); // put total depth in right away
|
||||
|
||||
if ( totalDepth == 0 ) return map;
|
||||
int mq0 = 0; // number of "ref" reads that are acually mq0
|
||||
|
||||
HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
|
||||
Allele refAllele = vc.getReference();
|
||||
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
|
||||
if ( allele.isNoCall() ) {
|
||||
continue; // this does not look so good, should we die???
|
||||
}
|
||||
|
||||
alleleCounts.put(getAlleleRepresentation(allele), 0);
|
||||
}
|
||||
|
||||
for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) {
|
||||
|
||||
if ( e.getMappingQual() == 0 ) {
|
||||
mq0++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( e.isInsertion() ) {
|
||||
|
||||
final String b = e.getEventBases();
|
||||
if ( alleleCounts.containsKey(b) ) {
|
||||
alleleCounts.put(b, alleleCounts.get(b)+1);
|
||||
}
|
||||
|
||||
} else {
|
||||
if ( e.isDeletion() ) {
|
||||
if ( e.getEventLength() == refAllele.length() ) {
|
||||
// this is indeed the deletion allele recorded in VC
|
||||
final String b = DEL;
|
||||
if ( alleleCounts.containsKey(b) ) {
|
||||
alleleCounts.put(b, alleleCounts.get(b)+1);
|
||||
}
|
||||
}
|
||||
// else {
|
||||
// System.out.print(" deletion of WRONG length found");
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( mq0 == totalDepth ) return map;
|
||||
|
||||
String[] fracs = new String[alleleCounts.size()];
|
||||
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
|
||||
fracs[i] = String.format("%.3f",
|
||||
((float)alleleCounts.get(getAlleleRepresentation(vc.getAlternateAllele(i))))/(totalDepth-mq0));
|
||||
|
||||
map.put(getKeyNames().get(1), fracs);
|
||||
|
||||
//map.put(getKeyNames().get(0), counts);
|
||||
return map;
|
||||
}
|
||||
|
||||
private String getAlleleRepresentation(Allele allele) {
|
||||
if ( allele.isNull() ) { // deletion wrt the ref
|
||||
return DEL;
|
||||
} else { // insertion, pass actual bases
|
||||
return allele.getBaseString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// public String getIndelBases()
|
||||
public List<String> getKeyNames() { return Arrays.asList("DP","FA"); }
|
||||
|
||||
public List<VCFFormatHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0),
|
||||
1,
|
||||
VCFHeaderLineType.Integer,
|
||||
"Total read depth per sample, including MQ0"),
|
||||
new VCFFormatHeaderLine(getKeyNames().get(1),
|
||||
VCFHeaderLineCount.UNBOUNDED,
|
||||
VCFHeaderLineType.Float,
|
||||
"Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample"));
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Unsupported
|
||||
*/
|
||||
@Hidden
|
||||
public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation {
|
||||
|
||||
private static String REF_ALLELE = "REF";
|
||||
|
||||
private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref,
|
||||
AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
|
||||
if ( g == null || !g.isCalled() )
|
||||
return null;
|
||||
|
||||
if ( vc.isSNP() )
|
||||
return annotateSNP(stratifiedContext, vc);
|
||||
if ( vc.isIndel() )
|
||||
return annotateIndel(stratifiedContext, vc);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {
|
||||
|
||||
if ( ! stratifiedContext.hasBasePileup() ) return null;
|
||||
|
||||
HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
|
||||
for ( Allele allele : vc.getAlternateAlleles() )
|
||||
alleleCounts.put(allele.getBases()[0], 0);
|
||||
|
||||
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
|
||||
int totalDepth = pileup.size();
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), totalDepth); // put total depth in right away
|
||||
|
||||
if ( totalDepth == 0 ) return map; // done, can not compute FA at 0 coverage!!
|
||||
|
||||
int mq0 = 0; // number of "ref" reads that are acually mq0
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( p.getMappingQual() == 0 ) {
|
||||
mq0++;
|
||||
continue;
|
||||
}
|
||||
if ( alleleCounts.containsKey(p.getBase()) ) // non-mq0 read and it's an alt
|
||||
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
|
||||
}
|
||||
|
||||
if ( mq0 == totalDepth ) return map; // if all reads are mq0, there is nothing left to do
|
||||
|
||||
// we need to add counts in the correct order
|
||||
String[] fracs = new String[alleleCounts.size()];
|
||||
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
|
||||
fracs[i] = String.format("%.3f", ((float)alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))/(totalDepth-mq0));
|
||||
}
|
||||
|
||||
map.put(getKeyNames().get(1), fracs);
|
||||
return map;
|
||||
}
|
||||
|
||||
private Map<String,Object> annotateIndel(AlignmentContext
|
||||
stratifiedContext, VariantContext
|
||||
vc) {
|
||||
|
||||
if ( ! stratifiedContext.hasExtendedEventPileup() ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup();
|
||||
if ( pileup == null )
|
||||
return null;
|
||||
int totalDepth = pileup.size();
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), totalDepth); // put total depth in right away
|
||||
|
||||
if ( totalDepth == 0 ) return map;
|
||||
int mq0 = 0; // number of "ref" reads that are acually mq0
|
||||
|
||||
HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
|
||||
Allele refAllele = vc.getReference();
|
||||
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
|
||||
if ( allele.isNoCall() ) {
|
||||
continue; // this does not look so good, should we die???
|
||||
}
|
||||
|
||||
alleleCounts.put(getAlleleRepresentation(allele), 0);
|
||||
}
|
||||
|
||||
for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) {
|
||||
|
||||
if ( e.getMappingQual() == 0 ) {
|
||||
mq0++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( e.isInsertion() ) {
|
||||
|
||||
final String b = e.getEventBases();
|
||||
if ( alleleCounts.containsKey(b) ) {
|
||||
alleleCounts.put(b, alleleCounts.get(b)+1);
|
||||
}
|
||||
|
||||
} else {
|
||||
if ( e.isDeletion() ) {
|
||||
if ( e.getEventLength() == refAllele.length() ) {
|
||||
// this is indeed the deletion allele recorded in VC
|
||||
final String b = DEL;
|
||||
if ( alleleCounts.containsKey(b) ) {
|
||||
alleleCounts.put(b, alleleCounts.get(b)+1);
|
||||
}
|
||||
}
|
||||
// else {
|
||||
// System.out.print(" deletion of WRONG length found");
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( mq0 == totalDepth ) return map;
|
||||
|
||||
String[] fracs = new String[alleleCounts.size()];
|
||||
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
|
||||
fracs[i] = String.format("%.3f",
|
||||
((float)alleleCounts.get(getAlleleRepresentation(vc.getAlternateAllele(i))))/(totalDepth-mq0));
|
||||
|
||||
map.put(getKeyNames().get(1), fracs);
|
||||
|
||||
//map.put(getKeyNames().get(0), counts);
|
||||
return map;
|
||||
}
|
||||
|
||||
private String getAlleleRepresentation(Allele allele) {
|
||||
if ( allele.isNull() ) { // deletion wrt the ref
|
||||
return DEL;
|
||||
} else { // insertion, pass actual bases
|
||||
return allele.getBaseString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// public String getIndelBases()
|
||||
public List<String> getKeyNames() { return Arrays.asList("DP","FA"); }
|
||||
|
||||
public List<VCFFormatHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0),
|
||||
1,
|
||||
VCFHeaderLineType.Integer,
|
||||
"Total read depth per sample, including MQ0"),
|
||||
new VCFFormatHeaderLine(getKeyNames().get(1),
|
||||
VCFHeaderLineCount.UNBOUNDED,
|
||||
VCFHeaderLineType.Float,
|
||||
"Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample"));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,11 +19,8 @@ import java.util.LinkedHashMap;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 3/30/11
|
||||
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
|
||||
*/
|
||||
|
||||
public class ReadPosRankSumTest extends RankSumTest {
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("ReadPosRankSum"); }
|
||||
|
|
|
|||
|
|
@ -15,8 +15,9 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* SB annotation value by depth of alt containing samples
|
||||
*/
|
||||
public class SBByDepth extends AnnotationByDepth {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
@ -26,7 +27,7 @@ public class SBByDepth extends AnnotationByDepth {
|
|||
if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY))
|
||||
return null;
|
||||
|
||||
double sBias = Double.valueOf(vc.getAttributeAsString(VCFConstants.STRAND_BIAS_KEY));
|
||||
double sBias = vc.getAttributeAsDouble(VCFConstants.STRAND_BIAS_KEY, -1);
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() == 0 )
|
||||
|
|
|
|||
|
|
@ -41,7 +41,9 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* List all of the samples in the info field
|
||||
*/
|
||||
public class SampleList extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -24,7 +24,9 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -32,10 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -46,134 +45,522 @@ import java.util.*;
|
|||
* (http://snpeff.sourceforge.net/).
|
||||
*
|
||||
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
|
||||
* output file (which must be provided on the command line via --snpEffFile:SnpEff <filename>),
|
||||
* output file (which must be provided on the command line via --snpEffFile filename.vcf),
|
||||
* and adds annotations on that effect.
|
||||
*
|
||||
* The possible biological effects and their associated impacts are defined in the class:
|
||||
* org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
// SnpEff annotation key names:
|
||||
public static final String GENE_ID_KEY = "GENE_ID";
|
||||
public static final String GENE_NAME_KEY = "GENE_NAME";
|
||||
public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID";
|
||||
public static final String EXON_ID_KEY = "EXON_ID";
|
||||
public static final String EXON_RANK_KEY = "EXON_RANK";
|
||||
public static final String WITHIN_NON_CODING_GENE_KEY = "WITHIN_NON_CODING_GENE";
|
||||
public static final String EFFECT_KEY = "EFFECT";
|
||||
public static final String EFFECT_IMPACT_KEY = "EFFECT_IMPACT";
|
||||
public static final String EFFECT_EXTRA_INFORMATION_KEY = "EFFECT_EXTRA_INFORMATION";
|
||||
public static final String OLD_NEW_AA_KEY = "OLD_NEW_AA";
|
||||
public static final String OLD_NEW_CODON_KEY = "OLD_NEW_CODON";
|
||||
public static final String CODON_NUM_KEY = "CODON_NUM";
|
||||
public static final String CDS_SIZE_KEY = "CDS_SIZE";
|
||||
private static Logger logger = Logger.getLogger(SnpEff.class);
|
||||
|
||||
// We refuse to parse SnpEff output files generated by unsupported versions, or
|
||||
// lacking a SnpEff version number in the VCF header:
|
||||
public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
|
||||
public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
|
||||
public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd";
|
||||
|
||||
// When we write the SnpEff version number and command line to the output VCF, we change
|
||||
// the key name slightly so that the output VCF won't be confused in the future for an
|
||||
// output file produced by SnpEff directly:
|
||||
public static final String OUTPUT_VCF_HEADER_VERSION_LINE_KEY = "Original" + SNPEFF_VCF_HEADER_VERSION_LINE_KEY;
|
||||
public static final String OUTPUT_VCF_HEADER_COMMAND_LINE_KEY = "Original" + SNPEFF_VCF_HEADER_COMMAND_LINE_KEY;
|
||||
|
||||
// SnpEff aggregates all effects (and effect metadata) together into a single INFO
|
||||
// field annotation with the key EFF:
|
||||
public static final String SNPEFF_INFO_FIELD_KEY = "EFF";
|
||||
public static final String SNPEFF_EFFECT_METADATA_DELIMITER = "[()]";
|
||||
public static final String SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER = "\\|";
|
||||
|
||||
// Key names for the INFO field annotations we will add to each record, along
|
||||
// with parsing-related information:
|
||||
public enum InfoFieldKey {
|
||||
EFFECT_KEY ("SNPEFF_EFFECT", -1),
|
||||
IMPACT_KEY ("SNPEFF_IMPACT", 0),
|
||||
CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
|
||||
AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
|
||||
GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
|
||||
GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
|
||||
TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
|
||||
EXON_ID_KEY ("SNPEFF_EXON_ID", 7),
|
||||
FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1);
|
||||
|
||||
// Actual text of the key
|
||||
private final String keyName;
|
||||
|
||||
// Index within the effect metadata subfields from the SnpEff EFF annotation
|
||||
// where each key's associated value can be found during parsing.
|
||||
private final int fieldIndex;
|
||||
|
||||
InfoFieldKey ( String keyName, int fieldIndex ) {
|
||||
this.keyName = keyName;
|
||||
this.fieldIndex = fieldIndex;
|
||||
}
|
||||
|
||||
public String getKeyName() {
|
||||
return keyName;
|
||||
}
|
||||
|
||||
public int getFieldIndex() {
|
||||
return fieldIndex;
|
||||
}
|
||||
}
|
||||
|
||||
// Possible SnpEff biological effects. All effect names found in the SnpEff input file
|
||||
// are validated against this list.
|
||||
public enum EffectType {
|
||||
// High-impact effects:
|
||||
FRAME_SHIFT (EffectFunctionalClass.NONE, false),
|
||||
STOP_GAINED (EffectFunctionalClass.NONSENSE, false),
|
||||
START_LOST (EffectFunctionalClass.NONE, false),
|
||||
SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false),
|
||||
SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false),
|
||||
EXON_DELETED (EffectFunctionalClass.NONE, false),
|
||||
STOP_LOST (EffectFunctionalClass.NONE, false),
|
||||
|
||||
// Moderate-impact effects:
|
||||
NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false),
|
||||
CODON_CHANGE (EffectFunctionalClass.NONE, false),
|
||||
CODON_INSERTION (EffectFunctionalClass.NONE, false),
|
||||
CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false),
|
||||
CODON_DELETION (EffectFunctionalClass.NONE, false),
|
||||
CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false),
|
||||
UTR_5_DELETED (EffectFunctionalClass.NONE, false),
|
||||
UTR_3_DELETED (EffectFunctionalClass.NONE, false),
|
||||
|
||||
// Low-impact effects:
|
||||
SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false),
|
||||
SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
|
||||
NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
|
||||
SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
|
||||
NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
|
||||
START_GAINED (EffectFunctionalClass.NONE, false),
|
||||
|
||||
// Modifiers:
|
||||
NONE (EffectFunctionalClass.NONE, true),
|
||||
CHROMOSOME (EffectFunctionalClass.NONE, true),
|
||||
INTERGENIC (EffectFunctionalClass.NONE, true),
|
||||
UPSTREAM (EffectFunctionalClass.NONE, true),
|
||||
UTR_5_PRIME (EffectFunctionalClass.NONE, true),
|
||||
CDS (EffectFunctionalClass.NONE, true),
|
||||
GENE (EffectFunctionalClass.NONE, true),
|
||||
TRANSCRIPT (EffectFunctionalClass.NONE, true),
|
||||
EXON (EffectFunctionalClass.NONE, true),
|
||||
INTRON (EffectFunctionalClass.NONE, true),
|
||||
UTR_3_PRIME (EffectFunctionalClass.NONE, true),
|
||||
DOWNSTREAM (EffectFunctionalClass.NONE, true),
|
||||
INTRON_CONSERVED (EffectFunctionalClass.NONE, true),
|
||||
INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true),
|
||||
REGULATION (EffectFunctionalClass.NONE, true),
|
||||
CUSTOM (EffectFunctionalClass.NONE, true),
|
||||
WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true);
|
||||
|
||||
private final EffectFunctionalClass functionalClass;
|
||||
private final boolean isModifier;
|
||||
|
||||
EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) {
|
||||
this.functionalClass = functionalClass;
|
||||
this.isModifier = isModifier;
|
||||
}
|
||||
|
||||
public EffectFunctionalClass getFunctionalClass() {
|
||||
return functionalClass;
|
||||
}
|
||||
|
||||
public boolean isModifier() {
|
||||
return isModifier;
|
||||
}
|
||||
}
|
||||
|
||||
// SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of
|
||||
// classifying some of the LOW impact effects as MODIFIERs.
|
||||
public enum EffectImpact {
|
||||
MODIFIER (0),
|
||||
LOW (1),
|
||||
MODERATE (2),
|
||||
HIGH (3);
|
||||
|
||||
private final int severityRating;
|
||||
|
||||
EffectImpact ( int severityRating ) {
|
||||
this.severityRating = severityRating;
|
||||
}
|
||||
|
||||
public boolean isHigherImpactThan ( EffectImpact other ) {
|
||||
return this.severityRating > other.severityRating;
|
||||
}
|
||||
|
||||
public boolean isSameImpactAs ( EffectImpact other ) {
|
||||
return this.severityRating == other.severityRating;
|
||||
}
|
||||
}
|
||||
|
||||
// SnpEff labels most effects as either CODING or NON_CODING, but sometimes omits this information.
|
||||
public enum EffectCoding {
|
||||
CODING,
|
||||
NON_CODING,
|
||||
UNKNOWN
|
||||
}
|
||||
|
||||
// We assign a functional class to each SnpEff effect.
|
||||
public enum EffectFunctionalClass {
|
||||
NONE (0),
|
||||
SILENT (1),
|
||||
MISSENSE (2),
|
||||
NONSENSE (3);
|
||||
|
||||
private final int priority;
|
||||
|
||||
EffectFunctionalClass ( int priority ) {
|
||||
this.priority = priority;
|
||||
}
|
||||
|
||||
public boolean isHigherPriorityThan ( EffectFunctionalClass other ) {
|
||||
return this.priority > other.priority;
|
||||
}
|
||||
}
|
||||
|
||||
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
|
||||
// Make sure that we actually have a valid SnpEff rod binding (just in case the user specified -A SnpEff
|
||||
// without providing a SnpEff rod via --snpEffFile):
|
||||
validateRodBinding(walker.getSnpEffRodBinding());
|
||||
RodBinding<VariantContext> snpEffRodBinding = walker.getSnpEffRodBinding();
|
||||
|
||||
// Make sure that the SnpEff version number and command-line header lines are present in the VCF header of
|
||||
// the SnpEff rod, and that the file was generated by a supported version of SnpEff:
|
||||
VCFHeader snpEffVCFHeader = VCFUtils.getVCFHeadersFromRods(toolkit, Arrays.asList(snpEffRodBinding.getName())).get(snpEffRodBinding.getName());
|
||||
VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY);
|
||||
VCFHeaderLine snpEffCommandLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_COMMAND_LINE_KEY);
|
||||
|
||||
checkSnpEffVersion(snpEffVersionLine);
|
||||
checkSnpEffCommandLine(snpEffCommandLine);
|
||||
|
||||
// If everything looks ok, add the SnpEff version number and command-line header lines to the
|
||||
// header of the VCF output file, changing the key names so that our output file won't be
|
||||
// mistaken in the future for a SnpEff output file:
|
||||
headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_VERSION_LINE_KEY, snpEffVersionLine.getValue()));
|
||||
headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_COMMAND_LINE_KEY, snpEffCommandLine.getValue()));
|
||||
}
|
||||
|
||||
public Map<String, Object> annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) {
|
||||
RodBinding<SnpEffFeature> snpEffRodBinding = walker.getSnpEffRodBinding();
|
||||
validateRodBinding(snpEffRodBinding);
|
||||
RodBinding<VariantContext> snpEffRodBinding = walker.getSnpEffRodBinding();
|
||||
|
||||
List<SnpEffFeature> features = tracker.getValues(snpEffRodBinding, ref.getLocus());
|
||||
// Get only SnpEff records that start at this locus, not merely span it:
|
||||
List<VariantContext> snpEffRecords = tracker.getValues(snpEffRodBinding, ref.getLocus());
|
||||
|
||||
// Add only annotations for one of the most biologically-significant effects as defined in
|
||||
// the SnpEffConstants class:
|
||||
SnpEffFeature mostSignificantEffect = getMostSignificantEffect(features);
|
||||
|
||||
if ( mostSignificantEffect == null ) {
|
||||
// Within this set, look for a SnpEff record whose ref/alt alleles match the record to annotate.
|
||||
// If there is more than one such record, we only need to pick the first one, since the biological
|
||||
// effects will be the same across all such records:
|
||||
VariantContext matchingRecord = getMatchingSnpEffRecord(snpEffRecords, vc);
|
||||
if ( matchingRecord == null ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return generateAnnotations(mostSignificantEffect);
|
||||
// Parse the SnpEff INFO field annotation from the matching record into individual effect objects:
|
||||
List<SnpEffEffect> effects = parseSnpEffRecord(matchingRecord);
|
||||
if ( effects.size() == 0 ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Add only annotations for one of the most biologically-significant effects from this set:
|
||||
SnpEffEffect mostSignificantEffect = getMostSignificantEffect(effects);
|
||||
return mostSignificantEffect.getAnnotations();
|
||||
}
|
||||
|
||||
private void validateRodBinding ( RodBinding<SnpEffFeature> snpEffRodBinding ) {
|
||||
private void validateRodBinding ( RodBinding<VariantContext> snpEffRodBinding ) {
|
||||
if ( snpEffRodBinding == null || ! snpEffRodBinding.isBound() ) {
|
||||
throw new UserException("The SnpEff annotator requires that a SnpEff output file be provided " +
|
||||
"as a rodbinding on the command line, but no SnpEff rodbinding was found.");
|
||||
throw new UserException("The SnpEff annotator requires that a SnpEff VCF output file be provided " +
|
||||
"as a rodbinding on the command line via the --snpEffFile option, but " +
|
||||
"no SnpEff rodbinding was found.");
|
||||
}
|
||||
}
|
||||
|
||||
private SnpEffFeature getMostSignificantEffect ( List<SnpEffFeature> snpEffFeatures ) {
|
||||
SnpEffFeature mostSignificantEffect = null;
|
||||
private void checkSnpEffVersion ( VCFHeaderLine snpEffVersionLine ) {
|
||||
if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) {
|
||||
throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " +
|
||||
"input file, and so could not verify that the file was generated by a supported version of SnpEff (" +
|
||||
Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")");
|
||||
}
|
||||
|
||||
for ( SnpEffFeature snpEffFeature : snpEffFeatures ) {
|
||||
String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0];
|
||||
|
||||
if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) {
|
||||
throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " +
|
||||
"is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS));
|
||||
}
|
||||
}
|
||||
|
||||
private void checkSnpEffCommandLine ( VCFHeaderLine snpEffCommandLine ) {
|
||||
if ( snpEffCommandLine == null || snpEffCommandLine.getValue() == null || snpEffCommandLine.getValue().trim().length() == 0 ) {
|
||||
throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_COMMAND_LINE_KEY + " entry in the VCF header for the SnpEff " +
|
||||
"input file, which should be added by all supported versions of SnpEff (" +
|
||||
Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isSupportedSnpEffVersion ( String versionString ) {
|
||||
for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) {
|
||||
if ( supportedVersion.equals(versionString) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private VariantContext getMatchingSnpEffRecord ( List<VariantContext> snpEffRecords, VariantContext vc ) {
|
||||
for ( VariantContext snpEffRecord : snpEffRecords ) {
|
||||
if ( snpEffRecord.hasSameAlternateAllelesAs(vc) && snpEffRecord.getReference().equals(vc.getReference()) ) {
|
||||
return snpEffRecord;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<SnpEffEffect> parseSnpEffRecord ( VariantContext snpEffRecord ) {
|
||||
List<SnpEffEffect> parsedEffects = new ArrayList<SnpEffEffect>();
|
||||
|
||||
Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY);
|
||||
if ( effectFieldValue == null ) {
|
||||
return parsedEffects;
|
||||
}
|
||||
|
||||
// The VCF codec stores multi-valued fields as a List<String>, and single-valued fields as a String.
|
||||
// We can have either in the case of SnpEff, since there may be one or more than one effect in this record.
|
||||
List<String> individualEffects;
|
||||
if ( effectFieldValue instanceof List ) {
|
||||
individualEffects = (List<String>)effectFieldValue;
|
||||
}
|
||||
else {
|
||||
individualEffects = Arrays.asList((String)effectFieldValue);
|
||||
}
|
||||
|
||||
for ( String effectString : individualEffects ) {
|
||||
String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER);
|
||||
|
||||
if ( effectNameAndMetadata.length != 2 ) {
|
||||
logger.warn(String.format("Malformed SnpEff effect field at %s:%d, skipping: %s",
|
||||
snpEffRecord.getChr(), snpEffRecord.getStart(), effectString));
|
||||
continue;
|
||||
}
|
||||
|
||||
String effectName = effectNameAndMetadata[0];
|
||||
String[] effectMetadata = effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1);
|
||||
|
||||
SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata);
|
||||
|
||||
if ( parsedEffect.isWellFormed() ) {
|
||||
parsedEffects.add(parsedEffect);
|
||||
}
|
||||
else {
|
||||
logger.warn(String.format("Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"",
|
||||
snpEffRecord.getChr(), snpEffRecord.getStart(), parsedEffect.getParseError(), effectString));
|
||||
}
|
||||
}
|
||||
|
||||
return parsedEffects;
|
||||
}
|
||||
|
||||
private SnpEffEffect getMostSignificantEffect ( List<SnpEffEffect> effects ) {
|
||||
SnpEffEffect mostSignificantEffect = null;
|
||||
|
||||
for ( SnpEffEffect effect : effects ) {
|
||||
if ( mostSignificantEffect == null ||
|
||||
snpEffFeature.isHigherImpactThan(mostSignificantEffect) ) {
|
||||
effect.isHigherImpactThan(mostSignificantEffect) ) {
|
||||
|
||||
mostSignificantEffect = snpEffFeature;
|
||||
mostSignificantEffect = effect;
|
||||
}
|
||||
}
|
||||
|
||||
return mostSignificantEffect;
|
||||
}
|
||||
|
||||
private Map<String, Object> generateAnnotations ( SnpEffFeature mostSignificantEffect ) {
|
||||
Map<String, Object> annotations = new LinkedHashMap<String, Object>(Utils.optimumHashSize(getKeyNames().size()));
|
||||
|
||||
if ( mostSignificantEffect.hasGeneID() )
|
||||
annotations.put(GENE_ID_KEY, mostSignificantEffect.getGeneID());
|
||||
if ( mostSignificantEffect.hasGeneName() )
|
||||
annotations.put(GENE_NAME_KEY, mostSignificantEffect.getGeneName());
|
||||
if ( mostSignificantEffect.hasTranscriptID() )
|
||||
annotations.put(TRANSCRIPT_ID_KEY, mostSignificantEffect.getTranscriptID());
|
||||
if ( mostSignificantEffect.hasExonID() )
|
||||
annotations.put(EXON_ID_KEY, mostSignificantEffect.getExonID());
|
||||
if ( mostSignificantEffect.hasExonRank() )
|
||||
annotations.put(EXON_RANK_KEY, Integer.toString(mostSignificantEffect.getExonRank()));
|
||||
if ( mostSignificantEffect.isNonCodingGene() )
|
||||
annotations.put(WITHIN_NON_CODING_GENE_KEY, null);
|
||||
|
||||
annotations.put(EFFECT_KEY, mostSignificantEffect.getEffect().toString());
|
||||
annotations.put(EFFECT_IMPACT_KEY, mostSignificantEffect.getEffectImpact().toString());
|
||||
if ( mostSignificantEffect.hasEffectExtraInformation() )
|
||||
annotations.put(EFFECT_EXTRA_INFORMATION_KEY, mostSignificantEffect.getEffectExtraInformation());
|
||||
|
||||
if ( mostSignificantEffect.hasOldAndNewAA() )
|
||||
annotations.put(OLD_NEW_AA_KEY, mostSignificantEffect.getOldAndNewAA());
|
||||
if ( mostSignificantEffect.hasOldAndNewCodon() )
|
||||
annotations.put(OLD_NEW_CODON_KEY, mostSignificantEffect.getOldAndNewCodon());
|
||||
if ( mostSignificantEffect.hasCodonNum() )
|
||||
annotations.put(CODON_NUM_KEY, Integer.toString(mostSignificantEffect.getCodonNum()));
|
||||
if ( mostSignificantEffect.hasCdsSize() )
|
||||
annotations.put(CDS_SIZE_KEY, Integer.toString(mostSignificantEffect.getCdsSize()));
|
||||
|
||||
return annotations;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
return Arrays.asList( GENE_ID_KEY,
|
||||
GENE_NAME_KEY,
|
||||
TRANSCRIPT_ID_KEY,
|
||||
EXON_ID_KEY,
|
||||
EXON_RANK_KEY,
|
||||
WITHIN_NON_CODING_GENE_KEY,
|
||||
EFFECT_KEY,
|
||||
EFFECT_IMPACT_KEY,
|
||||
EFFECT_EXTRA_INFORMATION_KEY,
|
||||
OLD_NEW_AA_KEY,
|
||||
OLD_NEW_CODON_KEY,
|
||||
CODON_NUM_KEY,
|
||||
CDS_SIZE_KEY
|
||||
return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
|
||||
InfoFieldKey.IMPACT_KEY.getKeyName(),
|
||||
InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_NAME_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
|
||||
InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
|
||||
InfoFieldKey.EXON_ID_KEY.getKeyName(),
|
||||
InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()
|
||||
);
|
||||
}
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(
|
||||
new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"),
|
||||
new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
|
||||
new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
|
||||
new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant")
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values()))
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper class to parse, validate, and store a single SnpEff effect and its metadata.
|
||||
*/
|
||||
protected static class SnpEffEffect {
|
||||
private EffectType effect;
|
||||
private EffectImpact impact;
|
||||
private String codonChange;
|
||||
private String aminoAcidChange;
|
||||
private String geneName;
|
||||
private String geneBiotype;
|
||||
private EffectCoding coding;
|
||||
private String transcriptID;
|
||||
private String exonID;
|
||||
|
||||
private String parseError = null;
|
||||
private boolean isWellFormed = true;
|
||||
|
||||
private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;
|
||||
|
||||
// Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
|
||||
// errors come after warnings, not vice versa:
|
||||
private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
|
||||
private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;
|
||||
|
||||
private static final int SNPEFF_CODING_FIELD_INDEX = 5;
|
||||
|
||||
public SnpEffEffect ( String effectName, String[] effectMetadata ) {
|
||||
parseEffectName(effectName);
|
||||
parseEffectMetadata(effectMetadata);
|
||||
}
|
||||
|
||||
private void parseEffectName ( String effectName ) {
|
||||
try {
|
||||
effect = EffectType.valueOf(effectName);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("%s is not a recognized effect type", effectName));
|
||||
}
|
||||
}
|
||||
|
||||
private void parseEffectMetadata ( String[] effectMetadata ) {
|
||||
if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
|
||||
if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
|
||||
parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
|
||||
}
|
||||
else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
|
||||
parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
|
||||
}
|
||||
else {
|
||||
parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
|
||||
EXPECTED_NUMBER_OF_METADATA_FIELDS, effectMetadata.length));
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if ( effect != null && effect.isModifier() ) {
|
||||
impact = EffectImpact.MODIFIER;
|
||||
}
|
||||
else {
|
||||
try {
|
||||
impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
|
||||
}
|
||||
}
|
||||
|
||||
codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
|
||||
aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
|
||||
geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
|
||||
geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
|
||||
|
||||
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
|
||||
try {
|
||||
coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("Unrecognized value for effect coding: %s", effectMetadata[SNPEFF_CODING_FIELD_INDEX]));
|
||||
}
|
||||
}
|
||||
else {
|
||||
coding = EffectCoding.UNKNOWN;
|
||||
}
|
||||
|
||||
transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()];
|
||||
exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()];
|
||||
}
|
||||
|
||||
private void parseError ( String message ) {
|
||||
isWellFormed = false;
|
||||
|
||||
// Cache only the first error encountered:
|
||||
if ( parseError == null ) {
|
||||
parseError = message;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isWellFormed() {
|
||||
return isWellFormed;
|
||||
}
|
||||
|
||||
public String getParseError() {
|
||||
return parseError == null ? "" : parseError;
|
||||
}
|
||||
|
||||
public boolean isCoding() {
|
||||
return coding == EffectCoding.CODING;
|
||||
}
|
||||
|
||||
public boolean isHigherImpactThan ( SnpEffEffect other ) {
|
||||
// If one effect is within a coding gene and the other is not, the effect that is
|
||||
// within the coding gene has higher impact:
|
||||
|
||||
if ( isCoding() && ! other.isCoding() ) {
|
||||
return true;
|
||||
}
|
||||
else if ( ! isCoding() && other.isCoding() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Otherwise, both effects are either in or not in a coding gene, so we compare the impacts
|
||||
// of the effects themselves. Effects with the same impact are tie-broken using the
|
||||
// functional class of the effect:
|
||||
|
||||
if ( impact.isHigherImpactThan(other.impact) ) {
|
||||
return true;
|
||||
}
|
||||
else if ( impact.isSameImpactAs(other.impact) ) {
|
||||
return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass());
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public Map<String, Object> getAnnotations() {
|
||||
Map<String, Object> annotations = new LinkedHashMap<String, Object>(Utils.optimumHashSize(InfoFieldKey.values().length));
|
||||
|
||||
addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
|
||||
addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
|
||||
addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
|
||||
addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
|
||||
addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString());
|
||||
|
||||
return annotations;
|
||||
}
|
||||
|
||||
private void addAnnotation ( Map<String, Object> annotations, String keyName, String keyValue ) {
|
||||
// Only add annotations for keys associated with non-empty values:
|
||||
if ( keyValue != null && keyValue.trim().length() > 0 ) {
|
||||
annotations.put(keyName, keyValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,6 +17,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Fraction of reads containing spanning deletions at this site.
|
||||
*/
|
||||
public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -19,12 +20,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 6/29/11
|
||||
* Time: 3:14 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* Counts of bases from SLX, 454, and SOLiD at this site
|
||||
*/
|
||||
@Hidden
|
||||
public class TechnologyComposition extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
private String nSLX = "NumSLX";
|
||||
private String n454 ="Num454";
|
||||
|
|
|
|||
|
|
@ -40,7 +40,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
|
@ -86,14 +85,15 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
public RodBinding<VariantContext> getVariantRodBinding() { return variantCollection.variants; }
|
||||
|
||||
/**
|
||||
* The INFO field will be annotated with information on the most biologically-significant effect
|
||||
* listed in the SnpEff output file for each variant.
|
||||
*/
|
||||
@Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false)
|
||||
public RodBinding<SnpEffFeature> snpEffFile;
|
||||
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return snpEffFile; }
|
||||
public RodBinding<VariantContext> snpEffFile;
|
||||
public RodBinding<VariantContext> getSnpEffRodBinding() { return snpEffFile; }
|
||||
|
||||
/**
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
|
|
@ -162,6 +162,12 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
|
||||
protected boolean indelsOnly = false;
|
||||
|
||||
@Argument(fullName="family_string",shortName="family",required=false,doc="A family string of the form mom+dad=child for use with the mendelian violation ratio annotation")
|
||||
public String familyStr = null;
|
||||
|
||||
@Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio")
|
||||
public double minGenotypeQualityP = 0.0;
|
||||
|
||||
private VariantAnnotatorEngine engine;
|
||||
|
||||
private Collection<VariantContext> indelBufferContext;
|
||||
|
|
@ -203,9 +209,9 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
}
|
||||
|
||||
if ( USE_ALL_ANNOTATIONS )
|
||||
engine = new VariantAnnotatorEngine(this);
|
||||
engine = new VariantAnnotatorEngine(this, getToolkit());
|
||||
else
|
||||
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
|
||||
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit());
|
||||
engine.initializeExpressions(expressionsToUse);
|
||||
|
||||
// setup the header fields
|
||||
|
|
@ -217,6 +223,8 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
hInfo.add(line);
|
||||
}
|
||||
|
||||
engine.invokeAnnotationInitializationMethods(hInfo);
|
||||
|
||||
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
|
||||
|
|
|
|||
|
|
@ -26,13 +26,11 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
|
|
@ -49,6 +47,7 @@ public class VariantAnnotatorEngine {
|
|||
|
||||
private HashMap<RodBinding<VariantContext>, String> dbAnnotations = new HashMap<RodBinding<VariantContext>, String>();
|
||||
private AnnotatorCompatibleWalker walker;
|
||||
private GenomeAnalysisEngine toolkit;
|
||||
|
||||
private static class VAExpression {
|
||||
|
||||
|
|
@ -74,16 +73,18 @@ public class VariantAnnotatorEngine {
|
|||
}
|
||||
|
||||
// use this constructor if you want all possible annotations
|
||||
public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker) {
|
||||
public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
|
||||
this.walker = walker;
|
||||
this.toolkit = toolkit;
|
||||
requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations();
|
||||
requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations();
|
||||
initializeDBs();
|
||||
}
|
||||
|
||||
// use this constructor if you want to select specific annotations (and/or interfaces)
|
||||
public VariantAnnotatorEngine(List<String> annotationGroupsToUse, List<String> annotationsToUse, AnnotatorCompatibleWalker walker) {
|
||||
public VariantAnnotatorEngine(List<String> annotationGroupsToUse, List<String> annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
|
||||
this.walker = walker;
|
||||
this.toolkit = toolkit;
|
||||
initializeAnnotations(annotationGroupsToUse, annotationsToUse);
|
||||
initializeDBs();
|
||||
}
|
||||
|
|
@ -113,6 +114,16 @@ public class VariantAnnotatorEngine {
|
|||
dbAnnotations.put(rod, rod.getName());
|
||||
}
|
||||
|
||||
public void invokeAnnotationInitializationMethods( Set<VCFHeaderLine> headerLines ) {
|
||||
for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
|
||||
annotation.initialize(walker, toolkit, headerLines);
|
||||
}
|
||||
|
||||
for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
|
||||
annotation.initialize(walker, toolkit, headerLines);
|
||||
}
|
||||
}
|
||||
|
||||
public Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
|
||||
|
||||
Set<VCFHeaderLine> descriptions = new HashSet<VCFHeaderLine>();
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
|
||||
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.List;
|
||||
|
|
@ -9,8 +8,9 @@ import java.util.List;
|
|||
public interface AnnotatorCompatibleWalker {
|
||||
|
||||
// getter methods for various used bindings
|
||||
public abstract RodBinding<SnpEffFeature> getSnpEffRodBinding();
|
||||
public abstract RodBinding<VariantContext> getVariantRodBinding();
|
||||
public abstract RodBinding<VariantContext> getSnpEffRodBinding();
|
||||
public abstract RodBinding<VariantContext> getDbsnpRodBinding();
|
||||
public abstract List<RodBinding<VariantContext>> getCompRodBindings();
|
||||
public abstract List<RodBinding<VariantContext>> getResourceRodBindings();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,18 +24,18 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations")
|
||||
public abstract class VariantAnnotatorAnnotation {
|
||||
// return the INFO keys
|
||||
public abstract List<String> getKeyNames();
|
||||
|
||||
// initialization method (optional for subclasses, and therefore non-abstract)
|
||||
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) { }
|
||||
}
|
||||
|
|
@ -175,21 +175,16 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
BeagleFeature beagleR2Feature = tracker.getFirstValue(beagleR2);
|
||||
// ignore places where we don't have a variant
|
||||
if ( beagleR2Feature == null )
|
||||
return 0;
|
||||
|
||||
|
||||
BeagleFeature beagleProbsFeature = tracker.getFirstValue(beagleProbs);
|
||||
|
||||
// ignore places where we don't have a variant
|
||||
if ( beagleProbsFeature == null )
|
||||
return 0;
|
||||
|
||||
BeagleFeature beaglePhasedFeature = tracker.getFirstValue(beaglePhased);
|
||||
|
||||
// ignore places where we don't have a variant
|
||||
if ( beaglePhasedFeature == null )
|
||||
return 0;
|
||||
if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null)
|
||||
{
|
||||
vcfWriter.add(vc_input);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
// get reference base for current position
|
||||
byte refByte = ref.getBase();
|
||||
|
|
|
|||
|
|
@ -63,20 +63,32 @@ import java.util.*;
|
|||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more bam files (with proper headers) to be analyzed for coverage statistics
|
||||
* (Optional) A REFSEQ Rod to aggregate coverage to the gene level
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
*(Optional) A REFSEQ Rod to aggregate coverage to the gene level
|
||||
* <p>
|
||||
* (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation)
|
||||
*</p></p>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
|
||||
* </p><p>
|
||||
* - no suffix: per locus coverage
|
||||
* </p><p>
|
||||
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
|
||||
* </p><p>
|
||||
* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
|
||||
* </p><p>
|
||||
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
|
||||
* </p><p>
|
||||
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
|
||||
* </p><p>
|
||||
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
|
||||
* </p><p>
|
||||
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
|
||||
* </p><p>
|
||||
* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
|
||||
* </p><p>
|
||||
* - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases
|
||||
* </p>
|
||||
*
|
||||
|
|
@ -84,7 +96,7 @@ import java.util.*;
|
|||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T VariantEval \
|
||||
* -T DepthOfCoverage \
|
||||
* -o file_name_base \
|
||||
* -I input_bams.list
|
||||
* [-geneList refSeq.sorted.txt] \
|
||||
|
|
|
|||
|
|
@ -43,8 +43,10 @@ import java.util.List;
|
|||
* Generates an alternative reference sequence over the specified interval.
|
||||
*
|
||||
* <p>
|
||||
* Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
|
||||
* Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
|
||||
* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
|
||||
* Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
|
||||
* Note that if there are multiple variants at a site, it takes the first one seen.
|
||||
* Reference bases for each interval will be output as a separate fasta sequence (named numerically in order).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
|
|
|
|||
|
|
@ -42,6 +42,9 @@ import java.io.PrintStream;
|
|||
*
|
||||
* <p>
|
||||
* The output format can be partially controlled using the provided command-line arguments.
|
||||
* Specify intervals with the usual -L argument to output only the reference bases within your intervals.
|
||||
* Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
|
||||
* separate fasta sequence (named numerically in order).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.genotype;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
|
|
@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.BaseUtils;
|
|||
* Time: 6:46:09 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public enum DiploidGenotype {
|
||||
enum DiploidGenotype {
|
||||
AA ('A', 'A'),
|
||||
AC ('A', 'C'),
|
||||
AG ('A', 'G'),
|
||||
|
|
@ -2,7 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ import org.broadinstitute.sting.utils.BaseUtils;
|
|||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.pileup.FragmentPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -276,8 +275,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
if ( elt.isReducedRead() ) {
|
||||
// reduced read representation
|
||||
byte qual = elt.getReducedQual();
|
||||
add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
|
||||
return elt.getReducedCount(); // we added nObs bases here
|
||||
if ( BaseUtils.isRegularBase( elt.getBase() )) {
|
||||
add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
|
||||
return elt.getReducedCount(); // we added nObs bases here
|
||||
} else // odd bases or deletions => don't use them
|
||||
return 0;
|
||||
} else {
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
|
|
|
|||
|
|
@ -48,27 +48,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// code for testing purposes
|
||||
//
|
||||
private final static boolean DEBUG = false;
|
||||
private final static boolean PRINT_LIKELIHOODS = false;
|
||||
private final static int N_CYCLES = 1;
|
||||
private SimpleTimer timerExpt = new SimpleTimer("linearExactBanded");
|
||||
private SimpleTimer timerGS = new SimpleTimer("linearExactGS");
|
||||
private final static boolean COMPARE_TO_GS = false;
|
||||
|
||||
public enum ExactCalculation {
|
||||
N2_GOLD_STANDARD,
|
||||
LINEAR_EXPERIMENTAL
|
||||
}
|
||||
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
|
||||
|
||||
private boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
|
||||
|
||||
|
||||
final private ExactCalculation calcToUse;
|
||||
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
calcToUse = UAC.EXACT_CALCULATION_TYPE;
|
||||
}
|
||||
|
||||
public void getLog10PNonRef(RefMetaDataTracker tracker,
|
||||
|
|
@ -76,43 +61,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
Map<String, Genotype> GLs, Set<Allele>alleles,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
double[] gsPosteriors;
|
||||
if ( COMPARE_TO_GS ) // due to annoying special values in incoming array, we have to clone up here
|
||||
gsPosteriors = log10AlleleFrequencyPosteriors.clone();
|
||||
|
||||
int idxAA = GenotypeType.AA.ordinal();
|
||||
int idxAB = GenotypeType.AB.ordinal();
|
||||
int idxBB = GenotypeType.BB.ordinal();
|
||||
|
||||
// todo -- remove me after testing
|
||||
if ( N_CYCLES > 1 ) {
|
||||
for ( int i = 0; i < N_CYCLES; i++) {
|
||||
timerGS.restart();
|
||||
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone(), idxAA, idxAB, idxBB);
|
||||
timerGS.stop();
|
||||
|
||||
timerExpt.restart();
|
||||
linearExactBanded(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone());
|
||||
timerExpt.stop();
|
||||
}
|
||||
|
||||
System.out.printf("good = %.2f, expt = %.2f, delta = %.2f%n",
|
||||
timerGS.getElapsedTime(), timerExpt.getElapsedTime(), timerExpt.getElapsedTime()-timerGS.getElapsedTime());
|
||||
}
|
||||
|
||||
int lastK = -1;
|
||||
|
||||
int numAlleles = alleles.size();
|
||||
final int numAlleles = alleles.size();
|
||||
final double[][] posteriorCache = numAlleles > 2 ? new double[numAlleles-1][] : null;
|
||||
final double[] bestAFguess = numAlleles > 2 ? new double[numAlleles-1] : null;
|
||||
|
||||
int idxDiag = numAlleles;
|
||||
int incr = numAlleles - 1;
|
||||
|
||||
double[][] posteriorCache = new double[numAlleles-1][];
|
||||
double[] bestAFguess = new double[numAlleles-1];
|
||||
|
||||
for (int k=1; k < numAlleles; k++) {
|
||||
// multi-allelic approximation, part 1: Ideally
|
||||
// for each alt allele compute marginal (suboptimal) posteriors -
|
||||
|
|
@ -121,24 +75,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
|
||||
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
|
||||
|
||||
idxAA = 0;
|
||||
idxAB = k;
|
||||
final int idxAA = 0;
|
||||
final int idxAB = k;
|
||||
// yy is always element on the diagonal.
|
||||
// 2 alleles: BBelement 2
|
||||
// 3 alleles: BB element 3. CC element 5
|
||||
// 4 alleles:
|
||||
idxBB = idxDiag;
|
||||
final int idxBB = idxDiag;
|
||||
idxDiag += incr--;
|
||||
|
||||
// todo - possible cleanup
|
||||
switch ( calcToUse ) {
|
||||
case N2_GOLD_STANDARD:
|
||||
lastK = gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
|
||||
break;
|
||||
case LINEAR_EXPERIMENTAL:
|
||||
lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
|
||||
break;
|
||||
}
|
||||
final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
|
||||
|
||||
if (numAlleles > 2) {
|
||||
posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone();
|
||||
bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors);
|
||||
|
|
@ -153,47 +100,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]);
|
||||
|
||||
}
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
if ( COMPARE_TO_GS ) {
|
||||
gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors, idxAA, idxAB, idxBB);
|
||||
|
||||
double log10thisPVar = Math.log10(MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors)[0]);
|
||||
double log10gsPVar = Math.log10(MathUtils.normalizeFromLog10(gsPosteriors)[0]);
|
||||
boolean eq = (log10thisPVar == Double.NEGATIVE_INFINITY && log10gsPVar == Double.NEGATIVE_INFINITY) || MathUtils.compareDoubles(log10thisPVar, log10gsPVar, 1e-4) == 0;
|
||||
|
||||
if ( ! eq || PRINT_LIKELIHOODS ) {
|
||||
System.out.printf("----------------------------------------%n");
|
||||
for (int k=0; k < log10AlleleFrequencyPosteriors.length; k++) {
|
||||
double x = log10AlleleFrequencyPosteriors[k];
|
||||
System.out.printf(" %d\t%.2f\t%.2f\t%b%n", k,
|
||||
x < -1e10 ? Double.NEGATIVE_INFINITY : x, gsPosteriors[k],
|
||||
log10AlleleFrequencyPosteriors[k] == gsPosteriors[k]);
|
||||
}
|
||||
System.out.printf("MAD_AC\t%d\t%d\t%.2f\t%.2f\t%.6f%n",
|
||||
ref.getLocus().getStart(), lastK, log10thisPVar, log10gsPVar, log10thisPVar - log10gsPVar);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static final double[][] getGLs(Map<String, Genotype> GLs) {
|
||||
double[][] genotypeLikelihoods = new double[GLs.size()+1][];
|
||||
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
|
||||
|
||||
int j = 0;
|
||||
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
||||
for ( Genotype sample : GLs.values() ) {
|
||||
j++;
|
||||
|
||||
if ( sample.hasLikelihoods() ) {
|
||||
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
|
||||
genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector();
|
||||
double[] gls = sample.getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
|
||||
genotypeLikelihoods.add(gls);
|
||||
}
|
||||
}
|
||||
|
||||
return genotypeLikelihoods;
|
||||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Linearized, ~O(N), implementation.
|
||||
|
|
@ -237,90 +162,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
// now with banding
|
||||
public int linearExactBanded(Map<String, Genotype> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
throw new NotImplementedException();
|
||||
// final int numSamples = GLs.size();
|
||||
// final int numChr = 2*numSamples;
|
||||
// final double[][] genotypeLikelihoods = getGLs(GLs);
|
||||
//
|
||||
// final ExactACCache logY = new ExactACCache(numSamples+1);
|
||||
// logY.getkMinus0()[0] = 0.0; // the zero case
|
||||
//
|
||||
// double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
// boolean done = false;
|
||||
// int lastK = -1;
|
||||
// final int BAND_SIZE = 10;
|
||||
//
|
||||
// for (int k=0; k <= numChr && ! done; k++ ) {
|
||||
// final double[] kMinus0 = logY.getkMinus0();
|
||||
// int jStart = Math.max(k - BAND_SIZE, 1);
|
||||
// int jStop = Math.min(k + BAND_SIZE, numSamples);
|
||||
//
|
||||
// if ( k == 0 ) { // special case for k = 0
|
||||
// for ( int j=1; j <= numSamples; j++ ) {
|
||||
// kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()];
|
||||
// }
|
||||
// } else { // k > 0
|
||||
// final double[] kMinus1 = logY.getkMinus1();
|
||||
// final double[] kMinus2 = logY.getkMinus2();
|
||||
// Arrays.fill(kMinus0,0);
|
||||
//
|
||||
// for ( int j = jStart; j <= jStop; j++ ) {
|
||||
// final double[] gl = genotypeLikelihoods[j];
|
||||
// final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
|
||||
//
|
||||
// double aa = Double.NEGATIVE_INFINITY;
|
||||
// double ab = Double.NEGATIVE_INFINITY;
|
||||
// if (k < 2*j-1)
|
||||
// aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
|
||||
//
|
||||
// if (k < 2*j)
|
||||
// ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
|
||||
//
|
||||
// double log10Max;
|
||||
// if (k > 1) {
|
||||
// final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
|
||||
// log10Max = approximateLog10SumLog10(aa, ab, bb);
|
||||
// } else {
|
||||
// // we know we aren't considering the BB case, so we can use an optimized log10 function
|
||||
// log10Max = approximateLog10SumLog10(aa, ab);
|
||||
// }
|
||||
//
|
||||
// // finally, update the L(j,k) value
|
||||
// kMinus0[j] = log10Max - logDenominator;
|
||||
//
|
||||
// String offset = Utils.dupString(' ',k);
|
||||
// System.out.printf("%s%3d %3d %.2f%n", offset, k, j, kMinus0[j]);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // update the posteriors vector
|
||||
// final double log10LofK = kMinus0[jStop];
|
||||
// log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
|
||||
//
|
||||
// // can we abort early?
|
||||
// lastK = k;
|
||||
// maxLog10L = Math.max(maxLog10L, log10LofK);
|
||||
// if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
// if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
|
||||
// done = true;
|
||||
// }
|
||||
//
|
||||
// logY.rotate();
|
||||
// }
|
||||
//
|
||||
// return lastK;
|
||||
}
|
||||
|
||||
public int linearExact(Map<String, Genotype> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||
final int numSamples = GLs.size();
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
final double[][] genotypeLikelihoods = getGLs(GLs);
|
||||
|
||||
final ExactACCache logY = new ExactACCache(numSamples+1);
|
||||
logY.getkMinus0()[0] = 0.0; // the zero case
|
||||
|
|
@ -334,14 +181,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
if ( k == 0 ) { // special case for k = 0
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA];
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
|
||||
}
|
||||
} else { // k > 0
|
||||
final double[] kMinus1 = logY.getkMinus1();
|
||||
final double[] kMinus2 = logY.getkMinus2();
|
||||
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
final double[] gl = genotypeLikelihoods[j];
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
double aa = Double.NEGATIVE_INFINITY;
|
||||
|
|
@ -434,10 +281,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
if ( !vc.isVariant() )
|
||||
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
|
||||
|
||||
boolean multiAllelicRecord = false;
|
||||
|
||||
if (vc.getAlternateAlleles().size() > 1)
|
||||
multiAllelicRecord = true;
|
||||
|
||||
Map<String, Genotype> GLs = vc.getGenotypes();
|
||||
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
|
||||
|
|
@ -454,7 +297,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
pathMetricArray[0][0] = 0.0;
|
||||
|
||||
// todo = can't deal with optimal dynamic programming solution with multiallelic records
|
||||
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) {
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
sampleIndices.addAll(GLs.keySet());
|
||||
sampleIdx = GLs.size();
|
||||
}
|
||||
|
|
@ -465,6 +308,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
continue;
|
||||
|
||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
|
||||
//System.out.print(sample.getKey()+":");
|
||||
//for (int k=0; k < likelihoods.length; k++)
|
||||
// System.out.format("%4.2f ",likelihoods[k]);
|
||||
//System.out.println();
|
||||
// all likelihoods are essentially the same: skip this sample and will later on force no call.
|
||||
//sampleIdx++;
|
||||
continue;
|
||||
}
|
||||
|
||||
sampleIndices.add(sample.getKey());
|
||||
|
||||
for (int k=0; k <= AFofMaxLikelihood; k++) {
|
||||
|
|
@ -504,22 +358,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
Genotype g = GLs.get(sample);
|
||||
if ( !g.hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord)
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
|
||||
else {
|
||||
int newIdx = tracebackArray[k][startIdx];
|
||||
bestGTguess = startIdx - newIdx;
|
||||
startIdx = newIdx;
|
||||
}
|
||||
|
||||
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
|
||||
// and will add no-call genotype to GL's in a second pass
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
double qual = Double.NEGATIVE_INFINITY;
|
||||
double[] likelihoods = g.getLikelihoods().getAsVector();
|
||||
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
|
||||
}
|
||||
else {
|
||||
int newIdx = tracebackArray[k][startIdx];;
|
||||
bestGTguess = startIdx - newIdx;
|
||||
startIdx = newIdx;
|
||||
}
|
||||
|
||||
/* System.out.format("Sample: %s GL:",sample);
|
||||
for (int i=0; i < likelihoods.length; i++)
|
||||
System.out.format("%1.4f ",likelihoods[i]);
|
||||
System.out.format("%1.4f, ",likelihoods[i]);
|
||||
*/
|
||||
|
||||
for (int i=0; i < likelihoods.length; i++) {
|
||||
|
|
@ -570,83 +427,26 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
}
|
||||
|
||||
return calls;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Gold standard, but O(N^2), implementation.
|
||||
//
|
||||
// TODO -- remove me for clarity in this code
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
public int gdaN2GoldStandard(Map<String, Genotype> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||
int numSamples = GLs.size();
|
||||
int numChr = 2*numSamples;
|
||||
|
||||
double[][] logYMatrix = new double[1+numSamples][1+numChr];
|
||||
|
||||
for (int i=0; i <=numSamples; i++)
|
||||
for (int j=0; j <=numChr; j++)
|
||||
logYMatrix[i][j] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
//YMatrix[0][0] = 1.0;
|
||||
logYMatrix[0][0] = 0.0;
|
||||
int j=0;
|
||||
|
||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
||||
j++;
|
||||
|
||||
if ( !sample.getValue().hasLikelihoods() )
|
||||
continue;
|
||||
Genotype g = GLs.get(sample.getKey());
|
||||
|
||||
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
|
||||
double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
|
||||
double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
|
||||
// special treatment for k=0: iteration reduces to:
|
||||
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA];
|
||||
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
|
||||
continue; // regular likelihoods
|
||||
|
||||
for (int k=1; k <= 2*j; k++ ) {
|
||||
|
||||
//double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
double logNumerator[];
|
||||
logNumerator = new double[3];
|
||||
if (k < 2*j-1)
|
||||
logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
|
||||
genotypeLikelihoods[idxAA];
|
||||
else
|
||||
logNumerator[0] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
|
||||
if (k < 2*j)
|
||||
logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
|
||||
genotypeLikelihoods[idxAB];
|
||||
else
|
||||
logNumerator[1] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
if (k > 1)
|
||||
logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] +
|
||||
genotypeLikelihoods[idxBB];
|
||||
else
|
||||
logNumerator[2] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
double logNum = MathUtils.softMax(logNumerator);
|
||||
|
||||
//YMatrix[j][k] = num/den;
|
||||
logYMatrix[j][k] = logNum - logDenominator;
|
||||
}
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
double qual = Genotype.NO_NEG_LOG_10PERROR;
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
//System.out.println(myAlleles.toString());
|
||||
calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
|
||||
}
|
||||
|
||||
for (int k=0; k <= numChr; k++)
|
||||
log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
|
||||
|
||||
return numChr;
|
||||
return calls;
|
||||
}
|
||||
|
||||
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
|
||||
|
|
@ -657,5 +457,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,10 +32,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
|
|
@ -70,9 +71,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
// gdebug removeme
|
||||
// todo -cleanup
|
||||
private HaplotypeIndelErrorModel model;
|
||||
private boolean useOldWrongHorribleHackedUpLikelihoodModel = false;
|
||||
//
|
||||
private GenomeLoc lastSiteVisited;
|
||||
private ArrayList<Allele> alleleList;
|
||||
|
||||
|
|
@ -83,26 +81,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
if (UAC.GSA_PRODUCTION_ONLY == false) {
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
|
||||
useOldWrongHorribleHackedUpLikelihoodModel = false;
|
||||
}
|
||||
else {
|
||||
useOldWrongHorribleHackedUpLikelihoodModel = true;
|
||||
double INSERTION_START_PROBABILITY = 1e-3;
|
||||
|
||||
double INSERTION_END_PROBABILITY = 0.5;
|
||||
|
||||
double ALPHA_DELETION_PROBABILITY = 1e-3;
|
||||
|
||||
|
||||
model = new HaplotypeIndelErrorModel(3, INSERTION_START_PROBABILITY,
|
||||
INSERTION_END_PROBABILITY,ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO);
|
||||
}
|
||||
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,UAC.OUTPUT_DEBUG_INDEL_INFO);
|
||||
alleleList = new ArrayList<Allele>();
|
||||
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
|
|
@ -321,7 +300,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
haplotypeMap.clear();
|
||||
|
||||
if (getAlleleListFromVCF) {
|
||||
for( final VariantContext vc_input : tracker.getValues(UAC.alleles) ) {
|
||||
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
|
||||
if( vc_input != null &&
|
||||
allowableTypes.contains(vc_input.getType()) &&
|
||||
ref.getLocus().getStart() == vc_input.getStart()) {
|
||||
|
|
@ -382,20 +361,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
}
|
||||
}
|
||||
}
|
||||
int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
|
||||
int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
|
||||
int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
|
||||
|
||||
if (useOldWrongHorribleHackedUpLikelihoodModel) {
|
||||
numPrefBases = 20;
|
||||
hsize=80;
|
||||
}
|
||||
final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
|
||||
final int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
|
||||
final int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
|
||||
|
||||
if (DEBUG)
|
||||
System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
|
||||
(int)ref.getWindow().size(), loc.getStart(), numPrefBases);
|
||||
//System.out.println(eventLength);
|
||||
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
|
||||
ref, hsize, numPrefBases);
|
||||
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
|
||||
ref, hsize, numPrefBases);
|
||||
|
||||
// For each sample, get genotype likelihoods based on pileup
|
||||
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||
|
|
@ -412,17 +388,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
pileup = context.getBasePileup();
|
||||
|
||||
if (pileup != null ) {
|
||||
double[] genotypeLikelihoods;
|
||||
if (useOldWrongHorribleHackedUpLikelihoodModel)
|
||||
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
|
||||
else
|
||||
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
|
||||
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
|
||||
|
||||
|
||||
|
||||
// which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example
|
||||
// for 3 alleles it's 00 01 11 02 12 22
|
||||
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
|
||||
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
|
||||
alleleList,
|
||||
genotypeLikelihoods,
|
||||
getFilteredDepth(pileup)));
|
||||
|
|
@ -444,4 +412,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
return indelLikelihoodMap.get();
|
||||
}
|
||||
|
||||
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
|
||||
// so that per-sample DP will include deletions covering the event.
|
||||
protected int getFilteredDepth(ReadBackedPileup pileup) {
|
||||
int count = 0;
|
||||
for ( PileupElement p : pileup ) {
|
||||
if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) )
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -26,16 +26,14 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
|
@ -58,25 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
}
|
||||
|
||||
public static VariantContext getSNPVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
|
||||
if ( tracker == null || ref == null || logger == null )
|
||||
throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger);
|
||||
VariantContext vc = null;
|
||||
|
||||
// search for usable record
|
||||
for( final VariantContext vc_input : tracker.getValues(allelesBinding) ) {
|
||||
if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) {
|
||||
if ( vc == null ) {
|
||||
vc = vc_input;
|
||||
} else {
|
||||
logger.warn("Multiple valid VCF records detected at site " + ref.getLocus() + ", only considering alleles from first record");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return vc;
|
||||
}
|
||||
|
||||
public Allele getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
|
|
@ -96,7 +75,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
if ( alternateAlleleToUse != null ) {
|
||||
bestAlternateAllele = alternateAlleleToUse.getBases()[0];
|
||||
} else if ( useAlleleFromVCF ) {
|
||||
VariantContext vc = getSNPVCFromAllelesRod(tracker, ref, true, logger, UAC.alleles);
|
||||
VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
|
||||
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
|
|
@ -143,8 +122,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
aList.add(refAllele);
|
||||
aList.add(altAllele);
|
||||
double[] dlike = new double[]{likelihoods[refGenotype.ordinal()],likelihoods[hetGenotype.ordinal()],likelihoods[homGenotype.ordinal()]} ;
|
||||
|
||||
// normalize in log space so that max element is zero.
|
||||
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
|
||||
aList, dlike, getFilteredDepth(pileup)));
|
||||
aList, MathUtils.normalizeFromLog10(dlike, false, true), getFilteredDepth(pileup)));
|
||||
}
|
||||
|
||||
return refAllele;
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output;
|
|||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
|
|
|
|||
|
|
@ -143,35 +143,21 @@ public class UnifiedArgumentCollection {
|
|||
@Hidden
|
||||
@Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
|
||||
public int INDEL_HAPLOTYPE_SIZE = 80;
|
||||
@Hidden
|
||||
@Argument(fullName = "doContextDependentGapPenalties", shortName = "doCDP", doc = "Vary gap penalties by context", required = false)
|
||||
public boolean DO_CONTEXT_DEPENDENT_PENALTIES = true;
|
||||
|
||||
//gdebug+
|
||||
// experimental arguments, NOT TO BE USED BY ANYONE WHOSE INITIALS AREN'T GDA!!!
|
||||
@Hidden
|
||||
@Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false)
|
||||
public boolean GET_GAP_PENALTIES_FROM_DATA = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE")
|
||||
public File INDEL_RECAL_FILE = new File("indel.recal_data.csv");
|
||||
// @Hidden
|
||||
// @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false)
|
||||
// public boolean GET_GAP_PENALTIES_FROM_DATA = false;
|
||||
//
|
||||
// @Hidden
|
||||
// @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE")
|
||||
// public File INDEL_RECAL_FILE = new File("indel.recal_data.csv");
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
|
||||
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false)
|
||||
public boolean dovit = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false)
|
||||
public boolean GSA_PRODUCTION_ONLY = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false)
|
||||
public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
|
||||
public boolean IGNORE_SNP_ALLELES = false;
|
||||
|
|
@ -191,7 +177,6 @@ public class UnifiedArgumentCollection {
|
|||
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.AFmodel = AFmodel;
|
||||
uac.EXACT_CALCULATION_TYPE = EXACT_CALCULATION_TYPE;
|
||||
uac.heterozygosity = heterozygosity;
|
||||
uac.PCR_error = PCR_error;
|
||||
uac.GenotypingMode = GenotypingMode;
|
||||
|
|
@ -209,15 +194,10 @@ public class UnifiedArgumentCollection {
|
|||
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;
|
||||
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
|
||||
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
|
||||
uac.DO_CONTEXT_DEPENDENT_PENALTIES = DO_CONTEXT_DEPENDENT_PENALTIES;
|
||||
uac.alleles = alleles;
|
||||
|
||||
uac.GET_GAP_PENALTIES_FROM_DATA = GET_GAP_PENALTIES_FROM_DATA;
|
||||
uac.INDEL_RECAL_FILE = INDEL_RECAL_FILE;
|
||||
// todo- arguments to remove
|
||||
uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT;
|
||||
uac.dovit = dovit;
|
||||
uac.GSA_PRODUCTION_ONLY = GSA_PRODUCTION_ONLY;
|
||||
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
|
||||
|
||||
return uac;
|
||||
|
|
|
|||
|
|
@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -127,7 +126,8 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
|
||||
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return null; }
|
||||
public RodBinding<VariantContext> getVariantRodBinding() { return null; }
|
||||
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
|
||||
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
||||
|
||||
|
|
@ -210,7 +210,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
if ( verboseWriter != null )
|
||||
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
|
||||
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, this);
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, this, getToolkit());
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);
|
||||
|
||||
// initialize the header
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
|
|
@ -36,13 +37,11 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
|||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
|
@ -236,10 +235,11 @@ public class UnifiedGenotyperEngine {
|
|||
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
|
||||
VariantContext vc;
|
||||
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger, UAC.alleles);
|
||||
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if ( vcInput == null )
|
||||
return null;
|
||||
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles());
|
||||
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase());
|
||||
|
||||
} else {
|
||||
// deal with bad/non-standard reference bases
|
||||
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
|
||||
|
|
@ -544,6 +544,21 @@ public class UnifiedGenotyperEngine {
|
|||
AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
}
|
||||
|
||||
private final static double[] binomialProbabilityDepthCache = new double[10000];
|
||||
static {
|
||||
for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) {
|
||||
binomialProbabilityDepthCache[i] = MathUtils.binomialProbability(0, i, 0.5);
|
||||
}
|
||||
}
|
||||
|
||||
private final double getRefBinomialProb(final int depth) {
|
||||
if ( depth < binomialProbabilityDepthCache.length )
|
||||
return binomialProbabilityDepthCache[depth];
|
||||
else
|
||||
return MathUtils.binomialProbability(0, depth, 0.5);
|
||||
}
|
||||
|
||||
|
||||
private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
|
||||
if ( contexts == null )
|
||||
return null;
|
||||
|
|
@ -567,7 +582,7 @@ public class UnifiedGenotyperEngine {
|
|||
depth = context.getExtendedEventPileup().size();
|
||||
}
|
||||
|
||||
P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
|
||||
P_of_ref *= 1.0 - (theta / 2.0) * getRefBinomialProb(depth);
|
||||
}
|
||||
|
||||
return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false);
|
||||
|
|
@ -635,7 +650,7 @@ public class UnifiedGenotyperEngine {
|
|||
// no extended event pileup
|
||||
// if we're genotyping given alleles and we have a requested SNP at this position, do SNP
|
||||
if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
|
||||
VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, refContext, false, logger, UAC.alleles);
|
||||
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if (vcInput == null)
|
||||
return null;
|
||||
|
||||
|
|
@ -741,4 +756,23 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
return afcm;
|
||||
}
|
||||
|
||||
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
|
||||
if ( tracker == null || ref == null || logger == null )
|
||||
throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger);
|
||||
VariantContext vc = null;
|
||||
|
||||
// search for usable record
|
||||
for( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) {
|
||||
if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) {
|
||||
if ( vc == null ) {
|
||||
vc = vc_input;
|
||||
} else {
|
||||
logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,9 +26,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
|
@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel {
|
|||
baseMatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
|
||||
double baseProb = QualityUtils.qualToProb(k);
|
||||
double baseProb = QualityUtils.qualToProb((byte)k);
|
||||
|
||||
|
||||
baseMatchArray[k] = probToQual(baseProb);
|
||||
|
|
|
|||
|
|
@ -28,9 +28,10 @@ package org.broadinstitute.sting.gatk.walkers.indels;
|
|||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -50,36 +51,8 @@ import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Reca
|
|||
|
||||
|
||||
public class PairHMMIndelErrorModel {
|
||||
|
||||
|
||||
public static final int BASE_QUAL_THRESHOLD = 20;
|
||||
|
||||
|
||||
private static final int MATCH_OFFSET = 0;
|
||||
private static final int X_OFFSET = 1;
|
||||
private static final int Y_OFFSET = 2;
|
||||
|
||||
private static final int DIAG = 0;
|
||||
private static final int UP = 1;
|
||||
private static final int LEFT = 2;
|
||||
|
||||
private static final int DIAG_GOTO_M = 0;
|
||||
private static final int DIAG_GOTO_X = 1;
|
||||
private static final int DIAG_GOTO_Y = 2;
|
||||
|
||||
private static final int UP_GOTO_M = 4;
|
||||
private static final int UP_GOTO_X = 5;
|
||||
private static final int UP_GOTO_Y = 6;
|
||||
|
||||
private static final int LEFT_GOTO_M = 8;
|
||||
private static final int LEFT_GOTO_X = 9;
|
||||
private static final int LEFT_GOTO_Y = 10;
|
||||
|
||||
private static final int[] ACTIONS_M = {DIAG_GOTO_M, DIAG_GOTO_X, DIAG_GOTO_Y};
|
||||
private static final int[] ACTIONS_X = {UP_GOTO_M, UP_GOTO_X, UP_GOTO_Y};
|
||||
private static final int[] ACTIONS_Y = {LEFT_GOTO_M, LEFT_GOTO_X, LEFT_GOTO_Y};
|
||||
|
||||
|
||||
private final double logGapOpenProbability;
|
||||
private final double logGapContinuationProbability;
|
||||
|
||||
|
|
@ -100,36 +73,13 @@ public class PairHMMIndelErrorModel {
|
|||
private static final double MIN_GAP_CONT_PENALTY = 10.0;
|
||||
private static final double GAP_PENALTY_HRUN_STEP = 1.0; // each increase in hrun decreases gap penalty by this.
|
||||
|
||||
|
||||
private boolean doViterbi = false;
|
||||
|
||||
private final boolean useAffineGapModel = true;
|
||||
private boolean doContextDependentPenalties = false;
|
||||
|
||||
private final double[] GAP_OPEN_PROB_TABLE;
|
||||
private final double[] GAP_CONT_PROB_TABLE;
|
||||
|
||||
private boolean getGapPenaltiesFromFile = false;
|
||||
|
||||
private int SMOOTHING = 1;
|
||||
private int MAX_QUALITY_SCORE = 50;
|
||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
//copy+
|
||||
/* private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
|
||||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
protected static final String EOF_MARKER = "EOF";
|
||||
private long numReadsWithMalformedColorSpace = 0;
|
||||
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
|
||||
*/
|
||||
//copy-
|
||||
|
||||
static {
|
||||
LOG_ONE_HALF= -Math.log10(2.0);
|
||||
END_GAP_COST = LOG_ONE_HALF;
|
||||
|
|
@ -145,141 +95,9 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
}
|
||||
|
||||
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit,boolean gpf, File RECAL_FILE) {
|
||||
|
||||
this(indelGOP, indelGCP, deb, doCDP, dovit);
|
||||
this.getGapPenaltiesFromFile = gpf;
|
||||
|
||||
// read data from recal file
|
||||
// gdebug - start copy from TableRecalibrationWalker
|
||||
/* if (gpf) {
|
||||
boolean sawEOF = false;
|
||||
boolean REQUIRE_EOF = false;
|
||||
|
||||
int lineNumber = 0;
|
||||
boolean foundAllCovariates = false;
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
|
||||
try {
|
||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||
lineNumber++;
|
||||
if ( EOF_MARKER.equals(line) ) {
|
||||
sawEOF = true;
|
||||
} else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
// Read in the covariates that were used from the input file
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
String[] vals = line.split(",");
|
||||
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} else { // Found a line of data
|
||||
if( !foundAllCovariates ) {
|
||||
foundAllCovariates = true;
|
||||
|
||||
// At this point all the covariates should have been found and initialized
|
||||
if( requestedCovariates.size() < 2 ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
|
||||
}
|
||||
|
||||
final boolean createCollapsedTables = true;
|
||||
|
||||
// Initialize any covariate member variables using the shared argument collection
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
cov.initialize( RAC );
|
||||
}
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
|
||||
|
||||
}
|
||||
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
|
||||
if ( !sawEOF ) {
|
||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
||||
if ( REQUIRE_EOF )
|
||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||
}
|
||||
|
||||
if( dataManager == null ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
|
||||
}
|
||||
|
||||
// Create the tables of empirical quality scores that will be used in the sequential calculation
|
||||
dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE );
|
||||
}
|
||||
// debug end copy
|
||||
*/
|
||||
}
|
||||
/**
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
*/
|
||||
/*
|
||||
private void addCSVData(final File file, final String line) {
|
||||
final String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
||||
final Object[] key = new Object[requestedCovariates.size()];
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key[iii] = cov.getValue( vals[iii] );
|
||||
}
|
||||
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
|
||||
}
|
||||
|
||||
*/
|
||||
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit) {
|
||||
this(indelGOP, indelGCP, deb, doCDP);
|
||||
this.doViterbi = dovit;
|
||||
}
|
||||
|
||||
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
|
||||
|
||||
|
||||
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb) {
|
||||
this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob
|
||||
this.logGapContinuationProbability = -indelGCP/10.0; // QUAL to log prob
|
||||
this.doContextDependentPenalties = doCDP;
|
||||
this.DEBUG = deb;
|
||||
|
||||
|
||||
|
|
@ -313,132 +131,6 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
}
|
||||
|
||||
private double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) {
|
||||
final int X_METRIC_LENGTH = readBases.length+1;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
|
||||
|
||||
// initialize path metric and traceback memories for likelihood computation
|
||||
double[][] pathMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
int[][] bestMetricArray = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
pathMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (int i=1; i < X_METRIC_LENGTH; i++) {
|
||||
pathMetricArray[i][0] = 0;
|
||||
bestMetricArray[i][0] = UP;
|
||||
}
|
||||
|
||||
for (int j=1; j < Y_METRIC_LENGTH; j++) {
|
||||
pathMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability;
|
||||
bestMetricArray[0][j] = LEFT;
|
||||
}
|
||||
|
||||
for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
|
||||
for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) {
|
||||
|
||||
byte x = readBases[indI-1];
|
||||
byte y = haplotypeBases[indJ-1];
|
||||
byte qual = readQuals[indI-1];
|
||||
|
||||
double bestMetric = 0.0;
|
||||
int bestMetricIdx = 0;
|
||||
|
||||
// compute metric for match/mismatch
|
||||
// workaround for reads whose bases quality = 0,
|
||||
if (qual < 1)
|
||||
qual = 1;
|
||||
|
||||
if (qual > MAX_CACHED_QUAL)
|
||||
qual = MAX_CACHED_QUAL;
|
||||
|
||||
double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
|
||||
double[] metrics = new double[3];
|
||||
|
||||
metrics[DIAG] = pathMetricArray[indI-1][indJ-1] + pBaseRead;
|
||||
metrics[UP] = pathMetricArray[indI-1][indJ] + logGapOpenProbability;//(end?0.0:logGapOpenProbability);
|
||||
metrics[LEFT] = pathMetricArray[indI][indJ-1] + logGapOpenProbability;//(end?0.0:logGapOpenProbability);
|
||||
|
||||
if (doViterbi) {
|
||||
bestMetricIdx = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestMetricIdx];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(metrics);
|
||||
|
||||
pathMetricArray[indI][indJ] = bestMetric;
|
||||
bestMetricArray[indI][indJ] = bestMetricIdx;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
double bestMetric=0.0;
|
||||
int bestMetricIdx=0,bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1;
|
||||
|
||||
for (int i=0; i < X_METRIC_LENGTH; i ++ ) {
|
||||
int j= Y_METRIC_LENGTH-1;
|
||||
|
||||
if (pathMetricArray[i][j] > bestMetric) {
|
||||
bestMetric = pathMetricArray[i][j];
|
||||
bestI = i;
|
||||
bestJ = j;
|
||||
}
|
||||
}
|
||||
for (int j=0; j < Y_METRIC_LENGTH; j++ ) {
|
||||
int i= X_METRIC_LENGTH-1;
|
||||
if (pathMetricArray[i][j] >= bestMetric) {
|
||||
bestMetric = pathMetricArray[i][j];
|
||||
bestI = i;
|
||||
bestJ = j;
|
||||
}
|
||||
}
|
||||
|
||||
if (DEBUG && doViterbi) {
|
||||
|
||||
String haplotypeString = new String (haplotypeBases);
|
||||
String readString = new String(readBases);
|
||||
|
||||
|
||||
int i = bestI;
|
||||
int j = bestJ;
|
||||
|
||||
|
||||
System.out.println("Simple NW");
|
||||
|
||||
while (i >0 || j >0) {
|
||||
bestMetricIdx = bestMetricArray[i][j];
|
||||
System.out.print(bestMetricIdx);
|
||||
if (bestMetricIdx == UP) {
|
||||
// insert gap in Y
|
||||
haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j);
|
||||
i--;
|
||||
} else if (bestMetricIdx == LEFT) {
|
||||
readString = readString.substring(0,i)+"-"+readString.substring(i);
|
||||
j--;
|
||||
}
|
||||
else {
|
||||
i--; j--;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
System.out.println("\nAlignment: ");
|
||||
System.out.println("R:"+readString);
|
||||
System.out.println("H:"+haplotypeString);
|
||||
System.out.println();
|
||||
|
||||
|
||||
}
|
||||
if (DEBUG)
|
||||
System.out.format("Likelihood: %5.4f\n", bestMetric);
|
||||
|
||||
return bestMetric;
|
||||
|
||||
|
||||
}
|
||||
|
||||
static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) {
|
||||
// compute forward hrun length, example:
|
||||
// AGGTGACCCCCCTGAGAG
|
||||
|
|
@ -479,14 +171,10 @@ public class PairHMMIndelErrorModel {
|
|||
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
|
||||
|
||||
// initialize path metric and traceback memories for likelihood computation
|
||||
double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
int[][] bestActionArrayM = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
int[][] bestActionArrayX = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
int[][] bestActionArrayY = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
double c,d;
|
||||
matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (int i=1; i < X_METRIC_LENGTH; i++) {
|
||||
|
|
@ -494,8 +182,6 @@ public class PairHMMIndelErrorModel {
|
|||
matchMetricArray[i][0] = Double.NEGATIVE_INFINITY;
|
||||
YMetricArray[i][0] = Double.NEGATIVE_INFINITY;
|
||||
XMetricArray[i][0] = END_GAP_COST*(i);//logGapOpenProbability + (i-1)*logGapContinuationProbability;
|
||||
|
||||
bestActionArrayX[i][0] = bestActionArrayY[i][0] = bestActionArrayM[i][0] = UP_GOTO_X;
|
||||
}
|
||||
|
||||
for (int j=1; j < Y_METRIC_LENGTH; j++) {
|
||||
|
|
@ -503,188 +189,46 @@ public class PairHMMIndelErrorModel {
|
|||
matchMetricArray[0][j] = Double.NEGATIVE_INFINITY;
|
||||
XMetricArray[0][j] = Double.NEGATIVE_INFINITY;
|
||||
YMetricArray[0][j] = END_GAP_COST*(j);//logGapOpenProbability + (j-1) * logGapContinuationProbability;
|
||||
|
||||
bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y;
|
||||
}
|
||||
|
||||
for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
|
||||
int im1 = indI-1;
|
||||
final int im1 = indI-1;
|
||||
for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) {
|
||||
int jm1 = indJ-1;
|
||||
byte x = readBases[im1];
|
||||
byte y = haplotypeBases[jm1];
|
||||
byte qual = readQuals[im1];
|
||||
|
||||
double bestMetric = 0.0;
|
||||
int bestMetricIdx = 0;
|
||||
|
||||
// compute metric for match/mismatch
|
||||
// workaround for reads whose bases quality = 0,
|
||||
if (qual < 1)
|
||||
qual = 1;
|
||||
|
||||
if (qual > MAX_CACHED_QUAL)
|
||||
qual = MAX_CACHED_QUAL;
|
||||
|
||||
double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
|
||||
|
||||
|
||||
double[] metrics = new double[3];
|
||||
|
||||
|
||||
if (doViterbi) {
|
||||
// update match array
|
||||
metrics[MATCH_OFFSET] = matchMetricArray[im1][jm1] + pBaseRead;
|
||||
metrics[X_OFFSET] = XMetricArray[im1][jm1] + pBaseRead;
|
||||
metrics[Y_OFFSET] = YMetricArray[im1][jm1] + pBaseRead;
|
||||
|
||||
bestMetricIdx = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestMetricIdx];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead,
|
||||
YMetricArray[im1][jm1] + pBaseRead);
|
||||
final int jm1 = indJ-1;
|
||||
final byte x = readBases[im1];
|
||||
final byte y = haplotypeBases[jm1];
|
||||
final byte qual = readQuals[im1] < 1 ? 1 : (readQuals[im1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[im1]);
|
||||
final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
|
||||
|
||||
double bestMetric = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead,
|
||||
XMetricArray[im1][jm1] + pBaseRead,
|
||||
YMetricArray[im1][jm1] + pBaseRead);
|
||||
matchMetricArray[indI][indJ] = bestMetric;
|
||||
bestActionArrayM[indI][indJ] = ACTIONS_M[bestMetricIdx];
|
||||
|
||||
// update X array
|
||||
// State X(i,j): X(1:i) aligned to a gap in Y(1:j).
|
||||
// When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps
|
||||
|
||||
//c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]);
|
||||
//d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]);
|
||||
if (getGapPenaltiesFromFile) {
|
||||
c = currentGOP[im1];
|
||||
d = logGapContinuationProbability;
|
||||
|
||||
} else {
|
||||
c = currentGOP[jm1];
|
||||
d = currentGCP[jm1];
|
||||
}
|
||||
if (indJ == Y_METRIC_LENGTH-1)
|
||||
c = d = END_GAP_COST;
|
||||
|
||||
if (doViterbi) {
|
||||
metrics[MATCH_OFFSET] = matchMetricArray[im1][indJ] + c;
|
||||
metrics[X_OFFSET] = XMetricArray[im1][indJ] + d;
|
||||
metrics[Y_OFFSET] = Double.NEGATIVE_INFINITY; //YMetricArray[indI-1][indJ] + logGapOpenProbability;
|
||||
|
||||
bestMetricIdx = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestMetricIdx];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(matchMetricArray[im1][indJ] + c, XMetricArray[im1][indJ] + d);
|
||||
|
||||
final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
|
||||
final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
|
||||
bestMetric = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1);
|
||||
XMetricArray[indI][indJ] = bestMetric;
|
||||
bestActionArrayX[indI][indJ] = ACTIONS_X[bestMetricIdx];
|
||||
|
||||
// update Y array
|
||||
//c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]);
|
||||
//d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]);
|
||||
if (getGapPenaltiesFromFile) {
|
||||
c = currentGOP[im1];
|
||||
d = logGapContinuationProbability;
|
||||
}
|
||||
else {
|
||||
c = currentGOP[jm1];
|
||||
d = currentGCP[jm1];
|
||||
}
|
||||
if (indI == X_METRIC_LENGTH-1)
|
||||
c = d = END_GAP_COST;
|
||||
|
||||
|
||||
|
||||
if (doViterbi) {
|
||||
metrics[MATCH_OFFSET] = matchMetricArray[indI][jm1] + c;
|
||||
metrics[X_OFFSET] = Double.NEGATIVE_INFINITY; //XMetricArray[indI][indJ-1] + logGapOpenProbability;
|
||||
metrics[Y_OFFSET] = YMetricArray[indI][jm1] + d;
|
||||
|
||||
bestMetricIdx = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestMetricIdx];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(matchMetricArray[indI][jm1] + c, YMetricArray[indI][jm1] + d);
|
||||
|
||||
final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
|
||||
final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
|
||||
bestMetric = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2);
|
||||
YMetricArray[indI][indJ] = bestMetric;
|
||||
bestActionArrayY[indI][indJ] = ACTIONS_Y[bestMetricIdx];
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
double bestMetric;
|
||||
double metrics[] = new double[3];
|
||||
int bestTable=0, bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1;
|
||||
metrics[MATCH_OFFSET] = matchMetricArray[bestI][bestJ];
|
||||
metrics[X_OFFSET] = XMetricArray[bestI][bestJ];
|
||||
metrics[Y_OFFSET] = YMetricArray[bestI][bestJ];
|
||||
if (doViterbi) {
|
||||
bestTable = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestTable];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(metrics);
|
||||
final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1;
|
||||
final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ],
|
||||
XMetricArray[bestI][bestJ],
|
||||
YMetricArray[bestI][bestJ]);
|
||||
|
||||
// Do traceback (needed only for debugging!)
|
||||
if (DEBUG && doViterbi) {
|
||||
|
||||
int bestAction;
|
||||
int i = bestI;
|
||||
int j = bestJ;
|
||||
|
||||
|
||||
System.out.println("Affine gap NW");
|
||||
|
||||
|
||||
String haplotypeString = new String (haplotypeBases);
|
||||
String readString = new String(readBases);
|
||||
|
||||
|
||||
while (i >0 || j >0) {
|
||||
if (bestTable == X_OFFSET) {
|
||||
// insert gap in Y
|
||||
haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j);
|
||||
bestAction = bestActionArrayX[i][j];
|
||||
}
|
||||
else if (bestTable == Y_OFFSET) {
|
||||
readString = readString.substring(0,i)+"-"+readString.substring(i);
|
||||
bestAction = bestActionArrayY[i][j];
|
||||
|
||||
}
|
||||
else {
|
||||
bestAction = bestActionArrayM[i][j];
|
||||
}
|
||||
System.out.print(bestAction);
|
||||
|
||||
|
||||
// bestAction contains action to take at next step
|
||||
// encoding of bestAction: upper 2 bits = direction, lower 2 bits = next table
|
||||
|
||||
// bestTable and nextDirection for next step
|
||||
bestTable = bestAction & 0x3;
|
||||
int nextDirection = bestAction >> 2;
|
||||
if (nextDirection == UP) {
|
||||
i--;
|
||||
} else if (nextDirection == LEFT) {
|
||||
j--;
|
||||
} else { // if (nextDirection == DIAG)
|
||||
i--; j--;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
System.out.println("\nAlignment: ");
|
||||
System.out.println("R:"+readString);
|
||||
System.out.println("H:"+haplotypeString);
|
||||
System.out.println();
|
||||
|
||||
|
||||
}
|
||||
if (DEBUG)
|
||||
System.out.format("Likelihood: %5.4f\n", bestMetric);
|
||||
|
||||
|
|
@ -707,12 +251,12 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
}
|
||||
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
|
||||
ReferenceContext ref, int eventLength,
|
||||
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
|
||||
ReferenceContext ref, int eventLength,
|
||||
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
|
||||
|
||||
int numHaplotypes = haplotypeMap.size();
|
||||
double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
double readLikelihoods[][] = new double[pileup.getReads().size()][numHaplotypes];
|
||||
final double readLikelihoods[][] = new double[pileup.size()][numHaplotypes];
|
||||
final int readCounts[] = new int[pileup.size()];
|
||||
int readIdx=0;
|
||||
|
||||
LinkedHashMap<Allele,double[]> gapOpenProbabilityMap = new LinkedHashMap<Allele,double[]>();
|
||||
|
|
@ -723,34 +267,35 @@ public class PairHMMIndelErrorModel {
|
|||
System.out.println(new String(ref.getBases()));
|
||||
}
|
||||
|
||||
if (doContextDependentPenalties && !getGapPenaltiesFromFile) {
|
||||
// will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes.
|
||||
|
||||
|
||||
for (Allele a: haplotypeMap.keySet()) {
|
||||
Haplotype haplotype = haplotypeMap.get(a);
|
||||
byte[] haplotypeBases = haplotype.getBasesAsBytes();
|
||||
double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length];
|
||||
double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length];
|
||||
|
||||
// get homopolymer length profile for current haplotype
|
||||
int[] hrunProfile = new int[haplotypeBases.length];
|
||||
getContextHomopolymerLength(haplotypeBases,hrunProfile);
|
||||
if (DEBUG) {
|
||||
System.out.println("Haplotype bases:");
|
||||
System.out.println(new String(haplotypeBases));
|
||||
for (int i=0; i < hrunProfile.length; i++)
|
||||
System.out.format("%d",hrunProfile[i]);
|
||||
System.out.println();
|
||||
}
|
||||
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
|
||||
|
||||
gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities);
|
||||
gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities);
|
||||
// will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes.
|
||||
// todo -- refactor into separate function
|
||||
for (Allele a: haplotypeMap.keySet()) {
|
||||
Haplotype haplotype = haplotypeMap.get(a);
|
||||
byte[] haplotypeBases = haplotype.getBasesAsBytes();
|
||||
double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length];
|
||||
double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length];
|
||||
|
||||
// get homopolymer length profile for current haplotype
|
||||
int[] hrunProfile = new int[haplotypeBases.length];
|
||||
getContextHomopolymerLength(haplotypeBases,hrunProfile);
|
||||
if (DEBUG) {
|
||||
System.out.println("Haplotype bases:");
|
||||
System.out.println(new String(haplotypeBases));
|
||||
for (int i=0; i < hrunProfile.length; i++)
|
||||
System.out.format("%d",hrunProfile[i]);
|
||||
System.out.println();
|
||||
}
|
||||
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
|
||||
|
||||
gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities);
|
||||
gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities);
|
||||
|
||||
}
|
||||
|
||||
for (PileupElement p: pileup) {
|
||||
// > 1 when the read is a consensus read representing multiple independent observations
|
||||
final boolean isReduced = ReadUtils.isReducedRead(p.getRead());
|
||||
readCounts[readIdx] = isReduced ? p.getReducedCount() : 1;
|
||||
|
||||
// check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
|
||||
if (indelLikelihoodMap.containsKey(p)) {
|
||||
|
|
@ -762,61 +307,20 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
else {
|
||||
//System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
|
||||
GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
|
||||
SAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
|
||||
if (read == null)
|
||||
continue;
|
||||
|
||||
if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
|
||||
if ( isReduced ) {
|
||||
read = ReadUtils.reducedReadWithReducedQuals(read);
|
||||
}
|
||||
|
||||
if(ReadUtils.is454Read(read)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
double[] recalQuals = null;
|
||||
|
||||
/*
|
||||
if (getGapPenaltiesFromFile) {
|
||||
RecalDataManager.parseSAMRecord( read, RAC );
|
||||
|
||||
|
||||
recalQuals = new double[read.getReadLength()];
|
||||
|
||||
//compute all covariate values for this read
|
||||
final Comparable[][] covariateValues_offset_x_covar =
|
||||
RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
|
||||
// For each base in the read
|
||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
||||
|
||||
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
||||
|
||||
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
||||
if(qualityScore == null)
|
||||
{
|
||||
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
|
||||
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
||||
}
|
||||
|
||||
recalQuals[offset] = -((double)qualityScore)/10.0;
|
||||
}
|
||||
|
||||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
if (DEBUG) {
|
||||
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
|
||||
read.getAlignmentStart(),
|
||||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||
read.getCigarString());
|
||||
|
||||
byte[] bases = read.getReadBases();
|
||||
for (int k = 0; k < recalQuals.length; k++) {
|
||||
System.out.format("%c",bases[k]);
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
for (int k = 0; k < recalQuals.length; k++) {
|
||||
System.out.format("%.0f ",recalQuals[k]);
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
} */
|
||||
// get bases of candidate haplotypes that overlap with reads
|
||||
final int trailingBases = 3;
|
||||
|
||||
|
|
@ -937,11 +441,6 @@ public class PairHMMIndelErrorModel {
|
|||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
double[] recalCDP = null;
|
||||
if (getGapPenaltiesFromFile) {
|
||||
recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("Read bases:");
|
||||
|
|
@ -971,27 +470,9 @@ public class PairHMMIndelErrorModel {
|
|||
System.out.println(new String(haplotypeBases));
|
||||
}
|
||||
|
||||
Double readLikelihood = 0.0;
|
||||
if (useAffineGapModel) {
|
||||
|
||||
double[] currentContextGOP = null;
|
||||
double[] currentContextGCP = null;
|
||||
|
||||
if (doContextDependentPenalties) {
|
||||
|
||||
if (getGapPenaltiesFromFile) {
|
||||
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
|
||||
|
||||
} else {
|
||||
currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
readLikelihood = computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
|
||||
final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
final double readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
|
||||
|
||||
readEl.put(a,readLikelihood);
|
||||
readLikelihoods[readIdx][j++] = readLikelihood;
|
||||
|
|
@ -1004,7 +485,7 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
if (DEBUG) {
|
||||
System.out.println("\nLikelihood summary");
|
||||
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
|
||||
for (readIdx=0; readIdx < pileup.size(); readIdx++) {
|
||||
System.out.format("Read Index: %d ",readIdx);
|
||||
for (int i=0; i < readLikelihoods[readIdx].length; i++)
|
||||
System.out.format("L%d: %f ",i,readLikelihoods[readIdx][i]);
|
||||
|
|
@ -1012,123 +493,41 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
|
||||
}
|
||||
|
||||
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
|
||||
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
|
||||
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix
|
||||
for (int i=0; i < numHaplotypes; i++) {
|
||||
for (int j=i; j < numHaplotypes; j++){
|
||||
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
|
||||
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
|
||||
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
|
||||
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
|
||||
|
||||
for (int readIdx = 0; readIdx < readLikelihoods.length; readIdx++) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j]))
|
||||
continue;
|
||||
haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i],
|
||||
readLikelihoods[readIdx][j]) + LOG_ONE_HALF);
|
||||
|
||||
final double li = readLikelihoods[readIdx][i];
|
||||
final double lj = readLikelihoods[readIdx][j];
|
||||
final int readCount = readCounts[readIdx];
|
||||
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);
|
||||
|
||||
}
|
||||
|
||||
public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
|
||||
int hSize = haplotypeLikehoodMatrix.length;
|
||||
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
|
||||
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2];
|
||||
int k=0;
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for (int j=0; j < hSize; j++) {
|
||||
for (int j=0; j < numHaplotypes; j++) {
|
||||
for (int i=0; i <= j; i++){
|
||||
genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j];
|
||||
if (haplotypeLikehoodMatrix[i][j] > maxElement)
|
||||
maxElement = haplotypeLikehoodMatrix[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// renormalize
|
||||
for (int i=0; i < genotypeLikelihoods.length; i++)
|
||||
genotypeLikelihoods[i] -= maxElement;
|
||||
|
||||
return genotypeLikelihoods;
|
||||
// renormalize so that max element is zero.
|
||||
return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements a serial recalibration of the reads using the combinational table.
|
||||
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
|
||||
*
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
/*
|
||||
private byte performSequentialQualityCalculation( final Object... key ) {
|
||||
|
||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
|
||||
// The global quality shift (over the read group only)
|
||||
readGroupCollapsedKey[0] = key[0];
|
||||
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalRecalDatum != null ) {
|
||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
|
||||
// The shift in quality between reported and empirical
|
||||
qualityScoreCollapsedKey[0] = key[0];
|
||||
qualityScoreCollapsedKey[1] = key[1];
|
||||
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
|
||||
double deltaQReported = 0.0;
|
||||
if( qReportedRecalDatum != null ) {
|
||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
||||
// The shift in quality due to each covariate by itself in turn
|
||||
double deltaQCovariates = 0.0;
|
||||
double deltaQCovariateEmpirical;
|
||||
covariateCollapsedKey[0] = key[0];
|
||||
covariateCollapsedKey[1] = key[1];
|
||||
for( int iii = 2; iii < key.length; iii++ ) {
|
||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
|
||||
if( covariateRecalDatum != null ) {
|
||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
}
|
||||
}
|
||||
|
||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
|
||||
|
||||
// Verbose printouts used to validate with old recalibrator
|
||||
//if(key.contains(null)) {
|
||||
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
|
||||
//}
|
||||
//else {
|
||||
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
|
||||
//}
|
||||
|
||||
//return newQualityByte;
|
||||
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well,
|
||||
* but this latter functionality is now superceded by UnifiedGenotyper.
|
||||
*
|
||||
* <p>
|
||||
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
|
||||
* data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read
|
||||
* and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample,
|
||||
* or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified,
|
||||
* the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline
|
||||
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic
|
||||
* if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains
|
||||
* only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords.
|
||||
* data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
|
||||
* include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many
|
||||
* forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
|
||||
* statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
|
||||
* attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
|
||||
* metrics for the post-processing tools to make the final decision). The calls are performed by default
|
||||
* from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I
|
||||
* command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments,
|
||||
* respectively. Indels are called from the tumor sample and annotated as germline
|
||||
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic
|
||||
* if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling
|
||||
* is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold
|
||||
* in tumor sample, it will not be reported.
|
||||
*
|
||||
* <b>If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you,
|
||||
* please email asivache at broadinstitute dot org and he will gladly explain everything in more detail.</b>
|
||||
* To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input
|
||||
* bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
|
||||
* on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Indel calls with associated metrics.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SomaticIndelDetector \
|
||||
* -o indels.vcf \
|
||||
* -verbose indels.txt
|
||||
* -I:normal normal.bam \
|
||||
* -I:tumor tumor.bam
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
||||
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
|
||||
public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
||||
// @Output
|
||||
// PrintStream out;
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
@Output(doc="File to write variants (indels) in VCF format",required=true)
|
||||
protected VCFWriter vcf_writer = null;
|
||||
|
||||
@Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
|
||||
|
|
@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
|||
|
||||
@Hidden
|
||||
@Argument(fullName = "genotype_intervals", shortName = "genotype",
|
||||
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
|
||||
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false)
|
||||
public String genotypeIntervalsFile = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
|
||||
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
|
||||
"if the list turns out to be unsorted, it will throw an exception. "+
|
||||
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
|
||||
"to sort and keep it in memory (increases memory usage!).")
|
||||
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
|
||||
"if the list turns out to be unsorted, it will throw an exception. "+
|
||||
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
|
||||
"to sort and keep it in memory (increases memory usage!).")
|
||||
protected boolean GENOTYPE_NOT_SORTED = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="unpaired", shortName="unpaired",
|
||||
doc="Perform unpaired calls (no somatic status detection)", required=false)
|
||||
@Argument(fullName="unpaired", shortName="unpaired",
|
||||
doc="Perform unpaired calls (no somatic status detection)", required=false)
|
||||
boolean call_unpaired = false;
|
||||
boolean call_somatic ;
|
||||
boolean call_somatic ;
|
||||
|
||||
@Argument(fullName="verboseOutput", shortName="verbose",
|
||||
doc="Verbose output file in text format", required=false)
|
||||
java.io.File verboseOutput = null;
|
||||
@Argument(fullName="verboseOutput", shortName="verbose",
|
||||
doc="Verbose output file in text format", required=false)
|
||||
java.io.File verboseOutput = null;
|
||||
|
||||
@Argument(fullName="bedOutput", shortName="bed",
|
||||
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
|
||||
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
|
||||
java.io.File bedOutput = null;
|
||||
|
||||
@Argument(fullName="minCoverage", shortName="minCoverage",
|
||||
doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false)
|
||||
int minCoverage = 6;
|
||||
@Argument(fullName="minCoverage", shortName="minCoverage",
|
||||
doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
|
||||
"with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
|
||||
int minCoverage = 6;
|
||||
|
||||
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
|
||||
doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
|
||||
int minNormalCoverage = 4;
|
||||
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
|
||||
doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+
|
||||
"or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
|
||||
int minNormalCoverage = 4;
|
||||
|
||||
@Argument(fullName="minFraction", shortName="minFraction",
|
||||
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
|
||||
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
|
||||
double minFraction = 0.3;
|
||||
@Argument(fullName="minFraction", shortName="minFraction",
|
||||
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
|
||||
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
|
||||
double minFraction = 0.3;
|
||||
|
||||
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
|
||||
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false)
|
||||
double minConsensusFraction = 0.7;
|
||||
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
|
||||
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+
|
||||
"all indel observations at the site exceeds this threshold", required=false)
|
||||
double minConsensusFraction = 0.7;
|
||||
|
||||
@Argument(fullName="minIndelCount", shortName="minCnt",
|
||||
doc="Minimum count of reads supporting consensus indel required for making the call. "+
|
||||
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
|
||||
"(minIndelCount not met) will not pass.", required=false)
|
||||
int minIndelCount = 0;
|
||||
@Argument(fullName="minIndelCount", shortName="minCnt",
|
||||
doc="Minimum count of reads supporting consensus indel required for making the call. "+
|
||||
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
|
||||
"(minIndelCount not met) will not pass.", required=false)
|
||||
int minIndelCount = 0;
|
||||
|
||||
@Argument(fullName="refseq", shortName="refseq",
|
||||
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
|
||||
String RefseqFileName = null;
|
||||
@Argument(fullName="refseq", shortName="refseq",
|
||||
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+
|
||||
"GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
|
||||
String RefseqFileName = null;
|
||||
|
||||
@Argument(fullName="blacklistedLanes", shortName="BL",
|
||||
doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
|
||||
"by this application, so they will not contribute indels to consider and will not be counted.", required=false)
|
||||
PlatformUnitFilterHelper dummy;
|
||||
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false;
|
||||
//@Argument(fullName="blacklistedLanes", shortName="BL",
|
||||
// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
|
||||
// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
|
||||
//PlatformUnitFilterHelper dummy;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",
|
||||
required=false) Boolean DEBUG = false;
|
||||
@Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
|
||||
"May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200;
|
||||
"May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+
|
||||
"window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+
|
||||
"than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+
|
||||
"if window is too small",required=false) int WINDOW_SIZE = 200;
|
||||
@Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
|
||||
" the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
|
||||
|
||||
|
||||
|
||||
private WindowContext tumor_context;
|
||||
private WindowContext normal_context;
|
||||
private int currentContigIndex = -1;
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray {
|
|||
public PhasingRead(int length, int mappingQual) {
|
||||
super(length);
|
||||
|
||||
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
|
||||
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual));
|
||||
|
||||
this.baseProbs = new PreciseNonNegativeDouble[length];
|
||||
Arrays.fill(this.baseProbs, null);
|
||||
|
|
|
|||
|
|
@ -44,12 +44,12 @@ public class RefSeqDataParser {
|
|||
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
|
||||
|
||||
Map<String, String> entriesToNames = new HashMap<String, String>();
|
||||
Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
|
||||
if (numRecords != null) {
|
||||
int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1);
|
||||
if (numRecords != -1) {
|
||||
boolean done = false;
|
||||
|
||||
if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1":
|
||||
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
|
||||
String name = vc.getAttributeAsString(nameKeyToUse, null);
|
||||
if (name != null) {
|
||||
entriesToNames.put(nameKeyToUse, name);
|
||||
done = true;
|
||||
|
|
@ -59,14 +59,14 @@ public class RefSeqDataParser {
|
|||
if (!done) {
|
||||
for (int i = 1; i <= numRecords; i++) {
|
||||
String key = nameKeyToUseMultiplePrefix + i;
|
||||
String name = vc.getAttributeAsStringNoException(key);
|
||||
String name = vc.getAttributeAsString(key, null);
|
||||
if (name != null)
|
||||
entriesToNames.put(key, name);
|
||||
}
|
||||
}
|
||||
}
|
||||
else { // no entry with the # of records:
|
||||
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
|
||||
String name = vc.getAttributeAsString(nameKeyToUse, null);
|
||||
if (name != null) {
|
||||
entriesToNames.put(nameKeyToUse, name);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ import java.util.*;
|
|||
*
|
||||
* <p>Body test</p>
|
||||
*/
|
||||
@Hidden
|
||||
public class DocumentationTest extends RodWalker<Integer, Integer> {
|
||||
// the docs for the arguments are in the collection
|
||||
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
|
|
|||
|
|
@ -76,6 +76,42 @@ import java.util.Map;
|
|||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
|
||||
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
|
||||
*
|
||||
* The first 20 lines of such a file is shown below.
|
||||
* * The file begins with a series of comment lines describing:
|
||||
* ** The number of counted loci
|
||||
* ** The number of counted bases
|
||||
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
|
||||
*
|
||||
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
|
||||
*
|
||||
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
|
||||
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
|
||||
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*
|
||||
* <pre>
|
||||
* # Counted Sites 19451059
|
||||
* # Counted Bases 56582018
|
||||
* # Skipped Sites 82666
|
||||
* # Fraction Skipped 1 / 235 bp
|
||||
* ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
|
||||
* SRR006446,11,65,CA,9,1,10
|
||||
* SRR006446,11,48,TA,10,0,40
|
||||
* SRR006446,11,67,AA,27,0,40
|
||||
* SRR006446,11,61,GA,11,1,10
|
||||
* SRR006446,12,34,CA,47,1,17
|
||||
* SRR006446,12,30,GA,52,1,17
|
||||
* SRR006446,12,36,AA,352,1,25
|
||||
* SRR006446,12,17,TA,182,11,12
|
||||
* SRR006446,11,48,TG,2,0,40
|
||||
* SRR006446,11,67,AG,1,0,40
|
||||
* SRR006446,12,34,CG,9,0,40
|
||||
* SRR006446,12,30,GG,43,0,40
|
||||
* ERR001876,4,31,AG,1,0,40
|
||||
* ERR001876,4,31,AT,2,2,1
|
||||
* ERR001876,4,31,CA,1,0,40
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ import java.util.List;
|
|||
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
|
||||
*</pre>
|
||||
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
|
||||
*
|
||||
*<pre>
|
||||
* Valid // amplicon is valid
|
||||
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
|
||||
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
|
||||
|
|
@ -72,10 +72,10 @@ import java.util.List;
|
|||
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
|
||||
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
|
||||
* INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
|
||||
* </p>
|
||||
* </pre></p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre></pre>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T ValidationAmplicons
|
||||
|
|
|
|||
|
|
@ -55,7 +55,23 @@ import java.util.*;
|
|||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Evaluation tables.
|
||||
* Evaluation tables detailing the results of the eval modules which were applied.
|
||||
* For example:
|
||||
* <pre>
|
||||
* output.eval.gatkreport:
|
||||
* ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
|
||||
* CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ...
|
||||
* CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ...
|
||||
* CountVariants dbsnp CpG eval none known 65900028 47068 0 47068 0.00071423 ...
|
||||
* CountVariants dbsnp CpG eval none novel 65900028 88702 0 88702 0.00134601 ...
|
||||
* CountVariants dbsnp all eval none all 65900028 330818 0 330818 0.00502000 ...
|
||||
* CountVariants dbsnp all eval none known 65900028 120685 0 120685 0.00183133 ...
|
||||
* CountVariants dbsnp all eval none novel 65900028 210133 0 210133 0.00318866 ...
|
||||
* CountVariants dbsnp non_CpG eval none all 65900028 195048 0 195048 0.00295976 ...
|
||||
* CountVariants dbsnp non_CpG eval none known 65900028 73617 0 73617 0.00111710 ...
|
||||
* CountVariants dbsnp non_CpG eval none novel 65900028 121431 0 121431 0.00184265 ...
|
||||
* ...
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
|
|
@ -149,12 +165,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;
|
||||
|
||||
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false)
|
||||
private String TRANCHE_FILENAME = null;
|
||||
|
||||
@Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false)
|
||||
private File ancestralAlignmentsFile = null;
|
||||
|
||||
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
|
||||
private boolean requireStrictAlleleMatch = false;
|
||||
|
||||
// Variables
|
||||
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
|
||||
|
||||
|
|
@ -226,16 +242,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
}
|
||||
sampleNamesForStratification.add(ALL_SAMPLE_NAME);
|
||||
|
||||
// Add select expressions for anything in the tranches file
|
||||
if ( TRANCHE_FILENAME != null ) {
|
||||
// we are going to build a few select names automatically from the tranches file
|
||||
for ( Tranche t : Tranche.readTranches(new File(TRANCHE_FILENAME)) ) {
|
||||
logger.info("Adding select for all variant above the pCut of : " + t);
|
||||
SELECT_EXPS.add(String.format(VariantRecalibrator.VQS_LOD_KEY + " >= %.2f", t.minVQSLod));
|
||||
SELECT_NAMES.add(String.format("TS-%.2f", t.ts));
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize select expressions
|
||||
for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) {
|
||||
SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp);
|
||||
|
|
@ -245,18 +251,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
// Initialize the set of stratifications and evaluations to use
|
||||
stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
|
||||
Set<Class<? extends VariantEvaluator>> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
|
||||
boolean usingJEXL = false;
|
||||
for ( VariantStratifier vs : getStratificationObjects() ) {
|
||||
if ( vs.getClass().getSimpleName().equals("Filter") )
|
||||
byFilterIsEnabled = true;
|
||||
else if ( vs.getClass().getSimpleName().equals("Sample") )
|
||||
perSampleIsEnabled = true;
|
||||
usingJEXL = usingJEXL || vs.getClass().equals(JexlExpression.class);
|
||||
}
|
||||
|
||||
if ( TRANCHE_FILENAME != null && ! usingJEXL )
|
||||
throw new UserException.BadArgumentValue("tf", "Requires the JexlExpression ST to enabled");
|
||||
|
||||
// Initialize the evaluation contexts
|
||||
evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null);
|
||||
|
||||
|
|
@ -378,16 +379,16 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
if ( matchingComps.size() == 0 )
|
||||
return null;
|
||||
|
||||
// find the comp which matches the alternate allele from eval
|
||||
// find the comp which matches both the reference allele and alternate allele from eval
|
||||
Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0);
|
||||
for ( VariantContext comp : matchingComps ) {
|
||||
Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0);
|
||||
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp)) )
|
||||
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) )
|
||||
return comp;
|
||||
}
|
||||
|
||||
// if none match, just return the first one
|
||||
return matchingComps.get(0);
|
||||
// if none match, just return the first one unless we require a strict match
|
||||
return (requireStrictAlleleMatch ? null : matchingComps.get(0));
|
||||
}
|
||||
|
||||
public Integer treeReduce(Integer lhs, Integer rhs) { return null; }
|
||||
|
|
|
|||
|
|
@ -22,9 +22,6 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
|
|||
@DataPoint(description = "number of eval SNP sites")
|
||||
long nEvalVariants = 0;
|
||||
|
||||
@DataPoint(description = "number of comp SNP sites")
|
||||
long nCompVariants = 0;
|
||||
|
||||
@DataPoint(description = "number of eval sites outside of comp sites")
|
||||
long novelSites = 0;
|
||||
|
||||
|
|
@ -75,10 +72,9 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
|
|||
}
|
||||
|
||||
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
boolean evalIsGood = eval != null && eval.isVariant();
|
||||
boolean compIsGood = comp != null && comp.isNotFiltered() && (eval == null || comp.getType() == eval.getType());
|
||||
boolean evalIsGood = eval != null && eval.isPolymorphic();
|
||||
boolean compIsGood = comp != null && comp.isNotFiltered();
|
||||
|
||||
if (compIsGood) nCompVariants++; // count the number of comp events
|
||||
if (evalIsGood) nEvalVariants++; // count the number of eval events
|
||||
|
||||
if (compIsGood && evalIsGood) {
|
||||
|
|
|
|||
|
|
@ -100,21 +100,22 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
|||
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
|
||||
// added in a proxy check for monomorphic status here.
|
||||
// Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call.
|
||||
if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() + vc1.getNoCallCount() == vc1.getNSamples()) ) {
|
||||
if ( vc1.isMonomorphic() ) {
|
||||
nRefLoci++;
|
||||
} else {
|
||||
switch (vc1.getType()) {
|
||||
case NO_VARIATION:
|
||||
// shouldn't get here
|
||||
break;
|
||||
case SNP:
|
||||
nVariantLoci++;
|
||||
nSNPs++;
|
||||
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
|
||||
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
|
||||
break;
|
||||
case MNP:
|
||||
nVariantLoci++;
|
||||
nMNPs++;
|
||||
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
|
||||
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
|
||||
break;
|
||||
case INDEL:
|
||||
nVariantLoci++;
|
||||
|
|
@ -136,7 +137,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
|||
|
||||
String refStr = vc1.getReference().getBaseString().toUpperCase();
|
||||
|
||||
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null;
|
||||
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null;
|
||||
// if (aaStr.equals(".")) {
|
||||
// aaStr = refStr;
|
||||
// }
|
||||
|
|
|
|||
|
|
@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
|
|||
}
|
||||
|
||||
public static Double getPQ(Genotype gt) {
|
||||
return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY);
|
||||
Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
|
||||
return d == -1 ? null : d;
|
||||
}
|
||||
|
||||
public static boolean topMatchesTop(AllelePair b1, AllelePair b2) {
|
||||
|
|
|
|||
|
|
@ -90,18 +90,19 @@ public class IndelLengthHistogram extends VariantEvaluator {
|
|||
public int getComparisonOrder() { return 1; } // need only the evals
|
||||
|
||||
public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( ! vc1.isBiallelic() && vc1.isIndel() ) {
|
||||
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
|
||||
return vc1.toString(); // biallelic sites are output
|
||||
}
|
||||
|
||||
if ( vc1.isIndel() ) {
|
||||
if ( vc1.isIndel() && vc1.isPolymorphic() ) {
|
||||
|
||||
if ( ! vc1.isBiallelic() ) {
|
||||
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
|
||||
return vc1.toString(); // biallelic sites are output
|
||||
}
|
||||
|
||||
// only count simple insertions/deletions, not complex indels
|
||||
if ( vc1.isSimpleInsertion() ) {
|
||||
indelHistogram.update(vc1.getAlternateAllele(0).length());
|
||||
} else if ( vc1.isSimpleDeletion() ) {
|
||||
indelHistogram.update(-vc1.getReference().length());
|
||||
} else {
|
||||
throw new ReviewedStingException("Indel type that is not insertion or deletion.");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -270,7 +270,7 @@ public class IndelStatistics extends VariantEvaluator {
|
|||
|
||||
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
if (eval != null ) {
|
||||
if (eval != null && eval.isPolymorphic()) {
|
||||
if ( indelStats == null ) {
|
||||
indelStats = new IndelStats(eval);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
|
|||
if ( eval.hasGenotypes() )
|
||||
ac = eval.getChromosomeCount(eval.getAlternateAllele(0));
|
||||
else if ( eval.hasAttribute("AC") ) {
|
||||
ac = Integer.valueOf(eval.getAttributeAsString("AC"));
|
||||
ac = eval.getAttributeAsInt("AC", -1);
|
||||
}
|
||||
|
||||
if ( ac != -1 ) {
|
||||
|
|
@ -166,7 +166,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
|
|||
}
|
||||
}
|
||||
|
||||
if ( eval.isSNP() && eval.isBiallelic() && metrics != null ) {
|
||||
if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) {
|
||||
metrics.incrValue(eval);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,77 +37,74 @@ public class ThetaVariantEvaluator extends VariantEvaluator {
|
|||
}
|
||||
|
||||
public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) {
|
||||
if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) {
|
||||
return null; //no interesting sites
|
||||
}
|
||||
|
||||
if (vc.hasGenotypes()) {
|
||||
//this maps allele to a count
|
||||
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
|
||||
|
||||
//this maps allele to a count
|
||||
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
|
||||
int numHetsHere = 0;
|
||||
float numGenosHere = 0;
|
||||
int numIndsHere = 0;
|
||||
|
||||
int numHetsHere = 0;
|
||||
float numGenosHere = 0;
|
||||
int numIndsHere = 0;
|
||||
for (Genotype genotype : vc.getGenotypes().values()) {
|
||||
numIndsHere++;
|
||||
if (!genotype.isNoCall()) {
|
||||
//increment stats for heterozygosity
|
||||
if (genotype.isHet()) {
|
||||
numHetsHere++;
|
||||
}
|
||||
|
||||
for (Genotype genotype : vc.getGenotypes().values()) {
|
||||
numIndsHere++;
|
||||
if (!genotype.isNoCall()) {
|
||||
//increment stats for heterozygosity
|
||||
if (genotype.isHet()) {
|
||||
numHetsHere++;
|
||||
}
|
||||
numGenosHere++;
|
||||
//increment stats for pairwise mismatches
|
||||
|
||||
numGenosHere++;
|
||||
//increment stats for pairwise mismatches
|
||||
|
||||
for (Allele allele : genotype.getAlleles()) {
|
||||
if (allele.isNonNull() && allele.isCalled()) {
|
||||
String alleleString = allele.toString();
|
||||
alleleCounts.putIfAbsent(alleleString, 0);
|
||||
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
|
||||
}
|
||||
for (Allele allele : genotype.getAlleles()) {
|
||||
if (allele.isNonNull() && allele.isCalled()) {
|
||||
String alleleString = allele.toString();
|
||||
alleleCounts.putIfAbsent(alleleString, 0);
|
||||
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (numGenosHere > 0) {
|
||||
//only if have one called genotype at least
|
||||
this.numSites++;
|
||||
}
|
||||
if (numGenosHere > 0) {
|
||||
//only if have one called genotype at least
|
||||
this.numSites++;
|
||||
|
||||
this.totalHet += numHetsHere / numGenosHere;
|
||||
this.totalHet += numHetsHere / numGenosHere;
|
||||
|
||||
//compute based on num sites
|
||||
float harmonicFactor = 0;
|
||||
for (int i = 1; i <= numIndsHere; i++) {
|
||||
harmonicFactor += 1.0 / i;
|
||||
}
|
||||
this.thetaRegionNumSites += 1.0 / harmonicFactor;
|
||||
//compute based on num sites
|
||||
float harmonicFactor = 0;
|
||||
for (int i = 1; i <= numIndsHere; i++) {
|
||||
harmonicFactor += 1.0 / i;
|
||||
}
|
||||
this.thetaRegionNumSites += 1.0 / harmonicFactor;
|
||||
|
||||
//now compute pairwise mismatches
|
||||
float numPairwise = 0;
|
||||
float numDiffs = 0;
|
||||
for (String allele1 : alleleCounts.keySet()) {
|
||||
int allele1Count = alleleCounts.get(allele1);
|
||||
//now compute pairwise mismatches
|
||||
float numPairwise = 0;
|
||||
float numDiffs = 0;
|
||||
for (String allele1 : alleleCounts.keySet()) {
|
||||
int allele1Count = alleleCounts.get(allele1);
|
||||
|
||||
for (String allele2 : alleleCounts.keySet()) {
|
||||
if (allele1.compareTo(allele2) < 0) {
|
||||
continue;
|
||||
}
|
||||
if (allele1 .compareTo(allele2) == 0) {
|
||||
numPairwise += allele1Count * (allele1Count - 1) * .5;
|
||||
for (String allele2 : alleleCounts.keySet()) {
|
||||
if (allele1.compareTo(allele2) < 0) {
|
||||
continue;
|
||||
}
|
||||
if (allele1 .compareTo(allele2) == 0) {
|
||||
numPairwise += allele1Count * (allele1Count - 1) * .5;
|
||||
|
||||
}
|
||||
else {
|
||||
int allele2Count = alleleCounts.get(allele2);
|
||||
numPairwise += allele1Count * allele2Count;
|
||||
numDiffs += allele1Count * allele2Count;
|
||||
}
|
||||
}
|
||||
else {
|
||||
int allele2Count = alleleCounts.get(allele2);
|
||||
numPairwise += allele1Count * allele2Count;
|
||||
numDiffs += allele1Count * allele2Count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numPairwise > 0) {
|
||||
this.totalAvgDiffs += numDiffs / numPairwise;
|
||||
}
|
||||
if (numPairwise > 0) {
|
||||
this.totalAvgDiffs += numDiffs / numPairwise;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
|
|||
}
|
||||
|
||||
public void updateTiTv(VariantContext vc, boolean updateStandard) {
|
||||
if (vc != null && vc.isSNP() && vc.isBiallelic()) {
|
||||
if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) {
|
||||
if (VariantContextUtils.isTransition(vc)) {
|
||||
if (updateStandard) nTiInComp++;
|
||||
else nTi++;
|
||||
|
|
@ -49,18 +49,14 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
|
|||
else nTv++;
|
||||
}
|
||||
|
||||
String refStr = vc.getReference().getBaseString().toUpperCase();
|
||||
String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase();
|
||||
|
||||
if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) {
|
||||
BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]);
|
||||
|
||||
//System.out.println(refStr + " " + vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase() + " " + aaSubType);
|
||||
|
||||
if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSITION) {
|
||||
nTiDerived++;
|
||||
} else if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSVERSION) {
|
||||
nTvDerived++;
|
||||
if (vc.hasAttribute("ANCESTRALALLELE")) {
|
||||
final String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", "null").toUpperCase();
|
||||
if ( ! aaStr.equals(".") ) {
|
||||
switch ( BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0] ) ) {
|
||||
case TRANSITION: nTiDerived++; break;
|
||||
case TRANSVERSION: nTvDerived++; break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -117,7 +117,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
|
|||
public SiteStatus calcSiteStatus(VariantContext vc) {
|
||||
if ( vc == null ) return SiteStatus.NO_CALL;
|
||||
if ( vc.isFiltered() ) return SiteStatus.FILTERED;
|
||||
if ( ! vc.isVariant() ) return SiteStatus.MONO;
|
||||
if ( vc.isMonomorphic() ) return SiteStatus.MONO;
|
||||
if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes
|
||||
|
||||
if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
|
||||
int ac = 0;
|
||||
|
|
@ -130,10 +131,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
|
|||
//// System.out.printf(" ac = %d%n", ac);
|
||||
}
|
||||
else
|
||||
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY);
|
||||
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
|
||||
return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO;
|
||||
} else if ( vc.hasGenotypes() ) {
|
||||
return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO;
|
||||
} else {
|
||||
return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do
|
||||
//return SiteStatus.NO_CALL; // we can't figure out what to do
|
||||
|
|
|
|||
|
|
@ -232,7 +232,7 @@ public class VariantQualityScore extends VariantEvaluator {
|
|||
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
final String interesting = null;
|
||||
|
||||
if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
|
||||
if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
|
||||
if( titvStats == null ) { titvStats = new TiTvStats(); }
|
||||
titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval));
|
||||
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ public class AlleleCount extends VariantStratifier {
|
|||
if (eval != null) {
|
||||
int AC = -1;
|
||||
if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) {
|
||||
AC = eval.getAttributeAsInt("AC");
|
||||
AC = eval.getAttributeAsInt("AC", 0);
|
||||
} else if ( eval.isVariant() ) {
|
||||
for (Allele allele : eval.getAlternateAlleles())
|
||||
AC = Math.max(AC, eval.getChromosomeCount(allele));
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier {
|
|||
|
||||
if (eval != null) {
|
||||
try {
|
||||
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3))));
|
||||
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3))));
|
||||
} catch (Exception e) {
|
||||
return relevantStates;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -90,8 +90,8 @@ public class Degeneracy extends VariantStratifier {
|
|||
Integer frame = null;
|
||||
|
||||
if (eval.hasAttribute("refseq.functionalClass")) {
|
||||
aa = eval.getAttributeAsString("refseq.variantAA");
|
||||
frame = eval.getAttributeAsInt("refseq.frame");
|
||||
aa = eval.getAttributeAsString("refseq.variantAA", null);
|
||||
frame = eval.getAttributeAsInt("refseq.frame", 0);
|
||||
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
|
||||
int annotationId = 1;
|
||||
String key;
|
||||
|
|
@ -99,7 +99,7 @@ public class Degeneracy extends VariantStratifier {
|
|||
do {
|
||||
key = String.format("refseq.functionalClass_%d", annotationId);
|
||||
|
||||
String newtype = eval.getAttributeAsString(key);
|
||||
String newtype = eval.getAttributeAsString(key, null);
|
||||
|
||||
if ( newtype != null &&
|
||||
( type == null ||
|
||||
|
|
@ -109,13 +109,13 @@ public class Degeneracy extends VariantStratifier {
|
|||
type = newtype;
|
||||
|
||||
String aakey = String.format("refseq.variantAA_%d", annotationId);
|
||||
aa = eval.getAttributeAsString(aakey);
|
||||
aa = eval.getAttributeAsString(aakey, null);
|
||||
|
||||
if (aa != null) {
|
||||
String framekey = String.format("refseq.frame_%d", annotationId);
|
||||
|
||||
if (eval.hasAttribute(framekey)) {
|
||||
frame = eval.getAttributeAsInt(framekey);
|
||||
frame = eval.getAttributeAsInt(framekey, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
|
|||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -11,25 +12,34 @@ import java.util.List;
|
|||
* Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation.
|
||||
*/
|
||||
public class FunctionalClass extends VariantStratifier {
|
||||
@Override
|
||||
public void initialize() {
|
||||
states.add("all");
|
||||
states.add("silent");
|
||||
states.add("missense");
|
||||
states.add("nonsense");
|
||||
|
||||
public enum FunctionalType {
|
||||
silent,
|
||||
missense,
|
||||
nonsense
|
||||
}
|
||||
|
||||
|
||||
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
|
||||
@Override
|
||||
public void initialize() {
|
||||
states.add("all");
|
||||
for ( FunctionalType type : FunctionalType.values() )
|
||||
states.add(type.name());
|
||||
}
|
||||
|
||||
|
||||
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
|
||||
ArrayList<String> relevantStates = new ArrayList<String>();
|
||||
|
||||
relevantStates.add("all");
|
||||
|
||||
if (eval != null && eval.isVariant()) {
|
||||
String type = null;
|
||||
FunctionalType type = null;
|
||||
|
||||
if (eval.hasAttribute("refseq.functionalClass")) {
|
||||
type = eval.getAttributeAsString("refseq.functionalClass");
|
||||
try {
|
||||
type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass", null));
|
||||
} catch ( Exception e ) {} // don't error out if the type isn't supported
|
||||
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
|
||||
int annotationId = 1;
|
||||
String key;
|
||||
|
|
@ -37,24 +47,36 @@ public class FunctionalClass extends VariantStratifier {
|
|||
do {
|
||||
key = String.format("refseq.functionalClass_%d", annotationId);
|
||||
|
||||
String newtype = eval.getAttributeAsString(key);
|
||||
|
||||
if ( newtype != null && !newtype.equalsIgnoreCase("null") &&
|
||||
( type == null ||
|
||||
( type.equals("silent") && !newtype.equals("silent") ) ||
|
||||
( type.equals("missense") && newtype.equals("nonsense") ) )
|
||||
) {
|
||||
type = newtype;
|
||||
String newtypeStr = eval.getAttributeAsString(key, null);
|
||||
if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) {
|
||||
try {
|
||||
FunctionalType newType = FunctionalType.valueOf(newtypeStr);
|
||||
if ( type == null ||
|
||||
( type == FunctionalType.silent && newType != FunctionalType.silent ) ||
|
||||
( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) {
|
||||
type = newType;
|
||||
}
|
||||
} catch ( Exception e ) {} // don't error out if the type isn't supported
|
||||
}
|
||||
|
||||
annotationId++;
|
||||
} while (eval.hasAttribute(key));
|
||||
|
||||
} else if ( eval.hasAttribute(SnpEff.InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()) ) {
|
||||
try {
|
||||
SnpEff.EffectFunctionalClass snpEffFunctionalClass = SnpEff.EffectFunctionalClass.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()).toString());
|
||||
if ( snpEffFunctionalClass == SnpEff.EffectFunctionalClass.NONSENSE )
|
||||
type = FunctionalType.nonsense;
|
||||
else if ( snpEffFunctionalClass == SnpEff.EffectFunctionalClass.MISSENSE )
|
||||
type = FunctionalType.missense;
|
||||
else if ( snpEffFunctionalClass == SnpEff.EffectFunctionalClass.SILENT )
|
||||
type = FunctionalType.silent;
|
||||
}
|
||||
catch ( Exception e ) {} // don't error out if the type isn't supported
|
||||
}
|
||||
|
||||
if (type != null) {
|
||||
if (type.equals("silent")) { relevantStates.add("silent"); }
|
||||
else if (type.equals("missense")) { relevantStates.add("missense"); }
|
||||
else if (type.equals("nonsense")) { relevantStates.add("nonsense"); }
|
||||
if ( type != null ) {
|
||||
relevantStates.add(type.name());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -277,7 +277,7 @@ public class VariantEvalUtils {
|
|||
* @return a new VariantContext with just the requested samples
|
||||
*/
|
||||
public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection<String> sampleNames) {
|
||||
VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values());
|
||||
VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles());
|
||||
|
||||
HashMap<String, Object> newAts = new HashMap<String, Object>(vcsub.getAttributes());
|
||||
|
||||
|
|
@ -354,7 +354,7 @@ public class VariantEvalUtils {
|
|||
|
||||
private void addMapping(HashMap<String, Set<VariantContext>> mappings, String sample, VariantContext vc) {
|
||||
if ( !mappings.containsKey(sample) )
|
||||
mappings.put(sample, new HashSet<VariantContext>());
|
||||
mappings.put(sample, new LinkedHashSet<VariantContext>());
|
||||
mappings.get(sample).add(vc);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 3/12/11
|
||||
*/
|
||||
|
||||
public class TrainingSet {
|
||||
|
||||
public RodBinding<VariantContext> rodBinding;
|
||||
public boolean isKnown = false;
|
||||
public boolean isTraining = false;
|
||||
public boolean isAntiTraining = false;
|
||||
public boolean isTruth = false;
|
||||
public boolean isConsensus = false;
|
||||
public double prior = 0.0;
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(TrainingSet.class);
|
||||
|
||||
public TrainingSet( final RodBinding<VariantContext> rodBinding) {
|
||||
this.rodBinding = rodBinding;
|
||||
|
||||
final Tags tags = rodBinding.getTags();
|
||||
final String name = rodBinding.getName();
|
||||
|
||||
// Parse the tags to decide which tracks have which properties
|
||||
if( tags != null ) {
|
||||
isKnown = tags.containsKey("known") && tags.getValue("known").equals("true");
|
||||
isTraining = tags.containsKey("training") && tags.getValue("training").equals("true");
|
||||
isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true");
|
||||
isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true");
|
||||
isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true");
|
||||
prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior );
|
||||
}
|
||||
|
||||
// Report back to the user which tracks were found and the properties that were detected
|
||||
if( !isConsensus && !isAntiTraining ) {
|
||||
logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) );
|
||||
} else if( isConsensus ) {
|
||||
logger.info( String.format( "Found consensus track: %s", name) );
|
||||
} else {
|
||||
logger.info( String.format( "Found bad sites training track: %s", name) );
|
||||
}
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue