Bringing latest updates of ReduceReads to the master repository
This commit is contained in:
commit
758ecf2d43
299
build.xml
299
build.xml
|
|
@ -163,6 +163,14 @@
|
|||
<!-- Remove old versions of ivy jars AFTER the ivy:retrieve has been class loaded. -->
|
||||
<delete file="${ivy.jar.dir}/ivy-2.0.0.jar"/>
|
||||
<delete file="${ivy.jar.dir}/ivy-2.2.0-rc1.jar"/>
|
||||
<!--
|
||||
An old versions of the ivy-1.4.1.xml does not contain /ivy-module/configuration/conf/@name="compile".
|
||||
Easier to upgrade to 1.4.4 than try to deal with xmlproperty and conditional deletion in ant.
|
||||
Just in case we remove explicit 1.4.4 and go back to 1.4.1, try to clean out the file for now.
|
||||
-->
|
||||
<delete file="${ivy.home}/cache/javax.mail/mail/ivy-1.4.1.xml"/>
|
||||
<delete file="${ivy.home}/cache/javax.mail/mail/ivydata-1.4.1.properties"/>
|
||||
<delete file="${ivy.home}/cache/javax.mail/mail/jars/mail-1.4.1.jar"/>
|
||||
</target>
|
||||
|
||||
<target name="init.buildall">
|
||||
|
|
@ -709,53 +717,6 @@
|
|||
</antcall>
|
||||
</target>
|
||||
|
||||
<target name="test.init.compile">
|
||||
<mkdir dir="${java.test.classes}"/>
|
||||
<mkdir dir="${scala.test.classes}"/>
|
||||
<antcall target="resolve">
|
||||
<param name="ivy.conf" value="test"/>
|
||||
</antcall>
|
||||
</target>
|
||||
|
||||
<target name="test.java.compile" depends="init.buildall,dist,test.init.compile">
|
||||
<echo message="Sting: Compiling test cases!"/>
|
||||
<javac fork="true" memoryMaximumSize="512m" destdir="${java.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
|
||||
<src path="${java.public.test.sources}"/>
|
||||
<src path="${java.private.test.sources}"/>
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.classes}"/>
|
||||
<pathelement location="${java.contracts}"/>
|
||||
<pathelement location="${lib.dir}/testng-5.14.1.jar"/>
|
||||
</classpath>
|
||||
<compilerarg value="-proc:none"/>
|
||||
<!--
|
||||
<compilerarg value="-Acom.google.java.contract.debug"/>
|
||||
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
|
||||
-->
|
||||
</javac>
|
||||
</target>
|
||||
|
||||
<target name="test.scala.compile" depends="test.java.compile,scala.compile" if="scala.include">
|
||||
<echo message="Scala: Compiling test cases!"/>
|
||||
<antcall target="resolve">
|
||||
<param name="ivy.conf" value="test"/>
|
||||
</antcall>
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.test.classes}" deprecation="yes" unchecked="yes">
|
||||
<src path="${scala.public.test.sources}" />
|
||||
<src path="${scala.private.test.sources}" />
|
||||
<include name="**/*.scala"/>
|
||||
<classpath>
|
||||
<path refid="scala.dependencies"/>
|
||||
<pathelement location="${scala.test.classes}"/>
|
||||
<pathelement location="${java.test.classes}"/>
|
||||
<pathelement location="${lib.dir}/testng-5.14.1.jar"/>
|
||||
</classpath>
|
||||
</scalac>
|
||||
</target>
|
||||
|
||||
<target name="test.compile" depends="init.usecontracts,test.java.compile,test.scala.compile" />
|
||||
|
||||
<!-- new scala target -->
|
||||
|
||||
<target name="scala" description="build the scala directory">
|
||||
|
|
@ -769,20 +730,113 @@
|
|||
<!-- ***************************************************************************** -->
|
||||
<!-- where to put reports and tests-->
|
||||
<property name="report" value="${build.dir}/report"/>
|
||||
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
|
||||
<property name="test.output" value="${dist.dir}/test"/>
|
||||
<property name="java.public.test.sources" value="public/java/test"/>
|
||||
<property name="java.private.test.sources" value="private/java/test"/>
|
||||
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
|
||||
<property name="java.public.test.classes" value="${java.test.classes}/public"/>
|
||||
<property name="java.private.test.classes" value="${java.test.classes}/private"/>
|
||||
<property name="java.public.test.sources" value="${public.dir}/java/test"/>
|
||||
<property name="java.private.test.sources" value="${private.dir}/java/test"/>
|
||||
<property name="scala.test.classes" value="${build.dir}/scala/testclasses"/>
|
||||
<property name="scala.public.test.sources" value="public/scala/test"/>
|
||||
<property name="scala.private.test.sources" value="private/scala/test"/>
|
||||
<property name="scala.public.test.classes" value="${scala.test.classes}/public"/>
|
||||
<property name="scala.private.test.classes" value="${scala.test.classes}/private"/>
|
||||
<property name="scala.public.test.sources" value="${public.dir}/scala/test"/>
|
||||
<property name="scala.private.test.sources" value="${private.dir}/scala/test"/>
|
||||
<property name="testng.jar" value="${lib.dir}/testng-5.14.1.jar"/>
|
||||
<!-- provide a ceiling on the memory that unit/integration tests can consume. -->
|
||||
<property name="test.maxmemory" value="4g"/>
|
||||
|
||||
<target name="test.init.compile">
|
||||
<mkdir dir="${java.test.classes}"/>
|
||||
<mkdir dir="${scala.test.classes}"/>
|
||||
<antcall target="resolve">
|
||||
<param name="ivy.conf" value="test"/>
|
||||
</antcall>
|
||||
</target>
|
||||
|
||||
<target name="test.java.public.compile" depends="dist,test.init.compile">
|
||||
<mkdir dir="${java.public.test.classes}"/>
|
||||
<echo message="Sting: Compiling public test cases!"/>
|
||||
<javac fork="true" memoryMaximumSize="512m" destdir="${java.public.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
|
||||
<src path="${java.public.test.sources}"/>
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.classes}"/>
|
||||
<pathelement location="${java.contracts}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
<compilerarg value="-proc:none"/>
|
||||
<!--
|
||||
<compilerarg value="-Acom.google.java.contract.debug"/>
|
||||
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
|
||||
-->
|
||||
</javac>
|
||||
</target>
|
||||
|
||||
<target name="test.java.private.compile" depends="dist,test.init.compile,test.java.public.compile" if="include.private">
|
||||
<mkdir dir="${java.private.test.classes}"/>
|
||||
<echo message="Sting: Compiling private test cases!"/>
|
||||
<javac fork="true" memoryMaximumSize="512m" destdir="${java.private.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
|
||||
<src path="${java.private.test.sources}"/>
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.public.test.classes}"/>
|
||||
<pathelement location="${java.classes}"/>
|
||||
<pathelement location="${java.contracts}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
<compilerarg value="-proc:none"/>
|
||||
<!--
|
||||
<compilerarg value="-Acom.google.java.contract.debug"/>
|
||||
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
|
||||
-->
|
||||
</javac>
|
||||
</target>
|
||||
|
||||
<target name="test.java.compile" depends="test.java.public.compile, test.java.private.compile"/>
|
||||
|
||||
<target name="test.scala.public.compile" depends="test.java.compile,scala.compile" if="scala.include">
|
||||
<mkdir dir="${scala.public.test.classes}"/>
|
||||
<echo message="Scala: Compiling public test cases!"/>
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.public.test.classes}" deprecation="yes" unchecked="yes">
|
||||
<src path="${scala.public.test.sources}" />
|
||||
<classpath>
|
||||
<path refid="scala.dependencies"/>
|
||||
<pathelement location="${java.public.test.classes}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
</scalac>
|
||||
</target>
|
||||
|
||||
<target name="test.scala.private.compile" depends="test.java.compile,scala.compile,test.scala.public.compile" if="include.scala.private">
|
||||
<mkdir dir="${scala.private.test.classes}"/>
|
||||
<echo message="Scala: Compiling private test cases!"/>
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.private.test.classes}" deprecation="yes" unchecked="yes">
|
||||
<src path="${scala.private.test.sources}" />
|
||||
<classpath>
|
||||
<path refid="scala.dependencies"/>
|
||||
<pathelement location="${scala.public.test.classes}"/>
|
||||
<pathelement location="${java.public.test.classes}"/>
|
||||
<pathelement location="${java.private.test.classes}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
</scalac>
|
||||
</target>
|
||||
|
||||
<target name="test.scala.compile" depends="test.scala.public.compile,test.scala.private.compile"/>
|
||||
|
||||
<target name="test.compile" depends="init.usecontracts,test.java.compile,test.scala.compile" />
|
||||
|
||||
<!-- TEST -->
|
||||
<macrodef name="run-test">
|
||||
<attribute name="testtype"/>
|
||||
<attribute name="outputdir"/>
|
||||
<attribute name="runfailed"/>
|
||||
|
||||
<sequential>
|
||||
<condition property="run.failed.tests">
|
||||
<equals arg1="@{runfailed}" arg2="true"/>
|
||||
</condition>
|
||||
|
||||
<!-- Get the pipeline run type. Default to dry. -->
|
||||
<condition property="pipeline.run" value="dry" else="${pipeline.run}">
|
||||
<equals arg1="${pipeline.run}" arg2="$${pipeline.run}" />
|
||||
|
|
@ -792,10 +846,10 @@
|
|||
<isset property="include.contracts" />
|
||||
</condition>
|
||||
|
||||
<mkdir dir="${report}/@{testtype}"/>
|
||||
<mkdir dir="@{outputdir}"/>
|
||||
<echo message="Sting: Running @{testtype} test cases!"/>
|
||||
<taskdef resource="testngtasks" classpath="${lib.dir}/testng-5.14.1.jar"/>
|
||||
<testng outputDir="${report}/@{testtype}"
|
||||
<taskdef resource="testngtasks" classpath="${testng.jar}"/>
|
||||
<testng outputDir="@{outputdir}"
|
||||
haltOnFailure="false" failureProperty="test.failure"
|
||||
verbose="2"
|
||||
workingDir="${basedir}"
|
||||
|
|
@ -813,117 +867,108 @@
|
|||
<pathelement location="${java.classes}" />
|
||||
<pathelement location="${scala.classes}" />
|
||||
<pathelement location="${java.contracts}" />
|
||||
<pathelement location="${java.test.classes}" />
|
||||
<pathelement location="${scala.test.classes}" />
|
||||
<pathelement location="${java.public.test.classes}" />
|
||||
<pathelement location="${java.private.test.classes}" />
|
||||
<pathelement location="${scala.public.test.classes}" />
|
||||
<pathelement location="${scala.private.test.classes}" />
|
||||
</classpath>
|
||||
|
||||
<classfileset dir="${java.test.classes}" includes="**/@{testtype}.class"/>
|
||||
<classfileset dir="${scala.test.classes}" includes="**/@{testtype}*.class" />
|
||||
<classfileset dir="${java.public.test.classes}" includes="**/@{testtype}.class"/>
|
||||
<classfileset dir="${java.private.test.classes}" erroronmissingdir="false">
|
||||
<include name="**/@{testtype}.class" if="include.private"/>
|
||||
</classfileset>
|
||||
<classfileset dir="${scala.public.test.classes}" erroronmissingdir="false">
|
||||
<include name="**/@{testtype}*.class" if="scala.include"/>
|
||||
</classfileset>
|
||||
<classfileset dir="${scala.private.test.classes}" erroronmissingdir="false">
|
||||
<include name="**/@{testtype}*.class" if="include.scala.private"/>
|
||||
</classfileset>
|
||||
|
||||
<xmlfileset dir="${basedir}">
|
||||
<include name="@{testtype}" if="run.failed.tests"/>
|
||||
</xmlfileset>
|
||||
</testng>
|
||||
|
||||
<!-- generate a report for Bamboo or Hudson to read in -->
|
||||
<junitreport todir="${report}/@{testtype}">
|
||||
<fileset dir="${report}/@{testtype}">
|
||||
<junitreport todir="@{outputdir}">
|
||||
<fileset dir="@{outputdir}">
|
||||
<include name="*/*.xml"/>
|
||||
</fileset>
|
||||
<report format="noframes" todir="${report}/@{testtype}"/>
|
||||
<report format="noframes" todir="@{outputdir}"/>
|
||||
</junitreport>
|
||||
<fail message="test failed" if="test.failure" />
|
||||
</sequential>
|
||||
</macrodef>
|
||||
|
||||
<!-- FAILED-TEST -->
|
||||
<macrodef name="run-failed-test">
|
||||
<attribute name="xmlfailedtestfile" />
|
||||
<sequential>
|
||||
<!-- Get the pipeline run type. Default to dry. -->
|
||||
<condition property="pipeline.run" value="dry" else="${pipeline.run}">
|
||||
<equals arg1="${pipeline.run}" arg2="$${pipeline.run}" />
|
||||
</condition>
|
||||
|
||||
<condition property="cofoja.jvm.args" value="-javaagent:${cofoja.jar} -Dcom.google.java.contract.log.contract=false" else="">
|
||||
<isset property="include.contracts" />
|
||||
</condition>
|
||||
|
||||
<mkdir dir="${report}/failed_rerun" />
|
||||
<echo message="Sting: Running @{xmlfailedtestfile} test cases!"/>
|
||||
<taskdef resource="testngtasks" classpath="${lib.dir}/testng-5.14.1.jar"/>
|
||||
<testng outputDir="${report}/failed_rerun"
|
||||
haltOnFailure="false" failureProperty="test.failure"
|
||||
verbose="2"
|
||||
workingDir="${basedir}"
|
||||
useDefaultListeners="false"
|
||||
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.StingTextReporter">
|
||||
<jvmarg value="-Xmx${test.maxmemory}" />
|
||||
<jvmarg value="-Djava.awt.headless=true" />
|
||||
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
|
||||
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
|
||||
<jvmarg line="${cofoja.jvm.args}"/>
|
||||
<!-- <jvmarg value="-Xdebug"/> -->
|
||||
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/> -->
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.classes}" />
|
||||
<pathelement location="${scala.classes}" />
|
||||
<pathelement location="${java.contracts}" />
|
||||
<pathelement location="${java.test.classes}" />
|
||||
<pathelement location="${scala.test.classes}" />
|
||||
</classpath>
|
||||
|
||||
<xmlfileset dir="${basedir}" includes="@{xmlfailedtestfile}" />
|
||||
</testng>
|
||||
|
||||
<fail message="test failed" if="test.failure" />
|
||||
</sequential>
|
||||
</macrodef>
|
||||
|
||||
<!-- our three different test conditions: Test, IntegrationTest, PerformanceTest -->
|
||||
<target name="test" depends="test.compile" description="Run unit tests">
|
||||
<target name="alltests">
|
||||
<antcall target="test" inheritAll="false"/>
|
||||
<antcall target="integrationtest" inheritAll="false"/>
|
||||
<antcall target="pipelinetest" inheritAll="false"/>
|
||||
</target>
|
||||
|
||||
<target name="alltests.public">
|
||||
<antcall target="test.public" inheritAll="false"/>
|
||||
<antcall target="integrationtest.public" inheritAll="false"/>
|
||||
<antcall target="pipelinetest.public" inheritAll="false"/>
|
||||
</target>
|
||||
|
||||
<!-- Our four different test conditions: Test, IntegrationTest, PerformanceTest, PipelineTest -->
|
||||
<target name="test" depends="init.buildall,test.compile" description="Run unit tests">
|
||||
<condition property="ttype" value="*UnitTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${ttype}"/>
|
||||
<run-test testtype="${ttype}" outputdir="${report}/${ttype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="integrationtest" depends="test.compile" description="Run integration tests">
|
||||
<target name="test.public" depends="init.buildpublic,test"/>
|
||||
|
||||
<target name="integrationtest" depends="init.buildall,test.compile" description="Run integration tests">
|
||||
<condition property="itype" value="*IntegrationTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${itype}"/>
|
||||
<run-test testtype="${itype}" outputdir="${report}/${itype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="performancetest" depends="test.compile" description="Run performance tests">
|
||||
<target name="integrationtest.public" depends="init.buildpublic,integrationtest"/>
|
||||
|
||||
<target name="performancetest" depends="init.buildall,test.compile" description="Run performance tests">
|
||||
<condition property="ptype" value="*PerformanceTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${ptype}"/>
|
||||
<run-test testtype="${ptype}" outputdir="${report}/${ptype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="pipelinetest" depends="test.compile" description="Run pipeline tests">
|
||||
<target name="performancetest.public" depends="init.buildpublic,performancetest" />
|
||||
|
||||
<target name="pipelinetest" depends="init.buildall,test.compile" description="Run pipeline tests">
|
||||
<condition property="pipetype" value="*PipelineTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${pipetype}"/>
|
||||
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="pipelinetestrun" depends="test.compile" description="Run pipeline tests">
|
||||
<target name="pipelinetest.public" depends="init.buildpublic,pipelinetest" />
|
||||
|
||||
<target name="pipelinetestrun" depends="init.buildall,test.compile" description="Run pipeline tests">
|
||||
<property name="pipeline.run" value="run"/>
|
||||
<condition property="pipetype" value="*PipelineTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${pipetype}"/>
|
||||
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
|
||||
</target>
|
||||
<target name="pipelinetestrun.public" depends="init.buildpublic,pipelinetestrun" />
|
||||
|
||||
<target name="failed-test" depends="init.buildall,test.compile">
|
||||
<run-test testtype="${report}/*UnitTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
|
||||
</target>
|
||||
|
||||
<target name="failed-test" depends="test.compile">
|
||||
<run-failed-test xmlfailedtestfile="${report}/*UnitTest/testng-failed.xml" />
|
||||
<target name="failed-integration" depends="init.buildall,test.compile">
|
||||
<run-test testtype="${report}/*IntegrationTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
|
||||
</target>
|
||||
|
||||
<target name="failed-integration" depends="test.compile">
|
||||
<run-failed-test xmlfailedtestfile="${report}/*IntegrationTest/testng-failed.xml" />
|
||||
<target name="failed-performance" depends="init.buildall,test.compile">
|
||||
<run-test testtype="${report}/*PerformanceTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
|
||||
</target>
|
||||
|
||||
<target name="failed-performance" depends="test.compile">
|
||||
<run-failed-test xmlfailedtestfile="${report}/*PerformanceTest/testng-failed.xml" />
|
||||
</target>
|
||||
|
||||
<target name="failed-pipeline" depends="test.compile">
|
||||
<run-failed-test xmlfailedtestfile="${report}/*PipelineTest/testng-failed.xml" />
|
||||
<target name="failed-pipeline" depends="init.buildall,test.compile">
|
||||
<run-test testtype="${report}/*PipelineTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
|
||||
</target>
|
||||
|
||||
<!-- ******************************************************************************** -->
|
||||
|
|
|
|||
6
ivy.xml
6
ivy.xml
|
|
@ -15,10 +15,8 @@
|
|||
<!-- Tribble -->
|
||||
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
|
||||
|
||||
<dependency org="log4j" name="log4j" rev="1.2.15">
|
||||
<!-- Don't include javax.mail here in default, only used in scala->default by commons-email -->
|
||||
<exclude org="javax.mail" />
|
||||
</dependency>
|
||||
<dependency org="log4j" name="log4j" rev="1.2.15"/>
|
||||
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
|
||||
<dependency org="colt" name="colt" rev="1.2.0"/>
|
||||
<dependency org="jboss" name="javassist" rev="3.7.ga"/>
|
||||
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>
|
||||
|
|
|
|||
|
|
@ -12,14 +12,14 @@ if ( onCMDLine ) {
|
|||
inputFileName = args[1]
|
||||
outputPDF = args[2]
|
||||
} else {
|
||||
#inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
|
||||
inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
|
||||
inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt"
|
||||
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
|
||||
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
|
||||
outputPDF = NA
|
||||
}
|
||||
|
||||
RUNTIME_UNITS = "(sec)"
|
||||
ORIGINAL_UNITS_TO_SECONDS = 1/1000
|
||||
RUNTIME_UNITS = "(hours)"
|
||||
ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60
|
||||
|
||||
#
|
||||
# Helper function to aggregate all of the jobs in the report across all tables
|
||||
|
|
@ -33,7 +33,7 @@ allJobsFromReport <- function(report) {
|
|||
#
|
||||
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
|
||||
#
|
||||
plotJobsGantt <- function(gatkReport, sortOverall) {
|
||||
plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
|
||||
allJobs = allJobsFromReport(gatkReport)
|
||||
if ( sortOverall ) {
|
||||
title = "All jobs, by analysis, by start time"
|
||||
|
|
@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) {
|
|||
}
|
||||
allJobs$index = 1:nrow(allJobs)
|
||||
minTime = min(allJobs$startTime)
|
||||
allJobs$relStartTime = allJobs$startTime - minTime
|
||||
allJobs$relDoneTime = allJobs$doneTime - minTime
|
||||
allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
|
||||
allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
|
||||
allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
|
||||
maxRelTime = max(allJobs$relDoneTime)
|
||||
p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
|
||||
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
|
||||
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
|
||||
p <- p + theme_bw()
|
||||
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
|
||||
if ( includeText )
|
||||
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
|
||||
p <- p + xlim(0, maxRelTime * 1.1)
|
||||
p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
|
||||
p <- p + ylab("Job")
|
||||
p <- p + ylab("Job number")
|
||||
p <- p + opts(title=title)
|
||||
print(p)
|
||||
}
|
||||
|
|
@ -140,6 +142,8 @@ print(paste("Project :", inputFileName))
|
|||
convertUnits <- function(gatkReportData) {
|
||||
convertGroup <- function(g) {
|
||||
g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS
|
||||
g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS
|
||||
g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS
|
||||
g
|
||||
}
|
||||
lapply(gatkReportData, convertGroup)
|
||||
|
|
@ -155,8 +159,8 @@ if ( ! is.na(outputPDF) ) {
|
|||
pdf(outputPDF, height=8.5, width=11)
|
||||
}
|
||||
|
||||
plotJobsGantt(gatkReportData, T)
|
||||
plotJobsGantt(gatkReportData, F)
|
||||
plotJobsGantt(gatkReportData, T, F)
|
||||
plotJobsGantt(gatkReportData, F, F)
|
||||
plotProgressByTime(gatkReportData)
|
||||
for ( group in gatkReportData ) {
|
||||
plotGroup(group)
|
||||
|
|
|
|||
|
|
@ -379,7 +379,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
}
|
||||
|
||||
if ( tribbleType == null )
|
||||
if ( ! file.canRead() | !! file.isFile() ) {
|
||||
if ( ! file.canRead() | ! file.isFile() ) {
|
||||
throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
|
||||
} else {
|
||||
throw new UserException.CommandLineException(
|
||||
|
|
|
|||
|
|
@ -929,6 +929,14 @@ public class GenomeAnalysisEngine {
|
|||
return readsDataSource.getHeader(reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the master sequence dictionary for this GATK engine instance
|
||||
* @return a never-null dictionary listing all of the contigs known to this engine instance
|
||||
*/
|
||||
public SAMSequenceDictionary getMasterSequenceDictionary() {
|
||||
return getReferenceDataSource().getReference().getSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns data source object encapsulating all essential info and handlers used to traverse
|
||||
* reads; header merger, individual file readers etc can be accessed through the returned data source object.
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
|
|
@ -84,7 +85,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
if(currentLocus == GenomeLoc.UNMAPPED) {
|
||||
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan());
|
||||
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
|
||||
currentLocus = null;
|
||||
continue;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -215,6 +215,45 @@ public class GATKBAMIndex {
|
|||
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Use to get close to the unmapped reads at the end of a BAM file.
|
||||
* @return The file offset of the first record in the last linear bin, or -1
|
||||
* if there are no elements in linear bins (i.e. no mapped reads).
|
||||
*/
|
||||
public long getStartOfLastLinearBin() {
|
||||
openIndexFile();
|
||||
|
||||
seek(4);
|
||||
|
||||
final int sequenceCount = readInteger();
|
||||
// Because no reads may align to the last sequence in the sequence dictionary,
|
||||
// grab the last element of the linear index for each sequence, and return
|
||||
// the last one from the last sequence that has one.
|
||||
long lastLinearIndexPointer = -1;
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
// System.out.println("# Sequence TID: " + i);
|
||||
final int nBins = readInteger();
|
||||
// System.out.println("# nBins: " + nBins);
|
||||
for (int j1 = 0; j1 < nBins; j1++) {
|
||||
// Skip bin #
|
||||
skipBytes(4);
|
||||
final int nChunks = readInteger();
|
||||
// Skip chunks
|
||||
skipBytes(16 * nChunks);
|
||||
}
|
||||
final int nLinearBins = readInteger();
|
||||
if (nLinearBins > 0) {
|
||||
// Skip to last element of list of linear bins
|
||||
skipBytes(8 * (nLinearBins - 1));
|
||||
lastLinearIndexPointer = readLongs(1)[0];
|
||||
}
|
||||
}
|
||||
|
||||
closeIndexFile();
|
||||
|
||||
return lastLinearIndexPointer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the possible number of bins for a given reference sequence.
|
||||
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.
|
||||
|
|
|
|||
|
|
@ -134,24 +134,11 @@ public class ReadShardStrategy implements ShardStrategy {
|
|||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
// If the region contains location information (in other words, it is not at
|
||||
// the start of the unmapped region), add the region.
|
||||
if(currentFilePointer.isRegionUnmapped) {
|
||||
// If the region is unmapped and no location data exists, add a null as an indicator to
|
||||
// start at the next unmapped region.
|
||||
if(!isIntoUnmappedRegion) {
|
||||
selectedReaders.put(id,null);
|
||||
isIntoUnmappedRegion = true;
|
||||
}
|
||||
else
|
||||
selectedReaders.put(id,position.get(id));
|
||||
}
|
||||
else {
|
||||
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
|
||||
if(!fileSpan.isEmpty())
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
|
||||
if(!fileSpan.isEmpty())
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
|
||||
if(selectedReaders.size() > 0) {
|
||||
|
|
|
|||
|
|
@ -97,7 +97,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
if (!( walker instanceof TreeReducible ))
|
||||
throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");
|
||||
|
||||
traversalEngine.startTimers();
|
||||
ReduceTree reduceTree = new ReduceTree(this);
|
||||
initializeWalker(walker);
|
||||
|
||||
|
|
|
|||
|
|
@ -44,7 +44,6 @@ public class LinearMicroScheduler extends MicroScheduler {
|
|||
* @param shardStrategy A strategy for sharding the data.
|
||||
*/
|
||||
public Object execute(Walker walker, ShardStrategy shardStrategy) {
|
||||
traversalEngine.startTimers();
|
||||
walker.initialize();
|
||||
Accumulator accumulator = Accumulator.create(engine,walker);
|
||||
|
||||
|
|
@ -54,6 +53,7 @@ public class LinearMicroScheduler extends MicroScheduler {
|
|||
if ( done || shard == null ) // we ran out of shards that aren't owned
|
||||
break;
|
||||
|
||||
traversalEngine.startTimersIfNecessary();
|
||||
if(shard.getShardType() == Shard.ShardType.LOCUS) {
|
||||
LocusWalker lWalker = (LocusWalker)walker;
|
||||
WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata());
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ public class ShardTraverser implements Callable {
|
|||
|
||||
public Object call() {
|
||||
try {
|
||||
traversalEngine.startTimersIfNecessary();
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
Object accumulator = walker.reduceInit();
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class PlatformFilter extends ReadFilter {
|
||||
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false)
|
||||
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false)
|
||||
protected String[] PLFilterNames;
|
||||
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
|
|||
else if ( stub.getOutputStream() != null ) {
|
||||
this.file = null;
|
||||
this.stream = stub.getOutputStream();
|
||||
writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes());
|
||||
writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes());
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
|
||||
|
|
@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
|
|||
}
|
||||
|
||||
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
|
||||
return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
|
||||
return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.io.stubs;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.gatk.CommandLineExecutable;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
return isCompressed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the master sequence dictionary from the engine associated with this stub
|
||||
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
|
||||
* @return
|
||||
*/
|
||||
public SAMSequenceDictionary getMasterSequenceDictionary() {
|
||||
return engine.getMasterSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we tell the VCF writer not to write genotypes?
|
||||
* @return true if the writer should not write genotypes.
|
||||
|
|
|
|||
|
|
@ -293,15 +293,16 @@ public class GATKRunReport {
|
|||
* That is, postReport() is guarenteed not to fail for any reason.
|
||||
*/
|
||||
private File postReportToLocalDisk(File rootDir) {
|
||||
String filename = getID() + ".report.xml.gz";
|
||||
File file = new File(rootDir, filename);
|
||||
try {
|
||||
String filename = getID() + ".report.xml.gz";
|
||||
File file = new File(rootDir, filename);
|
||||
postReportToFile(file);
|
||||
logger.debug("Wrote report to " + file);
|
||||
return file;
|
||||
} catch ( Exception e ) {
|
||||
// we catch everything, and no matter what eat the error
|
||||
exceptDuringRunReport("Couldn't read report file", e);
|
||||
file.delete();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -312,6 +313,7 @@ public class GATKRunReport {
|
|||
File localFile = postReportToLocalDisk(new File("./"));
|
||||
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
|
||||
if ( localFile != null ) { // we succeeded in creating the local file
|
||||
localFile.deleteOnExit();
|
||||
try {
|
||||
// stop us from printing the annoying, and meaningless, mime types warning
|
||||
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
|
||||
|
|
@ -336,14 +338,13 @@ public class GATKRunReport {
|
|||
//logger.info("Uploading " + localFile + " to AWS bucket");
|
||||
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
|
||||
logger.debug("Uploaded to AWS: " + s3Object);
|
||||
logger.info("Uploaded run statistics report to AWS S3");
|
||||
} catch ( S3ServiceException e ) {
|
||||
exceptDuringRunReport("S3 exception occurred", e);
|
||||
} catch ( NoSuchAlgorithmException e ) {
|
||||
exceptDuringRunReport("Couldn't calculate MD5", e);
|
||||
} catch ( IOException e ) {
|
||||
exceptDuringRunReport("Couldn't read report file", e);
|
||||
} finally {
|
||||
localFile.delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ public class RMDIndexer extends CommandLineProgram {
|
|||
Index index = IndexFactory.createIndex(inputFileSource, codec, approach);
|
||||
|
||||
// add writing of the sequence dictionary, if supplied
|
||||
builder.setIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary(), indexFile, false);
|
||||
builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary());
|
||||
|
||||
// create the output stream, and write the index
|
||||
LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
|
||||
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Utilities for working with Sequence Dictionaries embedded in tribble indices
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class IndexDictionaryUtils {
|
||||
private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class);
|
||||
|
||||
// a constant we use for marking sequence dictionary entries in the Tribble index property list
|
||||
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
|
||||
|
||||
/**
|
||||
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
|
||||
* @param index the index file to use
|
||||
* @return a SAMSequenceDictionary if available, null if unavailable
|
||||
*/
|
||||
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
|
||||
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
|
||||
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
|
||||
Integer.valueOf(entry.getValue())));
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the sequence dictionary with the contig list; a backup approach
|
||||
* @param index the index file to use
|
||||
* @param dict the sequence dictionary to add contigs to
|
||||
* @return the filled-in sequence dictionary
|
||||
*/
|
||||
static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
|
||||
LinkedHashSet<String> seqNames = index.getSequenceNames();
|
||||
if (seqNames == null) {
|
||||
return dict;
|
||||
}
|
||||
for (String name : seqNames) {
|
||||
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
|
||||
dict.addSequence(seq);
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) {
|
||||
for ( SAMSequenceRecord seq : dict.getSequences() ) {
|
||||
final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName();
|
||||
final String length = String.valueOf(seq.getSequenceLength());
|
||||
index.addProperty(contig,length);
|
||||
}
|
||||
}
|
||||
|
||||
public static void validateTrackSequenceDictionary(final String trackName,
|
||||
final SAMSequenceDictionary trackDict,
|
||||
final SAMSequenceDictionary referenceDict,
|
||||
final ValidationExclusion.TYPE validationExclusionType ) {
|
||||
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||
if (trackDict == null || trackDict.size() == 0)
|
||||
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||
else {
|
||||
Set<String> trackSequences = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -25,7 +25,6 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureSource;
|
||||
|
|
@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof;
|
|||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* @author aaron
|
||||
* `
|
||||
* Class RMDTrackBuilder
|
||||
*
|
||||
|
|
@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
|
||||
public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false;
|
||||
|
||||
// a constant we use for marking sequence dictionary entries in the Tribble index property list
|
||||
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
|
||||
|
||||
// private sequence dictionary we use to set our tracks with
|
||||
private SAMSequenceDictionary dict = null;
|
||||
|
||||
|
|
@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); }
|
||||
catch (ReviewedStingException e) { }
|
||||
|
||||
sequenceDictionary = getSequenceDictionaryFromProperties(index);
|
||||
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
|
||||
|
||||
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
|
||||
if (sequenceDictionary.size() == 0 && dict != null) {
|
||||
File indexFile = Tribble.indexFile(inputFile);
|
||||
setIndexSequenceDictionary(inputFile,index,dict,indexFile,true);
|
||||
sequenceDictionary = getSequenceDictionaryFromProperties(index);
|
||||
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
|
||||
try { // re-write the index
|
||||
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
|
||||
} catch (IOException e) {
|
||||
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
|
||||
}
|
||||
|
||||
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
|
||||
}
|
||||
|
||||
if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE )
|
||||
|
|
@ -363,88 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
// this can take a while, let them know what we're doing
|
||||
logger.info("Creating Tribble index in memory for file " + inputFile);
|
||||
Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
|
||||
setIndexSequenceDictionary(inputFile, idx, dict, null, false);
|
||||
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict);
|
||||
return idx;
|
||||
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
// static functions to work with the sequence dictionaries of indexes
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
|
||||
* @param index the index file to use
|
||||
* @return a SAMSequenceDictionary if available, null if unavailable
|
||||
*/
|
||||
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
|
||||
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
|
||||
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
|
||||
Integer.valueOf(entry.getValue())));
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the sequence dictionary with the contig list; a backup approach
|
||||
* @param index the index file to use
|
||||
* @param dict the sequence dictionary to add contigs to
|
||||
* @return the filled-in sequence dictionary
|
||||
*/
|
||||
private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
|
||||
LinkedHashSet<String> seqNames = index.getSequenceNames();
|
||||
if (seqNames == null) {
|
||||
return dict;
|
||||
}
|
||||
for (String name : seqNames) {
|
||||
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
|
||||
dict.addSequence(seq);
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible.
|
||||
* (that each contig in the index is in the sequence dictionary).
|
||||
* @param inputFile for proper error message formatting.
|
||||
* @param dict the sequence dictionary
|
||||
* @param index the index file
|
||||
* @param indexFile the index file
|
||||
* @param rewriteIndex should we rewrite the index when we're done?
|
||||
*
|
||||
*/
|
||||
public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) {
|
||||
if (dict == null) return;
|
||||
|
||||
SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
|
||||
validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict);
|
||||
public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) {
|
||||
if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null");
|
||||
|
||||
// check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set
|
||||
for (SAMSequenceRecord seq : currentDict.getSequences()) {
|
||||
if (dict.getSequence(seq.getSequenceName()) == null)
|
||||
continue;
|
||||
index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength()));
|
||||
}
|
||||
// re-write the index
|
||||
if (rewriteIndex) try {
|
||||
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
|
||||
} catch (IOException e) {
|
||||
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
|
||||
}
|
||||
final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
|
||||
validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict);
|
||||
|
||||
// actually update the dictionary in the index
|
||||
IndexDictionaryUtils.setIndexSequenceDictionary(index, dict);
|
||||
}
|
||||
|
||||
|
||||
public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) {
|
||||
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||
if (trackDict == null || trackDict.size() == 0)
|
||||
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||
else {
|
||||
Set<String> trackSequences = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
|
||||
}
|
||||
public void validateTrackSequenceDictionary(final String trackName,
|
||||
final SAMSequenceDictionary trackDict,
|
||||
final SAMSequenceDictionary referenceDict ) {
|
||||
IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -115,12 +115,13 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
LinkedList<ProcessingHistory> history = new LinkedList<ProcessingHistory>();
|
||||
|
||||
/** We use the SimpleTimer to time our run */
|
||||
private SimpleTimer timer = new SimpleTimer("Traversal");
|
||||
private SimpleTimer timer = null;
|
||||
|
||||
// How long can we go without printing some progress info?
|
||||
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
|
||||
private int printProgressCheckCounter = 0;
|
||||
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
|
||||
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds
|
||||
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
|
||||
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
|
||||
private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;
|
||||
|
|
@ -209,11 +210,16 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
}
|
||||
}
|
||||
/**
|
||||
* Should be called to indicate that we're going to process records and the timer should start ticking
|
||||
* Should be called to indicate that we're going to process records and the timer should start ticking. This
|
||||
* function should be called right before any traversal work is done, to avoid counting setup costs in the
|
||||
* processing costs and inflating the estimated runtime.
|
||||
*/
|
||||
public void startTimers() {
|
||||
timer.start();
|
||||
lastProgressPrintTime = timer.currentTime();
|
||||
public void startTimersIfNecessary() {
|
||||
if ( timer == null ) {
|
||||
timer = new SimpleTimer("Traversal");
|
||||
timer.start();
|
||||
lastProgressPrintTime = timer.currentTime();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -224,7 +230,8 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
* @return true if the maximum interval (in millisecs) has passed since the last printing
|
||||
*/
|
||||
private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) {
|
||||
return (curTime - lastPrintTime) > printFreq;
|
||||
long elapsed = curTime - lastPrintTime;
|
||||
return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -351,7 +358,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
public void printOnTraversalDone() {
|
||||
printProgress(null, null, true);
|
||||
|
||||
final double elapsed = timer.getElapsedTime();
|
||||
final double elapsed = timer == null ? 0 : timer.getElapsedTime();
|
||||
|
||||
ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics();
|
||||
|
||||
|
|
|
|||
|
|
@ -26,21 +26,23 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Prints out all of the RODs in the input data set. Data is rendered using the toString() method
|
||||
* of the given ROD.
|
||||
*/
|
||||
public class PrintRODsWalker extends RodWalker<Integer, Integer> {
|
||||
@Input(fullName="input", shortName = "input", doc="The input ROD which should be printed out.", required=true)
|
||||
public RodBinding<Feature> input;
|
||||
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
|
|
@ -62,7 +64,7 @@ public class PrintRODsWalker extends RodWalker<Integer, Integer> {
|
|||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
for ( Feature feature : tracker.getValues(Feature.class) ) {
|
||||
for ( Feature feature : tracker.getValues(Feature.class, context.getLocation()) ) {
|
||||
out.println(feature.toString());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
|||
* -I input1.bam \
|
||||
* -I input2.bam \
|
||||
* --read_filter MappingQualityZero
|
||||
*
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T PrintReads \
|
||||
* -o output.bam \
|
||||
* -I input.bam \
|
||||
* -n 2000
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -77,6 +78,15 @@ public abstract class Walker<MapType, ReduceType> {
|
|||
return toolkit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the master sequence dictionary for this walker
|
||||
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
|
||||
* @return
|
||||
*/
|
||||
protected SAMSequenceDictionary getMasterSequenceDictionary() {
|
||||
return getToolkit().getMasterSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* (conceptual static) method that states whether you want to see reads piling up at a locus
|
||||
* that contain a deletion at the locus.
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ public class SBByDepth extends AnnotationByDepth {
|
|||
if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY))
|
||||
return null;
|
||||
|
||||
double sBias = Double.valueOf(vc.getAttributeAsString(VCFConstants.STRAND_BIAS_KEY));
|
||||
double sBias = vc.getAttributeAsDouble(VCFConstants.STRAND_BIAS_KEY, -1);
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() == 0 )
|
||||
|
|
|
|||
|
|
@ -24,7 +24,9 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -32,10 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -46,134 +45,429 @@ import java.util.*;
|
|||
* (http://snpeff.sourceforge.net/).
|
||||
*
|
||||
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
|
||||
* output file (which must be provided on the command line via --snpEffFile:SnpEff <filename>),
|
||||
* output file (which must be provided on the command line via --snpEffFile filename.vcf),
|
||||
* and adds annotations on that effect.
|
||||
*
|
||||
* The possible biological effects and their associated impacts are defined in the class:
|
||||
* org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
// SnpEff annotation key names:
|
||||
public static final String GENE_ID_KEY = "GENE_ID";
|
||||
public static final String GENE_NAME_KEY = "GENE_NAME";
|
||||
public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID";
|
||||
public static final String EXON_ID_KEY = "EXON_ID";
|
||||
public static final String EXON_RANK_KEY = "EXON_RANK";
|
||||
public static final String WITHIN_NON_CODING_GENE_KEY = "WITHIN_NON_CODING_GENE";
|
||||
public static final String EFFECT_KEY = "EFFECT";
|
||||
public static final String EFFECT_IMPACT_KEY = "EFFECT_IMPACT";
|
||||
public static final String EFFECT_EXTRA_INFORMATION_KEY = "EFFECT_EXTRA_INFORMATION";
|
||||
public static final String OLD_NEW_AA_KEY = "OLD_NEW_AA";
|
||||
public static final String OLD_NEW_CODON_KEY = "OLD_NEW_CODON";
|
||||
public static final String CODON_NUM_KEY = "CODON_NUM";
|
||||
public static final String CDS_SIZE_KEY = "CDS_SIZE";
|
||||
private static Logger logger = Logger.getLogger(SnpEff.class);
|
||||
|
||||
// We refuse to parse SnpEff output files generated by unsupported versions, or
|
||||
// lacking a SnpEff version number in the VCF header:
|
||||
public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
|
||||
public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
|
||||
|
||||
// SnpEff aggregates all effects (and effect metadata) together into a single INFO
|
||||
// field annotation with the key EFF:
|
||||
public static final String SNPEFF_INFO_FIELD_KEY = "EFF";
|
||||
public static final String SNPEFF_EFFECT_METADATA_DELIMITER = "[()]";
|
||||
public static final String SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER = "\\|";
|
||||
|
||||
// Key names for the INFO field annotations we will add to each record, along
|
||||
// with parsing-related information:
|
||||
public enum InfoFieldKey {
|
||||
EFFECT_KEY ("SNPEFF_EFFECT", -1),
|
||||
IMPACT_KEY ("SNPEFF_IMPACT", 0),
|
||||
CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
|
||||
AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
|
||||
GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
|
||||
GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
|
||||
TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
|
||||
EXON_ID_KEY ("SNPEFF_EXON_ID", 7);
|
||||
|
||||
// Actual text of the key
|
||||
private final String keyName;
|
||||
|
||||
// Index within the effect metadata subfields from the SnpEff EFF annotation
|
||||
// where each key's associated value can be found during parsing.
|
||||
private final int fieldIndex;
|
||||
|
||||
InfoFieldKey ( String keyName, int fieldIndex ) {
|
||||
this.keyName = keyName;
|
||||
this.fieldIndex = fieldIndex;
|
||||
}
|
||||
|
||||
public String getKeyName() {
|
||||
return keyName;
|
||||
}
|
||||
|
||||
public int getFieldIndex() {
|
||||
return fieldIndex;
|
||||
}
|
||||
}
|
||||
|
||||
// Possible SnpEff biological effects. All effect names found in the SnpEff input file
|
||||
// are validated against this list.
|
||||
public enum EffectType {
|
||||
NONE,
|
||||
CHROMOSOME,
|
||||
INTERGENIC,
|
||||
UPSTREAM,
|
||||
UTR_5_PRIME,
|
||||
UTR_5_DELETED,
|
||||
START_GAINED,
|
||||
SPLICE_SITE_ACCEPTOR,
|
||||
SPLICE_SITE_DONOR,
|
||||
START_LOST,
|
||||
SYNONYMOUS_START,
|
||||
NON_SYNONYMOUS_START,
|
||||
CDS,
|
||||
GENE,
|
||||
TRANSCRIPT,
|
||||
EXON,
|
||||
EXON_DELETED,
|
||||
NON_SYNONYMOUS_CODING,
|
||||
SYNONYMOUS_CODING,
|
||||
FRAME_SHIFT,
|
||||
CODON_CHANGE,
|
||||
CODON_INSERTION,
|
||||
CODON_CHANGE_PLUS_CODON_INSERTION,
|
||||
CODON_DELETION,
|
||||
CODON_CHANGE_PLUS_CODON_DELETION,
|
||||
STOP_GAINED,
|
||||
SYNONYMOUS_STOP,
|
||||
NON_SYNONYMOUS_STOP,
|
||||
STOP_LOST,
|
||||
INTRON,
|
||||
UTR_3_PRIME,
|
||||
UTR_3_DELETED,
|
||||
DOWNSTREAM,
|
||||
INTRON_CONSERVED,
|
||||
INTERGENIC_CONSERVED,
|
||||
REGULATION,
|
||||
CUSTOM,
|
||||
WITHIN_NON_CODING_GENE
|
||||
}
|
||||
|
||||
// SnpEff labels each effect as either LOW, MODERATE, or HIGH impact.
|
||||
public enum EffectImpact {
|
||||
LOW (1),
|
||||
MODERATE (2),
|
||||
HIGH (3);
|
||||
|
||||
private final int severityRating;
|
||||
|
||||
EffectImpact ( int severityRating ) {
|
||||
this.severityRating = severityRating;
|
||||
}
|
||||
|
||||
public boolean isHigherImpactThan ( EffectImpact other ) {
|
||||
return this.severityRating > other.severityRating;
|
||||
}
|
||||
}
|
||||
|
||||
// SnpEff labels most effects as either CODING or NON_CODING, but sometimes omits this information.
|
||||
public enum EffectCoding {
|
||||
CODING,
|
||||
NON_CODING,
|
||||
UNKNOWN
|
||||
}
|
||||
|
||||
|
||||
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) {
|
||||
validateRodBinding(walker.getSnpEffRodBinding());
|
||||
checkSnpEffVersion(walker, toolkit);
|
||||
}
|
||||
|
||||
public Map<String, Object> annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) {
|
||||
RodBinding<SnpEffFeature> snpEffRodBinding = walker.getSnpEffRodBinding();
|
||||
validateRodBinding(snpEffRodBinding);
|
||||
RodBinding<VariantContext> snpEffRodBinding = walker.getSnpEffRodBinding();
|
||||
|
||||
List<SnpEffFeature> features = tracker.getValues(snpEffRodBinding, ref.getLocus());
|
||||
// Get only SnpEff records that start at this locus, not merely span it:
|
||||
List<VariantContext> snpEffRecords = tracker.getValues(snpEffRodBinding, ref.getLocus());
|
||||
|
||||
// Add only annotations for one of the most biologically-significant effects as defined in
|
||||
// the SnpEffConstants class:
|
||||
SnpEffFeature mostSignificantEffect = getMostSignificantEffect(features);
|
||||
|
||||
if ( mostSignificantEffect == null ) {
|
||||
// Within this set, look for a SnpEff record whose ref/alt alleles match the record to annotate.
|
||||
// If there is more than one such record, we only need to pick the first one, since the biological
|
||||
// effects will be the same across all such records:
|
||||
VariantContext matchingRecord = getMatchingSnpEffRecord(snpEffRecords, vc);
|
||||
if ( matchingRecord == null ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return generateAnnotations(mostSignificantEffect);
|
||||
// Parse the SnpEff INFO field annotation from the matching record into individual effect objects:
|
||||
List<SnpEffEffect> effects = parseSnpEffRecord(matchingRecord);
|
||||
if ( effects.size() == 0 ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Add only annotations for one of the most biologically-significant effects from this set:
|
||||
SnpEffEffect mostSignificantEffect = getMostSignificantEffect(effects);
|
||||
return mostSignificantEffect.getAnnotations();
|
||||
}
|
||||
|
||||
private void validateRodBinding ( RodBinding<SnpEffFeature> snpEffRodBinding ) {
|
||||
private void validateRodBinding ( RodBinding<VariantContext> snpEffRodBinding ) {
|
||||
if ( snpEffRodBinding == null || ! snpEffRodBinding.isBound() ) {
|
||||
throw new UserException("The SnpEff annotator requires that a SnpEff output file be provided " +
|
||||
"as a rodbinding on the command line, but no SnpEff rodbinding was found.");
|
||||
throw new UserException("The SnpEff annotator requires that a SnpEff VCF output file be provided " +
|
||||
"as a rodbinding on the command line via the --snpEffFile option, but " +
|
||||
"no SnpEff rodbinding was found.");
|
||||
}
|
||||
}
|
||||
|
||||
private SnpEffFeature getMostSignificantEffect ( List<SnpEffFeature> snpEffFeatures ) {
|
||||
SnpEffFeature mostSignificantEffect = null;
|
||||
private void checkSnpEffVersion ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) {
|
||||
RodBinding<VariantContext> snpEffRodBinding = walker.getSnpEffRodBinding();
|
||||
|
||||
for ( SnpEffFeature snpEffFeature : snpEffFeatures ) {
|
||||
VCFHeader snpEffVCFHeader = VCFUtils.getVCFHeadersFromRods(toolkit, Arrays.asList(snpEffRodBinding.getName())).get(snpEffRodBinding.getName());
|
||||
VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY);
|
||||
|
||||
if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) {
|
||||
throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " +
|
||||
"input file, and so could not verify that the file was generated by a supported version of SnpEff (" +
|
||||
Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")");
|
||||
}
|
||||
|
||||
String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0];
|
||||
|
||||
if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) {
|
||||
throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " +
|
||||
"is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS));
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isSupportedSnpEffVersion ( String versionString ) {
|
||||
for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) {
|
||||
if ( supportedVersion.equals(versionString) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private VariantContext getMatchingSnpEffRecord ( List<VariantContext> snpEffRecords, VariantContext vc ) {
|
||||
for ( VariantContext snpEffRecord : snpEffRecords ) {
|
||||
if ( snpEffRecord.hasSameAlternateAllelesAs(vc) && snpEffRecord.getReference().equals(vc.getReference()) ) {
|
||||
return snpEffRecord;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<SnpEffEffect> parseSnpEffRecord ( VariantContext snpEffRecord ) {
|
||||
List<SnpEffEffect> parsedEffects = new ArrayList<SnpEffEffect>();
|
||||
|
||||
Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY);
|
||||
List<String> individualEffects;
|
||||
|
||||
// The VCF codec stores multi-valued fields as a List<String>, and single-valued fields as a String.
|
||||
// We can have either in the case of SnpEff, since there may be one or more than one effect in this record.
|
||||
if ( effectFieldValue instanceof List ) {
|
||||
individualEffects = (List<String>)effectFieldValue;
|
||||
}
|
||||
else {
|
||||
individualEffects = Arrays.asList((String)effectFieldValue);
|
||||
}
|
||||
|
||||
for ( String effectString : individualEffects ) {
|
||||
String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER);
|
||||
|
||||
if ( effectNameAndMetadata.length != 2 ) {
|
||||
logger.warn(String.format("Malformed SnpEff effect field at %s:%d, skipping: %s",
|
||||
snpEffRecord.getChr(), snpEffRecord.getStart(), effectString));
|
||||
continue;
|
||||
}
|
||||
|
||||
String effectName = effectNameAndMetadata[0];
|
||||
String[] effectMetadata = effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1);
|
||||
|
||||
SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata);
|
||||
|
||||
if ( parsedEffect.isWellFormed() ) {
|
||||
parsedEffects.add(parsedEffect);
|
||||
}
|
||||
else {
|
||||
logger.warn(String.format("Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"",
|
||||
snpEffRecord.getChr(), snpEffRecord.getStart(), parsedEffect.getParseError(), effectString));
|
||||
}
|
||||
}
|
||||
|
||||
return parsedEffects;
|
||||
}
|
||||
|
||||
private SnpEffEffect getMostSignificantEffect ( List<SnpEffEffect> effects ) {
|
||||
SnpEffEffect mostSignificantEffect = null;
|
||||
|
||||
for ( SnpEffEffect effect : effects ) {
|
||||
if ( mostSignificantEffect == null ||
|
||||
snpEffFeature.isHigherImpactThan(mostSignificantEffect) ) {
|
||||
effect.isHigherImpactThan(mostSignificantEffect) ) {
|
||||
|
||||
mostSignificantEffect = snpEffFeature;
|
||||
mostSignificantEffect = effect;
|
||||
}
|
||||
}
|
||||
|
||||
return mostSignificantEffect;
|
||||
}
|
||||
|
||||
private Map<String, Object> generateAnnotations ( SnpEffFeature mostSignificantEffect ) {
|
||||
Map<String, Object> annotations = new LinkedHashMap<String, Object>(Utils.optimumHashSize(getKeyNames().size()));
|
||||
|
||||
if ( mostSignificantEffect.hasGeneID() )
|
||||
annotations.put(GENE_ID_KEY, mostSignificantEffect.getGeneID());
|
||||
if ( mostSignificantEffect.hasGeneName() )
|
||||
annotations.put(GENE_NAME_KEY, mostSignificantEffect.getGeneName());
|
||||
if ( mostSignificantEffect.hasTranscriptID() )
|
||||
annotations.put(TRANSCRIPT_ID_KEY, mostSignificantEffect.getTranscriptID());
|
||||
if ( mostSignificantEffect.hasExonID() )
|
||||
annotations.put(EXON_ID_KEY, mostSignificantEffect.getExonID());
|
||||
if ( mostSignificantEffect.hasExonRank() )
|
||||
annotations.put(EXON_RANK_KEY, Integer.toString(mostSignificantEffect.getExonRank()));
|
||||
if ( mostSignificantEffect.isNonCodingGene() )
|
||||
annotations.put(WITHIN_NON_CODING_GENE_KEY, null);
|
||||
|
||||
annotations.put(EFFECT_KEY, mostSignificantEffect.getEffect().toString());
|
||||
annotations.put(EFFECT_IMPACT_KEY, mostSignificantEffect.getEffectImpact().toString());
|
||||
if ( mostSignificantEffect.hasEffectExtraInformation() )
|
||||
annotations.put(EFFECT_EXTRA_INFORMATION_KEY, mostSignificantEffect.getEffectExtraInformation());
|
||||
|
||||
if ( mostSignificantEffect.hasOldAndNewAA() )
|
||||
annotations.put(OLD_NEW_AA_KEY, mostSignificantEffect.getOldAndNewAA());
|
||||
if ( mostSignificantEffect.hasOldAndNewCodon() )
|
||||
annotations.put(OLD_NEW_CODON_KEY, mostSignificantEffect.getOldAndNewCodon());
|
||||
if ( mostSignificantEffect.hasCodonNum() )
|
||||
annotations.put(CODON_NUM_KEY, Integer.toString(mostSignificantEffect.getCodonNum()));
|
||||
if ( mostSignificantEffect.hasCdsSize() )
|
||||
annotations.put(CDS_SIZE_KEY, Integer.toString(mostSignificantEffect.getCdsSize()));
|
||||
|
||||
return annotations;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
return Arrays.asList( GENE_ID_KEY,
|
||||
GENE_NAME_KEY,
|
||||
TRANSCRIPT_ID_KEY,
|
||||
EXON_ID_KEY,
|
||||
EXON_RANK_KEY,
|
||||
WITHIN_NON_CODING_GENE_KEY,
|
||||
EFFECT_KEY,
|
||||
EFFECT_IMPACT_KEY,
|
||||
EFFECT_EXTRA_INFORMATION_KEY,
|
||||
OLD_NEW_AA_KEY,
|
||||
OLD_NEW_CODON_KEY,
|
||||
CODON_NUM_KEY,
|
||||
CDS_SIZE_KEY
|
||||
return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
|
||||
InfoFieldKey.IMPACT_KEY.getKeyName(),
|
||||
InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_NAME_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
|
||||
InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
|
||||
InfoFieldKey.EXON_ID_KEY.getKeyName()
|
||||
);
|
||||
}
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(
|
||||
new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"),
|
||||
new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
|
||||
new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
|
||||
new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant")
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper class to parse, validate, and store a single SnpEff effect and its metadata.
|
||||
*/
|
||||
protected static class SnpEffEffect {
|
||||
private EffectType effect;
|
||||
private EffectImpact impact;
|
||||
private String codonChange;
|
||||
private String aminoAcidChange;
|
||||
private String geneName;
|
||||
private String geneBiotype;
|
||||
private EffectCoding coding;
|
||||
private String transcriptID;
|
||||
private String exonID;
|
||||
|
||||
private String parseError = null;
|
||||
private boolean isWellFormed = true;
|
||||
|
||||
private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;
|
||||
|
||||
// Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
|
||||
// errors come after warnings, not vice versa:
|
||||
private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
|
||||
private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;
|
||||
|
||||
private static final int SNPEFF_CODING_FIELD_INDEX = 5;
|
||||
|
||||
public SnpEffEffect ( String effectName, String[] effectMetadata ) {
|
||||
parseEffectName(effectName);
|
||||
parseEffectMetadata(effectMetadata);
|
||||
}
|
||||
|
||||
private void parseEffectName ( String effectName ) {
|
||||
try {
|
||||
effect = EffectType.valueOf(effectName);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("%s is not a recognized effect type", effectName));
|
||||
}
|
||||
}
|
||||
|
||||
private void parseEffectMetadata ( String[] effectMetadata ) {
|
||||
if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
|
||||
if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
|
||||
parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
|
||||
}
|
||||
else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
|
||||
parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
|
||||
}
|
||||
else {
|
||||
parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
|
||||
EXPECTED_NUMBER_OF_METADATA_FIELDS, effectMetadata.length));
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
|
||||
}
|
||||
|
||||
codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
|
||||
aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
|
||||
geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
|
||||
geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
|
||||
|
||||
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
|
||||
try {
|
||||
coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("Unrecognized value for effect coding: %s", effectMetadata[SNPEFF_CODING_FIELD_INDEX]));
|
||||
}
|
||||
}
|
||||
else {
|
||||
coding = EffectCoding.UNKNOWN;
|
||||
}
|
||||
|
||||
transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()];
|
||||
exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()];
|
||||
}
|
||||
|
||||
private void parseError ( String message ) {
|
||||
isWellFormed = false;
|
||||
|
||||
// Cache only the first error encountered:
|
||||
if ( parseError == null ) {
|
||||
parseError = message;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isWellFormed() {
|
||||
return isWellFormed;
|
||||
}
|
||||
|
||||
public String getParseError() {
|
||||
return parseError == null ? "" : parseError;
|
||||
}
|
||||
|
||||
public boolean isCoding() {
|
||||
return coding == EffectCoding.CODING;
|
||||
}
|
||||
|
||||
public boolean isHigherImpactThan ( SnpEffEffect other ) {
|
||||
// If one effect is within a coding gene and the other is not, the effect that is
|
||||
// within the coding gene has higher impact:
|
||||
|
||||
if ( isCoding() && ! other.isCoding() ) {
|
||||
return true;
|
||||
}
|
||||
else if ( ! isCoding() && other.isCoding() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Otherwise, both effects are either in or not in a coding gene, so we compare the impacts
|
||||
// of the effects themselves:
|
||||
|
||||
return impact.isHigherImpactThan(other.impact);
|
||||
}
|
||||
|
||||
public Map<String, Object> getAnnotations() {
|
||||
Map<String, Object> annotations = new LinkedHashMap<String, Object>(Utils.optimumHashSize(InfoFieldKey.values().length));
|
||||
|
||||
addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
|
||||
addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
|
||||
addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
|
||||
addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
|
||||
|
||||
return annotations;
|
||||
}
|
||||
|
||||
private void addAnnotation ( Map<String, Object> annotations, String keyName, String keyValue ) {
|
||||
// Only add annotations for keys associated with non-empty values:
|
||||
if ( keyValue != null && keyValue.trim().length() > 0 ) {
|
||||
annotations.put(keyName, keyValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -40,7 +40,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
|
@ -86,14 +85,15 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
public RodBinding<VariantContext> getVariantRodBinding() { return variantCollection.variants; }
|
||||
|
||||
/**
|
||||
* The INFO field will be annotated with information on the most biologically-significant effect
|
||||
* listed in the SnpEff output file for each variant.
|
||||
*/
|
||||
@Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false)
|
||||
public RodBinding<SnpEffFeature> snpEffFile;
|
||||
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return snpEffFile; }
|
||||
public RodBinding<VariantContext> snpEffFile;
|
||||
public RodBinding<VariantContext> getSnpEffRodBinding() { return snpEffFile; }
|
||||
|
||||
/**
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
|
|
@ -203,11 +203,13 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
}
|
||||
|
||||
if ( USE_ALL_ANNOTATIONS )
|
||||
engine = new VariantAnnotatorEngine(this);
|
||||
engine = new VariantAnnotatorEngine(this, getToolkit());
|
||||
else
|
||||
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
|
||||
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit());
|
||||
engine.initializeExpressions(expressionsToUse);
|
||||
|
||||
engine.invokeAnnotationInitializationMethods();
|
||||
|
||||
// setup the header fields
|
||||
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
|
|
|
|||
|
|
@ -26,13 +26,11 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
|
|
@ -49,6 +47,7 @@ public class VariantAnnotatorEngine {
|
|||
|
||||
private HashMap<RodBinding<VariantContext>, String> dbAnnotations = new HashMap<RodBinding<VariantContext>, String>();
|
||||
private AnnotatorCompatibleWalker walker;
|
||||
private GenomeAnalysisEngine toolkit;
|
||||
|
||||
private static class VAExpression {
|
||||
|
||||
|
|
@ -74,16 +73,18 @@ public class VariantAnnotatorEngine {
|
|||
}
|
||||
|
||||
// use this constructor if you want all possible annotations
|
||||
public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker) {
|
||||
public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
|
||||
this.walker = walker;
|
||||
this.toolkit = toolkit;
|
||||
requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations();
|
||||
requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations();
|
||||
initializeDBs();
|
||||
}
|
||||
|
||||
// use this constructor if you want to select specific annotations (and/or interfaces)
|
||||
public VariantAnnotatorEngine(List<String> annotationGroupsToUse, List<String> annotationsToUse, AnnotatorCompatibleWalker walker) {
|
||||
public VariantAnnotatorEngine(List<String> annotationGroupsToUse, List<String> annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
|
||||
this.walker = walker;
|
||||
this.toolkit = toolkit;
|
||||
initializeAnnotations(annotationGroupsToUse, annotationsToUse);
|
||||
initializeDBs();
|
||||
}
|
||||
|
|
@ -113,6 +114,16 @@ public class VariantAnnotatorEngine {
|
|||
dbAnnotations.put(rod, rod.getName());
|
||||
}
|
||||
|
||||
public void invokeAnnotationInitializationMethods() {
|
||||
for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
|
||||
annotation.initialize(walker, toolkit);
|
||||
}
|
||||
|
||||
for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
|
||||
annotation.initialize(walker, toolkit);
|
||||
}
|
||||
}
|
||||
|
||||
public Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
|
||||
|
||||
Set<VCFHeaderLine> descriptions = new HashSet<VCFHeaderLine>();
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
|
||||
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.List;
|
||||
|
|
@ -9,8 +8,9 @@ import java.util.List;
|
|||
public interface AnnotatorCompatibleWalker {
|
||||
|
||||
// getter methods for various used bindings
|
||||
public abstract RodBinding<SnpEffFeature> getSnpEffRodBinding();
|
||||
public abstract RodBinding<VariantContext> getVariantRodBinding();
|
||||
public abstract RodBinding<VariantContext> getSnpEffRodBinding();
|
||||
public abstract RodBinding<VariantContext> getDbsnpRodBinding();
|
||||
public abstract List<RodBinding<VariantContext>> getCompRodBindings();
|
||||
public abstract List<RodBinding<VariantContext>> getResourceRodBindings();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,18 +24,16 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations")
|
||||
public abstract class VariantAnnotatorAnnotation {
|
||||
// return the INFO keys
|
||||
public abstract List<String> getKeyNames();
|
||||
|
||||
// initialization method (optional for subclasses, and therefore non-abstract)
|
||||
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { }
|
||||
}
|
||||
|
|
@ -79,24 +79,45 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
/**
|
||||
* If this argument is present, the original allele frequencies and counts from this vcf are added as annotations ACH,AFH and ANH. at each record present in this vcf
|
||||
*/
|
||||
@Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false)
|
||||
public RodBinding<VariantContext> comp;
|
||||
|
||||
|
||||
/**
|
||||
* This required argument is used to annotate each site in the vcf INFO field with R2 annotation. Will be NaN if Beagle determined there are no variant samples.
|
||||
*/
|
||||
@Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true)
|
||||
public RodBinding<BeagleFeature> beagleR2;
|
||||
|
||||
/**
|
||||
* These values will populate the GL field for each sample and contain the posterior probability of each genotype given the data after phasing and imputation.
|
||||
*/
|
||||
@Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true)
|
||||
public RodBinding<BeagleFeature> beagleProbs;
|
||||
|
||||
/**
|
||||
* By default, all genotypes will be marked in the VCF as "phased", using the "|" separator after Beagle.
|
||||
*/
|
||||
@Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true)
|
||||
public RodBinding<BeagleFeature> beaglePhased;
|
||||
|
||||
@Output(doc="VCF File to which variants should be written",required=true)
|
||||
protected VCFWriter vcfWriter = null;
|
||||
|
||||
/**
|
||||
* If this argument is absent, and if Beagle determines that there is no sample in a site that has a variant genotype, the site will be marked as filtered (Default behavior).
|
||||
* If the argument is present, the site won't be marked as filtered under this condition even if there are no variant genotypes.
|
||||
*/
|
||||
@Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false)
|
||||
public boolean DONT_FILTER_MONOMORPHIC_SITES = false;
|
||||
|
||||
/**
|
||||
* Value between 0 and 1. If the probability of getting a genotype correctly (based on the posterior genotype probabilities and the actual genotype) is below this threshold,
|
||||
* a genotype will be substitute by a no-call.
|
||||
*/
|
||||
@Argument(fullName="no" +
|
||||
"call_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false)
|
||||
private double noCallThreshold = 0.0;
|
||||
|
|
@ -154,21 +175,16 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
BeagleFeature beagleR2Feature = tracker.getFirstValue(beagleR2);
|
||||
// ignore places where we don't have a variant
|
||||
if ( beagleR2Feature == null )
|
||||
return 0;
|
||||
|
||||
|
||||
BeagleFeature beagleProbsFeature = tracker.getFirstValue(beagleProbs);
|
||||
|
||||
// ignore places where we don't have a variant
|
||||
if ( beagleProbsFeature == null )
|
||||
return 0;
|
||||
|
||||
BeagleFeature beaglePhasedFeature = tracker.getFirstValue(beaglePhased);
|
||||
|
||||
// ignore places where we don't have a variant
|
||||
if ( beaglePhasedFeature == null )
|
||||
return 0;
|
||||
if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null)
|
||||
{
|
||||
vcfWriter.add(vc_input);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
// get reference base for current position
|
||||
byte refByte = ref.getBase();
|
||||
|
|
|
|||
|
|
@ -112,6 +112,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
|
||||
VCFWriter bootstrapVCFOutput = null;
|
||||
|
||||
/**
|
||||
* If sample gender is known, this flag should be set to true to ensure that Beagle treats male Chr X properly.
|
||||
*/
|
||||
@Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false)
|
||||
public boolean CHECK_IS_MALE_ON_CHR_X = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -63,20 +63,32 @@ import java.util.*;
|
|||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more bam files (with proper headers) to be analyzed for coverage statistics
|
||||
* (Optional) A REFSEQ Rod to aggregate coverage to the gene level
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
*(Optional) A REFSEQ Rod to aggregate coverage to the gene level
|
||||
* <p>
|
||||
* (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation)
|
||||
*</p></p>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
|
||||
* </p><p>
|
||||
* - no suffix: per locus coverage
|
||||
* </p><p>
|
||||
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
|
||||
* </p><p>
|
||||
* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
|
||||
* </p><p>
|
||||
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
|
||||
* </p><p>
|
||||
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
|
||||
* </p><p>
|
||||
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
|
||||
* </p><p>
|
||||
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
|
||||
* </p><p>
|
||||
* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
|
||||
* </p><p>
|
||||
* - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases
|
||||
* </p>
|
||||
*
|
||||
|
|
@ -84,7 +96,7 @@ import java.util.*;
|
|||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T VariantEval \
|
||||
* -T DepthOfCoverage \
|
||||
* -o file_name_base \
|
||||
* -I input_bams.list
|
||||
* [-geneList refSeq.sorted.txt] \
|
||||
|
|
|
|||
|
|
@ -43,8 +43,10 @@ import java.util.List;
|
|||
* Generates an alternative reference sequence over the specified interval.
|
||||
*
|
||||
* <p>
|
||||
* Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
|
||||
* Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
|
||||
* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
|
||||
* Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
|
||||
* Note that if there are multiple variants at a site, it takes the first one seen.
|
||||
* Reference bases for each interval will be output as a separate fasta sequence (named numerically in order).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
|
|
|
|||
|
|
@ -42,6 +42,9 @@ import java.io.PrintStream;
|
|||
*
|
||||
* <p>
|
||||
* The output format can be partially controlled using the provided command-line arguments.
|
||||
* Specify intervals with the usual -L argument to output only the reference bases within your intervals.
|
||||
* Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
|
||||
* separate fasta sequence (named numerically in order).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
|
|
|
|||
|
|
@ -276,8 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
if ( elt.isReducedRead() ) {
|
||||
// reduced read representation
|
||||
byte qual = elt.getReducedQual();
|
||||
add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
|
||||
return elt.getReducedCount(); // we added nObs bases here
|
||||
if ( BaseUtils.isRegularBase( elt.getBase() )) {
|
||||
add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
|
||||
return elt.getReducedCount(); // we added nObs bases here
|
||||
} else // odd bases or deletions => don't use them
|
||||
return 0;
|
||||
} else {
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
private boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
|
||||
|
||||
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
|
||||
|
||||
final private ExactCalculation calcToUse;
|
||||
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
|
|
@ -178,22 +178,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
}
|
||||
|
||||
private static final double[][] getGLs(Map<String, Genotype> GLs) {
|
||||
double[][] genotypeLikelihoods = new double[GLs.size()+1][];
|
||||
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
|
||||
|
||||
int j = 0;
|
||||
//int j = 0;
|
||||
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
||||
for ( Genotype sample : GLs.values() ) {
|
||||
j++;
|
||||
|
||||
if ( sample.hasLikelihoods() ) {
|
||||
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
|
||||
genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector();
|
||||
double[] gls = sample.getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
|
||||
genotypeLikelihoods.add(gls);
|
||||
}
|
||||
}
|
||||
|
||||
return genotypeLikelihoods;
|
||||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Linearized, ~O(N), implementation.
|
||||
|
|
@ -318,9 +321,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
public int linearExact(Map<String, Genotype> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||
final int numSamples = GLs.size();
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
final double[][] genotypeLikelihoods = getGLs(GLs);
|
||||
|
||||
final ExactACCache logY = new ExactACCache(numSamples+1);
|
||||
logY.getkMinus0()[0] = 0.0; // the zero case
|
||||
|
|
@ -334,14 +337,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
if ( k == 0 ) { // special case for k = 0
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA];
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
|
||||
}
|
||||
} else { // k > 0
|
||||
final double[] kMinus1 = logY.getkMinus1();
|
||||
final double[] kMinus2 = logY.getkMinus2();
|
||||
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
final double[] gl = genotypeLikelihoods[j];
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
double aa = Double.NEGATIVE_INFINITY;
|
||||
|
|
@ -434,10 +437,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
if ( !vc.isVariant() )
|
||||
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
|
||||
|
||||
boolean multiAllelicRecord = false;
|
||||
|
||||
if (vc.getAlternateAlleles().size() > 1)
|
||||
multiAllelicRecord = true;
|
||||
|
||||
Map<String, Genotype> GLs = vc.getGenotypes();
|
||||
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
|
||||
|
|
@ -454,7 +453,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
pathMetricArray[0][0] = 0.0;
|
||||
|
||||
// todo = can't deal with optimal dynamic programming solution with multiallelic records
|
||||
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) {
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
sampleIndices.addAll(GLs.keySet());
|
||||
sampleIdx = GLs.size();
|
||||
}
|
||||
|
|
@ -465,6 +464,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
continue;
|
||||
|
||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
|
||||
//System.out.print(sample.getKey()+":");
|
||||
//for (int k=0; k < likelihoods.length; k++)
|
||||
// System.out.format("%4.2f ",likelihoods[k]);
|
||||
//System.out.println();
|
||||
// all likelihoods are essentially the same: skip this sample and will later on force no call.
|
||||
//sampleIdx++;
|
||||
continue;
|
||||
}
|
||||
|
||||
sampleIndices.add(sample.getKey());
|
||||
|
||||
for (int k=0; k <= AFofMaxLikelihood; k++) {
|
||||
|
|
@ -504,22 +514,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
Genotype g = GLs.get(sample);
|
||||
if ( !g.hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord)
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
|
||||
else {
|
||||
int newIdx = tracebackArray[k][startIdx];
|
||||
bestGTguess = startIdx - newIdx;
|
||||
startIdx = newIdx;
|
||||
}
|
||||
|
||||
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
|
||||
// and will add no-call genotype to GL's in a second pass
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
double qual = Double.NEGATIVE_INFINITY;
|
||||
double[] likelihoods = g.getLikelihoods().getAsVector();
|
||||
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
|
||||
}
|
||||
else {
|
||||
int newIdx = tracebackArray[k][startIdx];;
|
||||
bestGTguess = startIdx - newIdx;
|
||||
startIdx = newIdx;
|
||||
}
|
||||
|
||||
/* System.out.format("Sample: %s GL:",sample);
|
||||
for (int i=0; i < likelihoods.length; i++)
|
||||
System.out.format("%1.4f ",likelihoods[i]);
|
||||
System.out.format("%1.4f, ",likelihoods[i]);
|
||||
*/
|
||||
|
||||
for (int i=0; i < likelihoods.length; i++) {
|
||||
|
|
@ -570,6 +583,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
||||
|
||||
if ( !sample.getValue().hasLikelihoods() )
|
||||
continue;
|
||||
Genotype g = GLs.get(sample.getKey());
|
||||
|
||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
|
||||
continue; // regular likelihoods
|
||||
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
double qual = Genotype.NO_NEG_LOG_10PERROR;
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
//System.out.println(myAlleles.toString());
|
||||
calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
|
||||
}
|
||||
return calls;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -32,7 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
|
|
@ -321,7 +323,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
haplotypeMap.clear();
|
||||
|
||||
if (getAlleleListFromVCF) {
|
||||
for( final VariantContext vc_input : tracker.getValues(UAC.alleles) ) {
|
||||
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
|
||||
if( vc_input != null &&
|
||||
allowableTypes.contains(vc_input.getType()) &&
|
||||
ref.getLocus().getStart() == vc_input.getStart()) {
|
||||
|
|
@ -413,16 +415,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
if (pileup != null ) {
|
||||
double[] genotypeLikelihoods;
|
||||
|
||||
if (useOldWrongHorribleHackedUpLikelihoodModel)
|
||||
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
|
||||
else
|
||||
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
|
||||
|
||||
|
||||
|
||||
// which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example
|
||||
// for 3 alleles it's 00 01 11 02 12 22
|
||||
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
|
||||
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
|
||||
alleleList,
|
||||
genotypeLikelihoods,
|
||||
getFilteredDepth(pileup)));
|
||||
|
|
@ -444,4 +444,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
return indelLikelihoodMap.get();
|
||||
}
|
||||
|
||||
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
|
||||
// so that per-sample DP will include deletions covering the event.
|
||||
protected int getFilteredDepth(ReadBackedPileup pileup) {
|
||||
int count = 0;
|
||||
for ( PileupElement p : pileup ) {
|
||||
if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) )
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -26,14 +26,12 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
|
@ -58,25 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
}
|
||||
|
||||
public static VariantContext getSNPVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
|
||||
if ( tracker == null || ref == null || logger == null )
|
||||
throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger);
|
||||
VariantContext vc = null;
|
||||
|
||||
// search for usable record
|
||||
for( final VariantContext vc_input : tracker.getValues(allelesBinding) ) {
|
||||
if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) {
|
||||
if ( vc == null ) {
|
||||
vc = vc_input;
|
||||
} else {
|
||||
logger.warn("Multiple valid VCF records detected at site " + ref.getLocus() + ", only considering alleles from first record");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return vc;
|
||||
}
|
||||
|
||||
public Allele getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
|
|
@ -96,7 +75,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
if ( alternateAlleleToUse != null ) {
|
||||
bestAlternateAllele = alternateAlleleToUse.getBases()[0];
|
||||
} else if ( useAlleleFromVCF ) {
|
||||
VariantContext vc = getSNPVCFromAllelesRod(tracker, ref, true, logger, UAC.alleles);
|
||||
VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
|
||||
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output;
|
|||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
|
|
|
|||
|
|
@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -127,7 +126,8 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
|
||||
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return null; }
|
||||
public RodBinding<VariantContext> getVariantRodBinding() { return null; }
|
||||
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
|
||||
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
||||
|
||||
|
|
@ -210,7 +210,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
if ( verboseWriter != null )
|
||||
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
|
||||
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, this);
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, this, getToolkit());
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);
|
||||
|
||||
// initialize the header
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
|
|
@ -36,13 +37,11 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
|||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
|
@ -236,10 +235,11 @@ public class UnifiedGenotyperEngine {
|
|||
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
|
||||
VariantContext vc;
|
||||
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger, UAC.alleles);
|
||||
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if ( vcInput == null )
|
||||
return null;
|
||||
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles());
|
||||
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase());
|
||||
|
||||
} else {
|
||||
// deal with bad/non-standard reference bases
|
||||
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
|
||||
|
|
@ -635,7 +635,7 @@ public class UnifiedGenotyperEngine {
|
|||
// no extended event pileup
|
||||
// if we're genotyping given alleles and we have a requested SNP at this position, do SNP
|
||||
if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
|
||||
VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, refContext, false, logger, UAC.alleles);
|
||||
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if (vcInput == null)
|
||||
return null;
|
||||
|
||||
|
|
@ -741,4 +741,23 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
return afcm;
|
||||
}
|
||||
|
||||
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
|
||||
if ( tracker == null || ref == null || logger == null )
|
||||
throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger);
|
||||
VariantContext vc = null;
|
||||
|
||||
// search for usable record
|
||||
for( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) {
|
||||
if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) {
|
||||
if ( vc == null ) {
|
||||
vc = vc_input;
|
||||
} else {
|
||||
logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel {
|
|||
baseMatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
|
||||
double baseProb = QualityUtils.qualToProb(k);
|
||||
double baseProb = QualityUtils.qualToProb((byte)k);
|
||||
|
||||
|
||||
baseMatchArray[k] = probToQual(baseProb);
|
||||
|
|
|
|||
|
|
@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well,
|
||||
* but this latter functionality is now superceded by UnifiedGenotyper.
|
||||
*
|
||||
* <p>
|
||||
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
|
||||
* data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read
|
||||
* and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample,
|
||||
* or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified,
|
||||
* the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline
|
||||
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic
|
||||
* if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains
|
||||
* only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords.
|
||||
* data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
|
||||
* include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many
|
||||
* forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
|
||||
* statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
|
||||
* attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
|
||||
* metrics for the post-processing tools to make the final decision). The calls are performed by default
|
||||
* from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I
|
||||
* command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments,
|
||||
* respectively. Indels are called from the tumor sample and annotated as germline
|
||||
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic
|
||||
* if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling
|
||||
* is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold
|
||||
* in tumor sample, it will not be reported.
|
||||
*
|
||||
* <b>If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you,
|
||||
* please email asivache at broadinstitute dot org and he will gladly explain everything in more detail.</b>
|
||||
* To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input
|
||||
* bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
|
||||
* on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Indel calls with associated metrics.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SomaticIndelDetector \
|
||||
* -o indels.vcf \
|
||||
* -verbose indels.txt
|
||||
* -I:normal normal.bam \
|
||||
* -I:tumor tumor.bam
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
||||
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
|
||||
public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
||||
// @Output
|
||||
// PrintStream out;
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
@Output(doc="File to write variants (indels) in VCF format",required=true)
|
||||
protected VCFWriter vcf_writer = null;
|
||||
|
||||
@Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
|
||||
|
|
@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
|||
|
||||
@Hidden
|
||||
@Argument(fullName = "genotype_intervals", shortName = "genotype",
|
||||
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
|
||||
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false)
|
||||
public String genotypeIntervalsFile = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
|
||||
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
|
||||
"if the list turns out to be unsorted, it will throw an exception. "+
|
||||
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
|
||||
"to sort and keep it in memory (increases memory usage!).")
|
||||
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
|
||||
"if the list turns out to be unsorted, it will throw an exception. "+
|
||||
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
|
||||
"to sort and keep it in memory (increases memory usage!).")
|
||||
protected boolean GENOTYPE_NOT_SORTED = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="unpaired", shortName="unpaired",
|
||||
doc="Perform unpaired calls (no somatic status detection)", required=false)
|
||||
@Argument(fullName="unpaired", shortName="unpaired",
|
||||
doc="Perform unpaired calls (no somatic status detection)", required=false)
|
||||
boolean call_unpaired = false;
|
||||
boolean call_somatic ;
|
||||
boolean call_somatic ;
|
||||
|
||||
@Argument(fullName="verboseOutput", shortName="verbose",
|
||||
doc="Verbose output file in text format", required=false)
|
||||
java.io.File verboseOutput = null;
|
||||
@Argument(fullName="verboseOutput", shortName="verbose",
|
||||
doc="Verbose output file in text format", required=false)
|
||||
java.io.File verboseOutput = null;
|
||||
|
||||
@Argument(fullName="bedOutput", shortName="bed",
|
||||
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
|
||||
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
|
||||
java.io.File bedOutput = null;
|
||||
|
||||
@Argument(fullName="minCoverage", shortName="minCoverage",
|
||||
doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false)
|
||||
int minCoverage = 6;
|
||||
@Argument(fullName="minCoverage", shortName="minCoverage",
|
||||
doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
|
||||
"with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
|
||||
int minCoverage = 6;
|
||||
|
||||
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
|
||||
doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
|
||||
int minNormalCoverage = 4;
|
||||
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
|
||||
doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+
|
||||
"or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
|
||||
int minNormalCoverage = 4;
|
||||
|
||||
@Argument(fullName="minFraction", shortName="minFraction",
|
||||
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
|
||||
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
|
||||
double minFraction = 0.3;
|
||||
@Argument(fullName="minFraction", shortName="minFraction",
|
||||
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
|
||||
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
|
||||
double minFraction = 0.3;
|
||||
|
||||
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
|
||||
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false)
|
||||
double minConsensusFraction = 0.7;
|
||||
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
|
||||
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+
|
||||
"all indel observations at the site exceeds this threshold", required=false)
|
||||
double minConsensusFraction = 0.7;
|
||||
|
||||
@Argument(fullName="minIndelCount", shortName="minCnt",
|
||||
doc="Minimum count of reads supporting consensus indel required for making the call. "+
|
||||
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
|
||||
"(minIndelCount not met) will not pass.", required=false)
|
||||
int minIndelCount = 0;
|
||||
@Argument(fullName="minIndelCount", shortName="minCnt",
|
||||
doc="Minimum count of reads supporting consensus indel required for making the call. "+
|
||||
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
|
||||
"(minIndelCount not met) will not pass.", required=false)
|
||||
int minIndelCount = 0;
|
||||
|
||||
@Argument(fullName="refseq", shortName="refseq",
|
||||
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
|
||||
String RefseqFileName = null;
|
||||
@Argument(fullName="refseq", shortName="refseq",
|
||||
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+
|
||||
"GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
|
||||
String RefseqFileName = null;
|
||||
|
||||
@Argument(fullName="blacklistedLanes", shortName="BL",
|
||||
doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
|
||||
"by this application, so they will not contribute indels to consider and will not be counted.", required=false)
|
||||
PlatformUnitFilterHelper dummy;
|
||||
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false;
|
||||
//@Argument(fullName="blacklistedLanes", shortName="BL",
|
||||
// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
|
||||
// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
|
||||
//PlatformUnitFilterHelper dummy;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",
|
||||
required=false) Boolean DEBUG = false;
|
||||
@Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
|
||||
"May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200;
|
||||
"May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+
|
||||
"window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+
|
||||
"than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+
|
||||
"if window is too small",required=false) int WINDOW_SIZE = 200;
|
||||
@Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
|
||||
" the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
|
||||
|
||||
|
||||
|
||||
private WindowContext tumor_context;
|
||||
private WindowContext normal_context;
|
||||
private int currentContigIndex = -1;
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray {
|
|||
public PhasingRead(int length, int mappingQual) {
|
||||
super(length);
|
||||
|
||||
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
|
||||
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual));
|
||||
|
||||
this.baseProbs = new PreciseNonNegativeDouble[length];
|
||||
Arrays.fill(this.baseProbs, null);
|
||||
|
|
|
|||
|
|
@ -44,12 +44,12 @@ public class RefSeqDataParser {
|
|||
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
|
||||
|
||||
Map<String, String> entriesToNames = new HashMap<String, String>();
|
||||
Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
|
||||
if (numRecords != null) {
|
||||
int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1);
|
||||
if (numRecords != -1) {
|
||||
boolean done = false;
|
||||
|
||||
if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1":
|
||||
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
|
||||
String name = vc.getAttributeAsString(nameKeyToUse, null);
|
||||
if (name != null) {
|
||||
entriesToNames.put(nameKeyToUse, name);
|
||||
done = true;
|
||||
|
|
@ -59,14 +59,14 @@ public class RefSeqDataParser {
|
|||
if (!done) {
|
||||
for (int i = 1; i <= numRecords; i++) {
|
||||
String key = nameKeyToUseMultiplePrefix + i;
|
||||
String name = vc.getAttributeAsStringNoException(key);
|
||||
String name = vc.getAttributeAsString(key, null);
|
||||
if (name != null)
|
||||
entriesToNames.put(key, name);
|
||||
}
|
||||
}
|
||||
}
|
||||
else { // no entry with the # of records:
|
||||
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
|
||||
String name = vc.getAttributeAsString(nameKeyToUse, null);
|
||||
if (name != null) {
|
||||
entriesToNames.put(nameKeyToUse, name);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -76,6 +76,42 @@ import java.util.Map;
|
|||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
|
||||
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
|
||||
*
|
||||
* The first 20 lines of such a file is shown below.
|
||||
* * The file begins with a series of comment lines describing:
|
||||
* ** The number of counted loci
|
||||
* ** The number of counted bases
|
||||
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
|
||||
*
|
||||
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
|
||||
*
|
||||
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
|
||||
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
|
||||
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*
|
||||
* <pre>
|
||||
* # Counted Sites 19451059
|
||||
* # Counted Bases 56582018
|
||||
* # Skipped Sites 82666
|
||||
* # Fraction Skipped 1 / 235 bp
|
||||
* ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
|
||||
* SRR006446,11,65,CA,9,1,10
|
||||
* SRR006446,11,48,TA,10,0,40
|
||||
* SRR006446,11,67,AA,27,0,40
|
||||
* SRR006446,11,61,GA,11,1,10
|
||||
* SRR006446,12,34,CA,47,1,17
|
||||
* SRR006446,12,30,GA,52,1,17
|
||||
* SRR006446,12,36,AA,352,1,25
|
||||
* SRR006446,12,17,TA,182,11,12
|
||||
* SRR006446,11,48,TG,2,0,40
|
||||
* SRR006446,11,67,AG,1,0,40
|
||||
* SRR006446,12,34,CG,9,0,40
|
||||
* SRR006446,12,30,GG,43,0,40
|
||||
* ERR001876,4,31,AG,1,0,40
|
||||
* ERR001876,4,31,AT,2,2,1
|
||||
* ERR001876,4,31,CA,1,0,40
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ import java.util.List;
|
|||
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
|
||||
*</pre>
|
||||
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
|
||||
*
|
||||
*<pre>
|
||||
* Valid // amplicon is valid
|
||||
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
|
||||
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
|
||||
|
|
@ -72,10 +72,10 @@ import java.util.List;
|
|||
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
|
||||
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
|
||||
* INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
|
||||
* </p>
|
||||
* </pre></p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre></pre>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T ValidationAmplicons
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche;
|
||||
|
|
@ -24,6 +25,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
|
@ -53,7 +55,23 @@ import java.util.*;
|
|||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Evaluation tables.
|
||||
* Evaluation tables detailing the results of the eval modules which were applied.
|
||||
* For example:
|
||||
* <pre>
|
||||
* output.eval.gatkreport:
|
||||
* ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
|
||||
* CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ...
|
||||
* CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ...
|
||||
* CountVariants dbsnp CpG eval none known 65900028 47068 0 47068 0.00071423 ...
|
||||
* CountVariants dbsnp CpG eval none novel 65900028 88702 0 88702 0.00134601 ...
|
||||
* CountVariants dbsnp all eval none all 65900028 330818 0 330818 0.00502000 ...
|
||||
* CountVariants dbsnp all eval none known 65900028 120685 0 120685 0.00183133 ...
|
||||
* CountVariants dbsnp all eval none novel 65900028 210133 0 210133 0.00318866 ...
|
||||
* CountVariants dbsnp non_CpG eval none all 65900028 195048 0 195048 0.00295976 ...
|
||||
* CountVariants dbsnp non_CpG eval none known 65900028 73617 0 73617 0.00111710 ...
|
||||
* CountVariants dbsnp non_CpG eval none novel 65900028 121431 0 121431 0.00184265 ...
|
||||
* ...
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
|
|
@ -147,12 +165,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;
|
||||
|
||||
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false)
|
||||
private String TRANCHE_FILENAME = null;
|
||||
|
||||
@Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false)
|
||||
private File ancestralAlignmentsFile = null;
|
||||
|
||||
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
|
||||
private boolean requireStrictAlleleMatch = false;
|
||||
|
||||
// Variables
|
||||
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
|
||||
|
||||
|
|
@ -230,16 +248,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
jexlExpressions.add(sjexl);
|
||||
}
|
||||
|
||||
// Add select expressions for anything in the tranches file
|
||||
if ( TRANCHE_FILENAME != null ) {
|
||||
// we are going to build a few select names automatically from the tranches file
|
||||
for ( Tranche t : Tranche.readTranches(new File(TRANCHE_FILENAME)) ) {
|
||||
logger.info("Adding select for all variant above the pCut of : " + t);
|
||||
SELECT_EXPS.add(String.format(VariantRecalibrator.VQS_LOD_KEY + " >= %.2f", t.minVQSLod));
|
||||
SELECT_NAMES.add(String.format("TS-%.2f", t.ts));
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize the set of stratifications and evaluations to use
|
||||
stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
|
||||
Set<Class<? extends VariantEvaluator>> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
|
||||
|
|
@ -371,16 +379,16 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
if ( matchingComps.size() == 0 )
|
||||
return null;
|
||||
|
||||
// find the comp which matches the alternate allele from eval
|
||||
// find the comp which matches both the reference allele and alternate allele from eval
|
||||
Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0);
|
||||
for ( VariantContext comp : matchingComps ) {
|
||||
Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0);
|
||||
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp)) )
|
||||
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) )
|
||||
return comp;
|
||||
}
|
||||
|
||||
// if none match, just return the first one
|
||||
return matchingComps.get(0);
|
||||
// if none match, just return the first one unless we require a strict match
|
||||
return (requireStrictAlleleMatch ? null : matchingComps.get(0));
|
||||
}
|
||||
|
||||
public Integer treeReduce(Integer lhs, Integer rhs) { return null; }
|
||||
|
|
|
|||
|
|
@ -22,9 +22,6 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
|
|||
@DataPoint(description = "number of eval SNP sites")
|
||||
long nEvalVariants = 0;
|
||||
|
||||
@DataPoint(description = "number of comp SNP sites")
|
||||
long nCompVariants = 0;
|
||||
|
||||
@DataPoint(description = "number of eval sites outside of comp sites")
|
||||
long novelSites = 0;
|
||||
|
||||
|
|
@ -75,10 +72,9 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
|
|||
}
|
||||
|
||||
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
boolean evalIsGood = eval != null && eval.isVariant();
|
||||
boolean compIsGood = comp != null && comp.isNotFiltered() && (eval == null || comp.getType() == eval.getType());
|
||||
boolean evalIsGood = eval != null && eval.isPolymorphic();
|
||||
boolean compIsGood = comp != null && comp.isNotFiltered();
|
||||
|
||||
if (compIsGood) nCompVariants++; // count the number of comp events
|
||||
if (evalIsGood) nEvalVariants++; // count the number of eval events
|
||||
|
||||
if (compIsGood && evalIsGood) {
|
||||
|
|
|
|||
|
|
@ -100,21 +100,22 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
|||
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
|
||||
// added in a proxy check for monomorphic status here.
|
||||
// Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call.
|
||||
if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() + vc1.getNoCallCount() == vc1.getNSamples()) ) {
|
||||
if ( vc1.isMonomorphic() ) {
|
||||
nRefLoci++;
|
||||
} else {
|
||||
switch (vc1.getType()) {
|
||||
case NO_VARIATION:
|
||||
// shouldn't get here
|
||||
break;
|
||||
case SNP:
|
||||
nVariantLoci++;
|
||||
nSNPs++;
|
||||
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
|
||||
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
|
||||
break;
|
||||
case MNP:
|
||||
nVariantLoci++;
|
||||
nMNPs++;
|
||||
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
|
||||
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
|
||||
break;
|
||||
case INDEL:
|
||||
nVariantLoci++;
|
||||
|
|
@ -136,7 +137,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
|||
|
||||
String refStr = vc1.getReference().getBaseString().toUpperCase();
|
||||
|
||||
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null;
|
||||
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null;
|
||||
// if (aaStr.equals(".")) {
|
||||
// aaStr = refStr;
|
||||
// }
|
||||
|
|
|
|||
|
|
@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
|
|||
}
|
||||
|
||||
public static Double getPQ(Genotype gt) {
|
||||
return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY);
|
||||
Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
|
||||
return d == -1 ? null : d;
|
||||
}
|
||||
|
||||
public static boolean topMatchesTop(AllelePair b1, AllelePair b2) {
|
||||
|
|
|
|||
|
|
@ -90,18 +90,19 @@ public class IndelLengthHistogram extends VariantEvaluator {
|
|||
public int getComparisonOrder() { return 1; } // need only the evals
|
||||
|
||||
public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( ! vc1.isBiallelic() && vc1.isIndel() ) {
|
||||
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
|
||||
return vc1.toString(); // biallelic sites are output
|
||||
}
|
||||
|
||||
if ( vc1.isIndel() ) {
|
||||
if ( vc1.isIndel() && vc1.isPolymorphic() ) {
|
||||
|
||||
if ( ! vc1.isBiallelic() ) {
|
||||
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
|
||||
return vc1.toString(); // biallelic sites are output
|
||||
}
|
||||
|
||||
// only count simple insertions/deletions, not complex indels
|
||||
if ( vc1.isSimpleInsertion() ) {
|
||||
indelHistogram.update(vc1.getAlternateAllele(0).length());
|
||||
} else if ( vc1.isSimpleDeletion() ) {
|
||||
indelHistogram.update(-vc1.getReference().length());
|
||||
} else {
|
||||
throw new ReviewedStingException("Indel type that is not insertion or deletion.");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -270,7 +270,7 @@ public class IndelStatistics extends VariantEvaluator {
|
|||
|
||||
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
if (eval != null ) {
|
||||
if (eval != null && eval.isPolymorphic()) {
|
||||
if ( indelStats == null ) {
|
||||
indelStats = new IndelStats(eval);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
|
|||
if ( eval.hasGenotypes() )
|
||||
ac = eval.getChromosomeCount(eval.getAlternateAllele(0));
|
||||
else if ( eval.hasAttribute("AC") ) {
|
||||
ac = Integer.valueOf(eval.getAttributeAsString("AC"));
|
||||
ac = eval.getAttributeAsInt("AC", -1);
|
||||
}
|
||||
|
||||
if ( ac != -1 ) {
|
||||
|
|
@ -166,7 +166,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
|
|||
}
|
||||
}
|
||||
|
||||
if ( eval.isSNP() && eval.isBiallelic() && metrics != null ) {
|
||||
if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) {
|
||||
metrics.incrValue(eval);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,77 +37,74 @@ public class ThetaVariantEvaluator extends VariantEvaluator {
|
|||
}
|
||||
|
||||
public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) {
|
||||
if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) {
|
||||
return null; //no interesting sites
|
||||
}
|
||||
|
||||
if (vc.hasGenotypes()) {
|
||||
//this maps allele to a count
|
||||
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
|
||||
|
||||
//this maps allele to a count
|
||||
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
|
||||
int numHetsHere = 0;
|
||||
float numGenosHere = 0;
|
||||
int numIndsHere = 0;
|
||||
|
||||
int numHetsHere = 0;
|
||||
float numGenosHere = 0;
|
||||
int numIndsHere = 0;
|
||||
for (Genotype genotype : vc.getGenotypes().values()) {
|
||||
numIndsHere++;
|
||||
if (!genotype.isNoCall()) {
|
||||
//increment stats for heterozygosity
|
||||
if (genotype.isHet()) {
|
||||
numHetsHere++;
|
||||
}
|
||||
|
||||
for (Genotype genotype : vc.getGenotypes().values()) {
|
||||
numIndsHere++;
|
||||
if (!genotype.isNoCall()) {
|
||||
//increment stats for heterozygosity
|
||||
if (genotype.isHet()) {
|
||||
numHetsHere++;
|
||||
}
|
||||
numGenosHere++;
|
||||
//increment stats for pairwise mismatches
|
||||
|
||||
numGenosHere++;
|
||||
//increment stats for pairwise mismatches
|
||||
|
||||
for (Allele allele : genotype.getAlleles()) {
|
||||
if (allele.isNonNull() && allele.isCalled()) {
|
||||
String alleleString = allele.toString();
|
||||
alleleCounts.putIfAbsent(alleleString, 0);
|
||||
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
|
||||
}
|
||||
for (Allele allele : genotype.getAlleles()) {
|
||||
if (allele.isNonNull() && allele.isCalled()) {
|
||||
String alleleString = allele.toString();
|
||||
alleleCounts.putIfAbsent(alleleString, 0);
|
||||
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (numGenosHere > 0) {
|
||||
//only if have one called genotype at least
|
||||
this.numSites++;
|
||||
}
|
||||
if (numGenosHere > 0) {
|
||||
//only if have one called genotype at least
|
||||
this.numSites++;
|
||||
|
||||
this.totalHet += numHetsHere / numGenosHere;
|
||||
this.totalHet += numHetsHere / numGenosHere;
|
||||
|
||||
//compute based on num sites
|
||||
float harmonicFactor = 0;
|
||||
for (int i = 1; i <= numIndsHere; i++) {
|
||||
harmonicFactor += 1.0 / i;
|
||||
}
|
||||
this.thetaRegionNumSites += 1.0 / harmonicFactor;
|
||||
//compute based on num sites
|
||||
float harmonicFactor = 0;
|
||||
for (int i = 1; i <= numIndsHere; i++) {
|
||||
harmonicFactor += 1.0 / i;
|
||||
}
|
||||
this.thetaRegionNumSites += 1.0 / harmonicFactor;
|
||||
|
||||
//now compute pairwise mismatches
|
||||
float numPairwise = 0;
|
||||
float numDiffs = 0;
|
||||
for (String allele1 : alleleCounts.keySet()) {
|
||||
int allele1Count = alleleCounts.get(allele1);
|
||||
//now compute pairwise mismatches
|
||||
float numPairwise = 0;
|
||||
float numDiffs = 0;
|
||||
for (String allele1 : alleleCounts.keySet()) {
|
||||
int allele1Count = alleleCounts.get(allele1);
|
||||
|
||||
for (String allele2 : alleleCounts.keySet()) {
|
||||
if (allele1.compareTo(allele2) < 0) {
|
||||
continue;
|
||||
}
|
||||
if (allele1 .compareTo(allele2) == 0) {
|
||||
numPairwise += allele1Count * (allele1Count - 1) * .5;
|
||||
for (String allele2 : alleleCounts.keySet()) {
|
||||
if (allele1.compareTo(allele2) < 0) {
|
||||
continue;
|
||||
}
|
||||
if (allele1 .compareTo(allele2) == 0) {
|
||||
numPairwise += allele1Count * (allele1Count - 1) * .5;
|
||||
|
||||
}
|
||||
else {
|
||||
int allele2Count = alleleCounts.get(allele2);
|
||||
numPairwise += allele1Count * allele2Count;
|
||||
numDiffs += allele1Count * allele2Count;
|
||||
}
|
||||
}
|
||||
else {
|
||||
int allele2Count = alleleCounts.get(allele2);
|
||||
numPairwise += allele1Count * allele2Count;
|
||||
numDiffs += allele1Count * allele2Count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numPairwise > 0) {
|
||||
this.totalAvgDiffs += numDiffs / numPairwise;
|
||||
}
|
||||
if (numPairwise > 0) {
|
||||
this.totalAvgDiffs += numDiffs / numPairwise;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
|
|||
}
|
||||
|
||||
public void updateTiTv(VariantContext vc, boolean updateStandard) {
|
||||
if (vc != null && vc.isSNP() && vc.isBiallelic()) {
|
||||
if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) {
|
||||
if (VariantContextUtils.isTransition(vc)) {
|
||||
if (updateStandard) nTiInComp++;
|
||||
else nTi++;
|
||||
|
|
@ -49,18 +49,14 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
|
|||
else nTv++;
|
||||
}
|
||||
|
||||
String refStr = vc.getReference().getBaseString().toUpperCase();
|
||||
String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase();
|
||||
|
||||
if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) {
|
||||
BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]);
|
||||
|
||||
//System.out.println(refStr + " " + vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase() + " " + aaSubType);
|
||||
|
||||
if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSITION) {
|
||||
nTiDerived++;
|
||||
} else if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSVERSION) {
|
||||
nTvDerived++;
|
||||
if (vc.hasAttribute("ANCESTRALALLELE")) {
|
||||
final String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", "null").toUpperCase();
|
||||
if ( ! aaStr.equals(".") ) {
|
||||
switch ( BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0] ) ) {
|
||||
case TRANSITION: nTiDerived++; break;
|
||||
case TRANSVERSION: nTvDerived++; break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -117,7 +117,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
|
|||
public SiteStatus calcSiteStatus(VariantContext vc) {
|
||||
if ( vc == null ) return SiteStatus.NO_CALL;
|
||||
if ( vc.isFiltered() ) return SiteStatus.FILTERED;
|
||||
if ( ! vc.isVariant() ) return SiteStatus.MONO;
|
||||
if ( vc.isMonomorphic() ) return SiteStatus.MONO;
|
||||
if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes
|
||||
|
||||
if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
|
||||
int ac = 0;
|
||||
|
|
@ -130,10 +131,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
|
|||
//// System.out.printf(" ac = %d%n", ac);
|
||||
}
|
||||
else
|
||||
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY);
|
||||
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
|
||||
return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO;
|
||||
} else if ( vc.hasGenotypes() ) {
|
||||
return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO;
|
||||
} else {
|
||||
return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do
|
||||
//return SiteStatus.NO_CALL; // we can't figure out what to do
|
||||
|
|
|
|||
|
|
@ -232,7 +232,7 @@ public class VariantQualityScore extends VariantEvaluator {
|
|||
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
final String interesting = null;
|
||||
|
||||
if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
|
||||
if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
|
||||
if( titvStats == null ) { titvStats = new TiTvStats(); }
|
||||
titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval));
|
||||
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ public class AlleleCount extends VariantStratifier {
|
|||
if (eval != null) {
|
||||
int AC = -1;
|
||||
if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) {
|
||||
AC = eval.getAttributeAsInt("AC");
|
||||
AC = eval.getAttributeAsInt("AC", 0);
|
||||
} else if ( eval.isVariant() ) {
|
||||
for (Allele allele : eval.getAlternateAlleles())
|
||||
AC = Math.max(AC, eval.getChromosomeCount(allele));
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier {
|
|||
|
||||
if (eval != null) {
|
||||
try {
|
||||
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3))));
|
||||
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3))));
|
||||
} catch (Exception e) {
|
||||
return relevantStates;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -90,8 +90,8 @@ public class Degeneracy extends VariantStratifier {
|
|||
Integer frame = null;
|
||||
|
||||
if (eval.hasAttribute("refseq.functionalClass")) {
|
||||
aa = eval.getAttributeAsString("refseq.variantAA");
|
||||
frame = eval.getAttributeAsInt("refseq.frame");
|
||||
aa = eval.getAttributeAsString("refseq.variantAA", null);
|
||||
frame = eval.getAttributeAsInt("refseq.frame", 0);
|
||||
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
|
||||
int annotationId = 1;
|
||||
String key;
|
||||
|
|
@ -99,7 +99,7 @@ public class Degeneracy extends VariantStratifier {
|
|||
do {
|
||||
key = String.format("refseq.functionalClass_%d", annotationId);
|
||||
|
||||
String newtype = eval.getAttributeAsString(key);
|
||||
String newtype = eval.getAttributeAsString(key, null);
|
||||
|
||||
if ( newtype != null &&
|
||||
( type == null ||
|
||||
|
|
@ -109,13 +109,13 @@ public class Degeneracy extends VariantStratifier {
|
|||
type = newtype;
|
||||
|
||||
String aakey = String.format("refseq.variantAA_%d", annotationId);
|
||||
aa = eval.getAttributeAsString(aakey);
|
||||
aa = eval.getAttributeAsString(aakey, null);
|
||||
|
||||
if (aa != null) {
|
||||
String framekey = String.format("refseq.frame_%d", annotationId);
|
||||
|
||||
if (eval.hasAttribute(framekey)) {
|
||||
frame = eval.getAttributeAsInt(framekey);
|
||||
frame = eval.getAttributeAsInt(framekey, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
|
|||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -11,25 +12,34 @@ import java.util.List;
|
|||
* Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation.
|
||||
*/
|
||||
public class FunctionalClass extends VariantStratifier {
|
||||
@Override
|
||||
public void initialize() {
|
||||
states.add("all");
|
||||
states.add("silent");
|
||||
states.add("missense");
|
||||
states.add("nonsense");
|
||||
|
||||
public enum FunctionalType {
|
||||
silent,
|
||||
missense,
|
||||
nonsense
|
||||
}
|
||||
|
||||
|
||||
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
|
||||
@Override
|
||||
public void initialize() {
|
||||
states.add("all");
|
||||
for ( FunctionalType type : FunctionalType.values() )
|
||||
states.add(type.name());
|
||||
}
|
||||
|
||||
|
||||
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
|
||||
ArrayList<String> relevantStates = new ArrayList<String>();
|
||||
|
||||
relevantStates.add("all");
|
||||
|
||||
if (eval != null && eval.isVariant()) {
|
||||
String type = null;
|
||||
FunctionalType type = null;
|
||||
|
||||
if (eval.hasAttribute("refseq.functionalClass")) {
|
||||
type = eval.getAttributeAsString("refseq.functionalClass");
|
||||
try {
|
||||
type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass", null));
|
||||
} catch ( Exception e ) {} // don't error out if the type isn't supported
|
||||
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
|
||||
int annotationId = 1;
|
||||
String key;
|
||||
|
|
@ -37,24 +47,33 @@ public class FunctionalClass extends VariantStratifier {
|
|||
do {
|
||||
key = String.format("refseq.functionalClass_%d", annotationId);
|
||||
|
||||
String newtype = eval.getAttributeAsString(key);
|
||||
|
||||
if ( newtype != null && !newtype.equalsIgnoreCase("null") &&
|
||||
( type == null ||
|
||||
( type.equals("silent") && !newtype.equals("silent") ) ||
|
||||
( type.equals("missense") && newtype.equals("nonsense") ) )
|
||||
) {
|
||||
type = newtype;
|
||||
String newtypeStr = eval.getAttributeAsString(key, null);
|
||||
if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) {
|
||||
try {
|
||||
FunctionalType newType = FunctionalType.valueOf(newtypeStr);
|
||||
if ( type == null ||
|
||||
( type == FunctionalType.silent && newType != FunctionalType.silent ) ||
|
||||
( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) {
|
||||
type = newType;
|
||||
}
|
||||
} catch ( Exception e ) {} // don't error out if the type isn't supported
|
||||
}
|
||||
|
||||
annotationId++;
|
||||
} while (eval.hasAttribute(key));
|
||||
|
||||
} else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName() ) ) {
|
||||
SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName()).toString());
|
||||
if ( snpEffType == SnpEff.EffectType.STOP_GAINED )
|
||||
type = FunctionalType.nonsense;
|
||||
else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING )
|
||||
type = FunctionalType.missense;
|
||||
else if ( snpEffType == SnpEff.EffectType.SYNONYMOUS_CODING )
|
||||
type = FunctionalType.silent;
|
||||
}
|
||||
|
||||
if (type != null) {
|
||||
if (type.equals("silent")) { relevantStates.add("silent"); }
|
||||
else if (type.equals("missense")) { relevantStates.add("missense"); }
|
||||
else if (type.equals("nonsense")) { relevantStates.add("nonsense"); }
|
||||
if ( type != null ) {
|
||||
relevantStates.add(type.name());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -277,7 +277,7 @@ public class VariantEvalUtils {
|
|||
* @return a new VariantContext with just the requested samples
|
||||
*/
|
||||
public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection<String> sampleNames) {
|
||||
VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values());
|
||||
VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles());
|
||||
|
||||
HashMap<String, Object> newAts = new HashMap<String, Object>(vcsub.getAttributes());
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 3/12/11
|
||||
*/
|
||||
|
||||
public class TrainingSet {
|
||||
|
||||
public RodBinding<VariantContext> rodBinding;
|
||||
public boolean isKnown = false;
|
||||
public boolean isTraining = false;
|
||||
public boolean isAntiTraining = false;
|
||||
public boolean isTruth = false;
|
||||
public boolean isConsensus = false;
|
||||
public double prior = 0.0;
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(TrainingSet.class);
|
||||
|
||||
public TrainingSet( final RodBinding<VariantContext> rodBinding) {
|
||||
this.rodBinding = rodBinding;
|
||||
|
||||
final Tags tags = rodBinding.getTags();
|
||||
final String name = rodBinding.getName();
|
||||
|
||||
// Parse the tags to decide which tracks have which properties
|
||||
if( tags != null ) {
|
||||
isKnown = tags.containsKey("known") && tags.getValue("known").equals("true");
|
||||
isTraining = tags.containsKey("training") && tags.getValue("training").equals("true");
|
||||
isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true");
|
||||
isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true");
|
||||
isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true");
|
||||
prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior );
|
||||
}
|
||||
|
||||
// Report back to the user which tracks were found and the properties that were detected
|
||||
if( !isConsensus && !isAntiTraining ) {
|
||||
logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) );
|
||||
} else if( isConsensus ) {
|
||||
logger.info( String.format( "Found consensus track: %s", name) );
|
||||
} else {
|
||||
logger.info( String.format( "Found bad sites training track: %s", name) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -115,7 +115,7 @@ public class VQSRCalibrationCurve {
|
|||
if ( vc.isFiltered() )
|
||||
return 0.0;
|
||||
else if ( vc.hasAttribute(VQSRQualKey) ) {
|
||||
double qual = vc.getAttributeAsDouble(VQSRQualKey);
|
||||
double qual = vc.getAttributeAsDouble(VQSRQualKey, 0.0);
|
||||
return probTrueVariant(qual);
|
||||
} else {
|
||||
throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc);
|
||||
|
|
@ -143,7 +143,7 @@ public class VQSRCalibrationCurve {
|
|||
for ( int i = 0; i < log10Likelihoods.length; i++) {
|
||||
double p = Math.pow(10, log10Likelihoods[i]);
|
||||
double q = alpha * p + (1-alpha) * noInfoPr;
|
||||
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q);
|
||||
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey, 0.0), p, alpha, q);
|
||||
updated[i] = Math.log10(q);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -51,10 +51,10 @@ public class VariantDataManager {
|
|||
private ExpandingArrayList<VariantDatum> data;
|
||||
private final double[] meanVector;
|
||||
private final double[] varianceVector; // this is really the standard deviation
|
||||
public final ArrayList<String> annotationKeys;
|
||||
public final List<String> annotationKeys;
|
||||
private final VariantRecalibratorArgumentCollection VRAC;
|
||||
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
|
||||
|
||||
protected final List<TrainingSet> trainingSets;
|
||||
|
||||
public VariantDataManager( final List<String> annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) {
|
||||
this.data = null;
|
||||
|
|
@ -62,6 +62,7 @@ public class VariantDataManager {
|
|||
this.VRAC = VRAC;
|
||||
meanVector = new double[this.annotationKeys.size()];
|
||||
varianceVector = new double[this.annotationKeys.size()];
|
||||
trainingSets = new ArrayList<TrainingSet>();
|
||||
}
|
||||
|
||||
public void setData( final ExpandingArrayList<VariantDatum> data ) {
|
||||
|
|
@ -104,6 +105,31 @@ public class VariantDataManager {
|
|||
}
|
||||
}
|
||||
|
||||
public void addTrainingSet( final TrainingSet trainingSet ) {
|
||||
trainingSets.add( trainingSet );
|
||||
}
|
||||
|
||||
public boolean checkHasTrainingSet() {
|
||||
for( final TrainingSet trainingSet : trainingSets ) {
|
||||
if( trainingSet.isTraining ) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean checkHasTruthSet() {
|
||||
for( final TrainingSet trainingSet : trainingSets ) {
|
||||
if( trainingSet.isTruth ) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean checkHasKnownSet() {
|
||||
for( final TrainingSet trainingSet : trainingSets ) {
|
||||
if( trainingSet.isKnown ) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> getTrainingData() {
|
||||
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<VariantDatum>();
|
||||
for( final VariantDatum datum : data ) {
|
||||
|
|
@ -232,57 +258,35 @@ public class VariantDataManager {
|
|||
return value;
|
||||
}
|
||||
|
||||
public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap<String, Double> rodToPriorMap,
|
||||
final List<RodBinding<VariantContext>> training, final List<RodBinding<VariantContext>> truth, final List<RodBinding<VariantContext>> known, final List<RodBinding<VariantContext>> badSites, final List<RodBinding<VariantContext>> resource) {
|
||||
public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) {
|
||||
datum.isKnown = false;
|
||||
datum.atTruthSite = false;
|
||||
datum.atTrainingSite = false;
|
||||
datum.atAntiTrainingSite = false;
|
||||
datum.prior = 2.0;
|
||||
|
||||
//BUGBUG: need to clean this up
|
||||
|
||||
for( final RodBinding<VariantContext> rod : training ) {
|
||||
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
|
||||
for( final TrainingSet trainingSet : trainingSets ) {
|
||||
for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) {
|
||||
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
|
||||
datum.atTrainingSite = true;
|
||||
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
|
||||
datum.isKnown = datum.isKnown || trainingSet.isKnown;
|
||||
datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth;
|
||||
datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining;
|
||||
datum.prior = Math.max( datum.prior, trainingSet.prior );
|
||||
datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 );
|
||||
}
|
||||
}
|
||||
}
|
||||
for( final RodBinding<VariantContext> rod : truth ) {
|
||||
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
|
||||
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
|
||||
datum.atTruthSite = true;
|
||||
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
|
||||
}
|
||||
}
|
||||
}
|
||||
for( final RodBinding<VariantContext> rod : known ) {
|
||||
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
|
||||
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
|
||||
datum.isKnown = true;
|
||||
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
|
||||
}
|
||||
}
|
||||
}
|
||||
for( final RodBinding<VariantContext> rod : resource ) {
|
||||
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
|
||||
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
|
||||
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
|
||||
}
|
||||
}
|
||||
}
|
||||
for( final RodBinding<VariantContext> rod : badSites ) {
|
||||
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
|
||||
if( trainVC != null ) {
|
||||
datum.atAntiTrainingSite = true;
|
||||
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
|
||||
datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
|
||||
return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
|
||||
((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
|
||||
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
|
||||
}
|
||||
|
||||
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
|
||||
for( final VariantDatum datum : data ) {
|
||||
RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
|
||||
|
|
@ -290,10 +294,4 @@ public class VariantDataManager {
|
|||
(datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
|
||||
return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
|
||||
((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
|
||||
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -77,16 +77,15 @@ import java.util.*;
|
|||
* <p>
|
||||
* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <h2>Example</h2>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -T VariantRecalibrator \
|
||||
* -R reference/human_g1k_v37.fasta \
|
||||
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
|
||||
* -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
|
||||
* -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
|
||||
* -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
|
||||
* -known:prior=8.0 dbsnp_132.b37.vcf \
|
||||
* -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
|
||||
* -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
|
||||
* -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
|
||||
* -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
|
||||
* -recalFile path/to/output.recal \
|
||||
* -tranchesFile path/to/output.tranches \
|
||||
|
|
@ -112,34 +111,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
public List<RodBinding<VariantContext>> input;
|
||||
|
||||
/**
|
||||
* Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
|
||||
*/
|
||||
@Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
|
||||
public List<RodBinding<VariantContext>> training;
|
||||
|
||||
/**
|
||||
* When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
|
||||
* Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
|
||||
*/
|
||||
@Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
|
||||
public List<RodBinding<VariantContext>> truth;
|
||||
|
||||
/**
|
||||
* The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
|
||||
* The output metrics are stratified by known status in order to aid in comparisons with other call sets.
|
||||
*/
|
||||
@Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
|
||||
public List<RodBinding<VariantContext>> known = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
|
||||
* with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
|
||||
*/
|
||||
@Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
|
||||
public List<RodBinding<VariantContext>> badSites = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
|
||||
* Any set of VCF files to use as lists of training, truth, or known sites.
|
||||
* Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
|
||||
* Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
|
||||
* Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
|
||||
* Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
|
||||
*/
|
||||
@Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
|
||||
public List<RodBinding<VariantContext>> resource = Collections.emptyList();
|
||||
|
|
@ -205,7 +181,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private PrintStream tranchesStream;
|
||||
private final Set<String> ignoreInputFilterSet = new TreeSet<String>();
|
||||
private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
|
||||
private final HashMap<String, Double> rodToPriorMap = new HashMap<String, Double>();
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -227,18 +202,15 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
|
||||
}
|
||||
|
||||
final ArrayList<RodBinding<VariantContext>> allInputBindings = new ArrayList<RodBinding<VariantContext>>();
|
||||
allInputBindings.addAll(truth);
|
||||
allInputBindings.addAll(training);
|
||||
allInputBindings.addAll(known);
|
||||
allInputBindings.addAll(badSites);
|
||||
allInputBindings.addAll(resource);
|
||||
for( final RodBinding<VariantContext> rod : allInputBindings ) {
|
||||
try {
|
||||
rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
|
||||
} catch( NumberFormatException e ) {
|
||||
throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
|
||||
}
|
||||
for( RodBinding<VariantContext> rod : resource ) {
|
||||
dataManager.addTrainingSet( new TrainingSet( rod ) );
|
||||
}
|
||||
|
||||
if( !dataManager.checkHasTrainingSet() ) {
|
||||
throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
|
||||
}
|
||||
if( !dataManager.checkHasTruthSet() ) {
|
||||
throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -270,7 +242,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
datum.isTransition = datum.isSNP && VariantContextUtils.isTransition(vc);
|
||||
|
||||
// Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
|
||||
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC, rodToPriorMap, training, truth, known, badSites, resource ); // BUGBUG: need to clean this up to be a class, not a list of all the rod bindings
|
||||
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC );
|
||||
double priorFactor = QualityUtils.qualToProb( datum.prior );
|
||||
//if( PERFORM_PROJECT_CONSENSUS ) { // BUGBUG: need to resurrect this functionality?
|
||||
// final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );
|
||||
|
|
|
|||
|
|
@ -234,16 +234,47 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
|
||||
return 0;
|
||||
|
||||
List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
|
||||
List<VariantContext> preMergedVCs = new ArrayList<VariantContext>();
|
||||
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
|
||||
// iterate over the types so that it's deterministic
|
||||
for ( VariantContext.Type type : VariantContext.Type.values() ) {
|
||||
if ( VCsByType.containsKey(type) )
|
||||
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
|
||||
preMergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
|
||||
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
|
||||
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
|
||||
}
|
||||
|
||||
List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
|
||||
// se have records merged but separated by type. If a particular record is for example a snp but all alleles are a subset of an existing mixed record,
|
||||
// we will still merge those records.
|
||||
if (preMergedVCs.size() > 1) {
|
||||
for (VariantContext vc1 : preMergedVCs) {
|
||||
VariantContext newvc = vc1;
|
||||
boolean merged = false;
|
||||
for (int k=0; k < mergedVCs.size(); k++) {
|
||||
VariantContext vc2 = mergedVCs.get(k);
|
||||
|
||||
if (VariantContextUtils.allelesAreSubset(vc1,vc2) || VariantContextUtils.allelesAreSubset(vc2,vc1)) {
|
||||
// all alleles of vc1 are contained in vc2 but they are of different type (say, vc1 is snp, vc2 is complex): try to merget v1 into v2
|
||||
List<VariantContext> vcpair = new ArrayList<VariantContext>();
|
||||
vcpair.add(vc1);
|
||||
vcpair.add(vc2);
|
||||
newvc = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcpair,
|
||||
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
|
||||
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC);
|
||||
mergedVCs.set(k,newvc);
|
||||
merged = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!merged)
|
||||
mergedVCs.add(vc1);
|
||||
}
|
||||
}
|
||||
else {
|
||||
mergedVCs = preMergedVCs;
|
||||
}
|
||||
|
||||
for ( VariantContext mergedVC : mergedVCs ) {
|
||||
// only operate at the start of events
|
||||
if ( mergedVC == null )
|
||||
|
|
|
|||
|
|
@ -99,7 +99,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
|
||||
writer = new StandardVCFWriter(file, false);
|
||||
writer = new StandardVCFWriter(file, getMasterSequenceDictionary(), false);
|
||||
writer.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ public class RandomlySplitVariants extends RodWalker<Integer, Integer> {
|
|||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
|
||||
vcfWriter1.writeHeader(new VCFHeader(hInfo, samples));
|
||||
vcfWriter2 = new StandardVCFWriter(file2, true);
|
||||
vcfWriter2 = new StandardVCFWriter(file2, getMasterSequenceDictionary(), true);
|
||||
vcfWriter2.writeHeader(new VCFHeader(hInfo, samples));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -145,10 +145,9 @@ import java.util.*;
|
|||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf \
|
||||
* -SM family.yaml \
|
||||
* -family NA12891+NA12892=NA12878 \
|
||||
* -mvq 50
|
||||
* -mvq 50 \
|
||||
* -o violations.vcf
|
||||
*
|
||||
* Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
|
|
@ -265,17 +264,17 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
private File AF_FILE = new File("");
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||
@Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false)
|
||||
private File FAMILY_STRUCTURE_FILE = null;
|
||||
|
||||
/**
|
||||
* String formatted as dad+mom=child where these parameters determine which sample names are examined.
|
||||
*/
|
||||
@Argument(fullName="family_structure", shortName="family", doc="Deprecated; use the -SM argument instead", required=false)
|
||||
@Argument(fullName="family_structure", shortName="family", doc="string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||
private String FAMILY_STRUCTURE = "";
|
||||
|
||||
/**
|
||||
* Sample metadata information will be taken from a YAML file (see the -SM argument).
|
||||
* This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure.
|
||||
*/
|
||||
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false)
|
||||
private Boolean MENDELIAN_VIOLATIONS = false;
|
||||
|
|
@ -306,7 +305,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||
@Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false)
|
||||
private String outMVFile = null;
|
||||
|
||||
/* Private class used to store the intermediate variants in the integer random selection process */
|
||||
|
|
@ -575,7 +574,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
// ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
|
||||
// We then pick a variant with probablity AF*desiredFraction
|
||||
if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
|
||||
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
|
||||
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
|
||||
|
||||
double af;
|
||||
double afBoost = 1.0;
|
||||
|
|
|
|||
|
|
@ -56,11 +56,6 @@ import java.util.Set;
|
|||
* A variant set to filter.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A filtered VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
|
||||
* Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes)
|
||||
*
|
||||
* <p>
|
||||
* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes).
|
||||
|
|
@ -57,7 +57,16 @@ import java.util.*;
|
|||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* An annotated VCF.
|
||||
* An annotated VCF. Additionally, a table like the following will be output:
|
||||
* <pre>
|
||||
* Total number of samples assayed: 185
|
||||
* Total number of records processed: 152
|
||||
* Number of Hardy-Weinberg violations: 34 (22%)
|
||||
* Number of no-call violations: 12 (7%)
|
||||
* Number of homozygous variant violations: 0 (0%)
|
||||
* Number of records passing all filters: 106 (69%)
|
||||
* Number of passing records that are polymorphic: 98 (92%)
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
|
|
|
|||
|
|
@ -192,7 +192,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
if ( getters.containsKey(field) ) {
|
||||
val = getters.get(field).get(vc);
|
||||
} else if ( vc.hasAttribute(field) ) {
|
||||
val = vc.getAttributeAsString(field);
|
||||
val = vc.getAttributeAsString(field, null);
|
||||
} else if ( isWildCard(field) ) {
|
||||
Set<String> wildVals = new HashSet<String>();
|
||||
for ( Map.Entry<String,Object> elt : vc.getAttributes().entrySet()) {
|
||||
|
|
@ -309,6 +309,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
|
||||
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
|
||||
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
|
||||
getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } });
|
||||
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
|
||||
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
|
||||
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
|
||||
|
|
|
|||
|
|
@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return (int)( start << 16 + stop << 4 + contigIndex );
|
||||
return start << 16 | stop << 4 | contigIndex;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1056,42 +1056,30 @@ public class MathUtils {
|
|||
}
|
||||
|
||||
static public double softMax(final double x, final double y) {
|
||||
if (Double.isInfinite(x))
|
||||
return y;
|
||||
// we need to compute log10(10^x + 10^y)
|
||||
// By Jacobian logarithm identity, this is equal to
|
||||
// max(x,y) + log10(1+10^-abs(x-y))
|
||||
// we compute the second term as a table lookup
|
||||
// with integer quantization
|
||||
|
||||
if (Double.isInfinite(y))
|
||||
return x;
|
||||
// slow exact version:
|
||||
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
|
||||
|
||||
if (y >= x + MAX_JACOBIAN_TOLERANCE)
|
||||
return y;
|
||||
if (x >= y + MAX_JACOBIAN_TOLERANCE)
|
||||
return x;
|
||||
double diff = x-y;
|
||||
|
||||
// OK, so |y-x| < tol: we use the following identity then:
|
||||
// we need to compute log10(10^x + 10^y)
|
||||
// By Jacobian logarithm identity, this is equal to
|
||||
// max(x,y) + log10(1+10^-abs(x-y))
|
||||
// we compute the second term as a table lookup
|
||||
// with integer quantization
|
||||
|
||||
//double diff = Math.abs(x-y);
|
||||
double diff = x-y;
|
||||
double t1 =x;
|
||||
if (diff<0) { //
|
||||
t1 = y;
|
||||
diff= -diff;
|
||||
}
|
||||
// t has max(x,y), diff has abs(x-y)
|
||||
// we have pre-stored correction for 0,0.1,0.2,... 10.0
|
||||
//int ind = (int)Math.round(diff*INV_JACOBIAN_LOG_TABLE_STEP);
|
||||
int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
|
||||
// gdebug+
|
||||
//double z =Math.log10(1+Math.pow(10.0,-diff));
|
||||
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
|
||||
//gdebug-
|
||||
return t1+jacobianLogTable[ind];
|
||||
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
|
||||
}
|
||||
if (diff > MAX_JACOBIAN_TOLERANCE)
|
||||
return x;
|
||||
else if (diff < -MAX_JACOBIAN_TOLERANCE)
|
||||
return y;
|
||||
else if (diff >= 0) {
|
||||
int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
|
||||
return x + jacobianLogTable[ind];
|
||||
}
|
||||
else {
|
||||
int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
|
||||
return y + jacobianLogTable[ind];
|
||||
}
|
||||
}
|
||||
|
||||
public static double phredScaleToProbability (byte q) {
|
||||
return Math.pow(10,(-q)/10.0);
|
||||
|
|
|
|||
|
|
@ -9,14 +9,17 @@ import net.sf.samtools.SAMUtils;
|
|||
* @author Kiran Garimella
|
||||
*/
|
||||
public class QualityUtils {
|
||||
|
||||
public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE;
|
||||
public final static double MIN_REASONABLE_ERROR = 0.0001;
|
||||
public final static byte MAX_REASONABLE_Q_SCORE = 40;
|
||||
public final static byte MIN_USABLE_Q_SCORE = 6;
|
||||
|
||||
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
|
||||
|
||||
private static double qualToErrorProbCache[] = new double[256];
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw((byte)i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Private constructor. No instantiating this class!
|
||||
*/
|
||||
|
|
@ -33,10 +36,6 @@ public class QualityUtils {
|
|||
return 1.0 - qualToErrorProb(qual);
|
||||
}
|
||||
|
||||
static public double qualToProb(int qual) {
|
||||
return qualToProb( (double)qual );
|
||||
}
|
||||
|
||||
static public double qualToProb(double qual) {
|
||||
return 1.0 - Math.pow(10.0, qual/(-10.0));
|
||||
}
|
||||
|
|
@ -48,10 +47,14 @@ public class QualityUtils {
|
|||
* @param qual a quality score (0-40)
|
||||
* @return a probability (0.0-1.0)
|
||||
*/
|
||||
static public double qualToErrorProb(byte qual) {
|
||||
static public double qualToErrorProbRaw(byte qual) {
|
||||
return Math.pow(10.0, ((double) qual)/-10.0);
|
||||
}
|
||||
|
||||
static public double qualToErrorProb(byte qual) {
|
||||
return qualToErrorProbCache[qual];
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a probability to a quality score. Note, this is capped at Q40.
|
||||
*
|
||||
|
|
@ -110,88 +113,4 @@ public class QualityUtils {
|
|||
//return (byte) Math.min(qual, maxQual);
|
||||
return (byte) Math.max(Math.min(qual, maxQual), 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field.
|
||||
* Note: the highest probability this function can encode is 64%, so this function should only never be used on the best base hypothesis.
|
||||
* Another note: the probability encoded here gets rounded to the nearest 1%.
|
||||
*
|
||||
* @param baseIndex the base index
|
||||
* @param prob the base probability
|
||||
* @return a byte containing the index and the probability
|
||||
*/
|
||||
static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) {
|
||||
byte compressedQual = 0;
|
||||
|
||||
compressedQual = (byte) baseIndex;
|
||||
|
||||
byte cprob = (byte) (100.0*prob);
|
||||
byte qualmask = (byte) 252;
|
||||
compressedQual += ((cprob << 2) & qualmask);
|
||||
|
||||
return compressedQual;
|
||||
}
|
||||
|
||||
/**
|
||||
* From a compressed base, extract the base index (0:A, 1:C, 2:G, 3:T)
|
||||
*
|
||||
* @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
|
||||
* @return base index
|
||||
*/
|
||||
static public int compressedQualityToBaseIndex(byte compressedQual) {
|
||||
return (int) (compressedQual & 0x3);
|
||||
}
|
||||
|
||||
/**
|
||||
* From a compressed base, extract the base probability
|
||||
*
|
||||
* @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
|
||||
* @return the probability
|
||||
*/
|
||||
static public double compressedQualityToProb(byte compressedQual) {
|
||||
// Because java natives are signed, extra care must be taken to avoid
|
||||
// shifting a 1 into the sign bit in the implicit promotion of 2 to an int.
|
||||
int x2 = ((int) compressedQual) & 0xff;
|
||||
x2 = (x2 >>> 2);
|
||||
|
||||
return ((double) x2)/100.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the complement of a compressed quality
|
||||
*
|
||||
* @param compressedQual the compressed quality score (as returned by baseAndProbToCompressedQuality)
|
||||
* @return the complementary compressed quality
|
||||
*/
|
||||
static public byte complementCompressedQuality(byte compressedQual) {
|
||||
int baseIndex = compressedQualityToBaseIndex(compressedQual);
|
||||
double prob = compressedQualityToProb(compressedQual);
|
||||
|
||||
return baseAndProbToCompressedQuality(BaseUtils.complementIndex(baseIndex), prob);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the reverse complement of a byte array of compressed qualities
|
||||
*
|
||||
* @param compressedQuals a byte array of compressed quality scores
|
||||
* @return the reverse complement of the byte array
|
||||
*/
|
||||
static public byte[] reverseComplementCompressedQualityArray(byte[] compressedQuals) {
|
||||
byte[] rcCompressedQuals = new byte[compressedQuals.length];
|
||||
|
||||
for (int pos = 0; pos < compressedQuals.length; pos++) {
|
||||
rcCompressedQuals[compressedQuals.length - pos - 1] = complementCompressedQuality(compressedQuals[pos]);
|
||||
}
|
||||
|
||||
return rcCompressedQuals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the reverse of a byte array of qualities (compressed or otherwise)
|
||||
* @param quals the array of bytes to be reversed
|
||||
* @return the reverse of the quality array
|
||||
*/
|
||||
static public byte[] reverseQualityArray( byte[] quals ) {
|
||||
return Utils.reverse(quals); // no sense in duplicating functionality
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -190,11 +190,21 @@ public class SampleUtils {
|
|||
|
||||
}
|
||||
|
||||
public static List<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
|
||||
/**
|
||||
* Returns a new set of samples, containing a final list of samples expanded from sampleArgs
|
||||
*
|
||||
* Each element E of sampleArgs can either be a literal sample name or a file. For each E,
|
||||
* we try to read a file named E from disk, and if possible all lines from that file are expanded
|
||||
* into unique sample names.
|
||||
*
|
||||
* @param sampleArgs
|
||||
* @return
|
||||
*/
|
||||
public static Set<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
|
||||
if (sampleArgs != null) {
|
||||
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
|
||||
// sample list set, and treat the entries as if they had been specified on the command line.
|
||||
List<String> samplesFromFiles = new ArrayList<String>();
|
||||
Set<String> samplesFromFiles = new HashSet<String>();
|
||||
for (String SAMPLE_EXPRESSION : sampleArgs) {
|
||||
File sampleFile = new File(SAMPLE_EXPRESSION);
|
||||
|
||||
|
|
@ -203,7 +213,7 @@ public class SampleUtils {
|
|||
|
||||
List<String> lines = reader.readLines();
|
||||
for (String line : lines) {
|
||||
samplesFromFiles.add(line);
|
||||
samplesFromFiles.add(line.trim());
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
|
||||
|
|
@ -212,7 +222,8 @@ public class SampleUtils {
|
|||
|
||||
return samplesFromFiles;
|
||||
}
|
||||
return new ArrayList<String>();
|
||||
|
||||
return new HashSet<String>();
|
||||
}
|
||||
|
||||
public static Set<String> getSamplesFromCommandLineInput(Collection<String> vcfSamples, Collection<String> sampleExpressions) {
|
||||
|
|
|
|||
|
|
@ -12,19 +12,35 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* TODO FOR CHRIS HARTL
|
||||
* Allows for reading in RefSeq information
|
||||
*
|
||||
* <p>
|
||||
* Codec Description
|
||||
* Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop,
|
||||
* strandedness of transcription.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* See also: link to file specification
|
||||
* Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here
|
||||
* <a href="http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq">http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq</a>
|
||||
* </p>
|
||||
* <h2> Usage </h2>
|
||||
* The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example
|
||||
* <pre>
|
||||
* -refSeqBinding:REFSEQ /path/to/refSeq.txt
|
||||
* </pre>
|
||||
*
|
||||
* You will need to consult individual walkers for the binding name ("refSeqBinding", above)
|
||||
*
|
||||
* <h2>File format example</h2>
|
||||
* If you want to define your own file for use, the format is (tab delimited):
|
||||
* bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete)
|
||||
* and exon frames, for example:
|
||||
* <pre>
|
||||
* 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
|
||||
* </pre>
|
||||
* for more information see <a href="http://skip.ucsc.edu/cgi-bin/hgTables?hgsid=5651&hgta_doSchemaDb=mm8&hgta_doSchemaTable=refGene">here</a>
|
||||
* <p>
|
||||
* A BAM file containing <b>exactly one sample</b>.
|
||||
*
|
||||
* </p>
|
||||
*
|
||||
* @author Mark DePristo
|
||||
|
|
|
|||
|
|
@ -1,282 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.snpEff;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broadinstitute.sting.gatk.refdata.SelfScopingFeatureCodec;
|
||||
|
||||
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
|
||||
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
|
||||
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Codec for decoding the output format of the SnpEff variant effect predictor tool
|
||||
*
|
||||
* <p>
|
||||
* This format has 23 tab-delimited fields:
|
||||
*
|
||||
* <pre>
|
||||
* Chromosome
|
||||
* Position
|
||||
* Reference
|
||||
* Change
|
||||
* Change Type: {SNP, MNP, INS, DEL}
|
||||
* Zygosity: {Hom, Het}
|
||||
* Quality
|
||||
* Coverage
|
||||
* Warnings
|
||||
* Gene ID
|
||||
* Gene Name
|
||||
* Bio Type
|
||||
* Transcript ID
|
||||
* Exon ID
|
||||
* Exon Rank
|
||||
* Effect
|
||||
* Old/New Amino Acid
|
||||
* Old/New Codon
|
||||
* Codon Num
|
||||
* CDS Size
|
||||
* Codons Around
|
||||
* Amino Acids Around
|
||||
* Custom Interval ID
|
||||
* </pre>
|
||||
* Note that we treat all except the Chromosome, Position, and Effect fields as optional.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* See also: @see <a href="http://snpeff.sourceforge.net/">SNPEff project page</a>
|
||||
* </p>
|
||||
*
|
||||
* @author David Roazen
|
||||
* @since 2011
|
||||
*/
|
||||
public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec {
|
||||
|
||||
public static final int EXPECTED_NUMBER_OF_FIELDS = 23;
|
||||
public static final String FIELD_DELIMITER_PATTERN = "\\t";
|
||||
public static final String EFFECT_FIELD_DELIMITER_PATTERN = "[,:]";
|
||||
public static final String HEADER_LINE_START = "# ";
|
||||
public static final String[] HEADER_FIELD_NAMES = { "Chromo",
|
||||
"Position",
|
||||
"Reference",
|
||||
"Change",
|
||||
"Change type",
|
||||
"Homozygous",
|
||||
"Quality",
|
||||
"Coverage",
|
||||
"Warnings",
|
||||
"Gene_ID",
|
||||
"Gene_name",
|
||||
"Bio_type",
|
||||
"Trancript_ID", // yes, this is how it's spelled in the SnpEff output
|
||||
"Exon_ID",
|
||||
"Exon_Rank",
|
||||
"Effect",
|
||||
"old_AA/new_AA",
|
||||
"Old_codon/New_codon",
|
||||
"Codon_Num(CDS)",
|
||||
"CDS_size",
|
||||
"Codons around",
|
||||
"AAs around",
|
||||
"Custom_interval_ID"
|
||||
};
|
||||
|
||||
// The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line:
|
||||
public static final int[] REQUIRED_FIELDS = { 0, 1, 15 };
|
||||
|
||||
public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE";
|
||||
|
||||
|
||||
public Feature decodeLoc ( String line ) {
|
||||
return decode(line);
|
||||
}
|
||||
|
||||
public Feature decode ( String line ) {
|
||||
String[] tokens = line.split(FIELD_DELIMITER_PATTERN, -1);
|
||||
|
||||
if ( tokens.length != EXPECTED_NUMBER_OF_FIELDS ) {
|
||||
throw new TribbleException.InvalidDecodeLine("Line does not have the expected (" + EXPECTED_NUMBER_OF_FIELDS +
|
||||
") number of fields: found " + tokens.length + " fields.", line);
|
||||
}
|
||||
|
||||
try {
|
||||
trimAllFields(tokens);
|
||||
checkForRequiredFields(tokens, line);
|
||||
|
||||
String contig = tokens[0];
|
||||
long position = Long.parseLong(tokens[1]);
|
||||
|
||||
String reference = tokens[2].isEmpty() ? null : tokens[2];
|
||||
String change = tokens[3].isEmpty() ? null : tokens[3];
|
||||
ChangeType changeType = tokens[4].isEmpty() ? null : ChangeType.valueOf(tokens[4]);
|
||||
Zygosity zygosity = tokens[5].isEmpty() ? null : Zygosity.valueOf(tokens[5]);
|
||||
Double quality = tokens[6].isEmpty() ? null : Double.parseDouble(tokens[6]);
|
||||
Long coverage = tokens[7].isEmpty() ? null : Long.parseLong(tokens[7]);
|
||||
String warnings = tokens[8].isEmpty() ? null : tokens[8];
|
||||
String geneID = tokens[9].isEmpty() ? null : tokens[9];
|
||||
String geneName = tokens[10].isEmpty() ? null : tokens[10];
|
||||
String bioType = tokens[11].isEmpty() ? null : tokens[11];
|
||||
String transcriptID = tokens[12].isEmpty() ? null : tokens[12];
|
||||
String exonID = tokens[13].isEmpty() ? null : tokens[13];
|
||||
Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]);
|
||||
|
||||
boolean isNonCodingGene = isNonCodingGene(tokens[15]);
|
||||
|
||||
// Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present,
|
||||
// otherwise split it into two subfields. We need this limit to prevent the extra effect-related information
|
||||
// in the final field (when present) from being inappropriately tokenized:
|
||||
|
||||
int effectFieldTokenLimit = isNonCodingGene ? 3 : 2;
|
||||
String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit);
|
||||
EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene);
|
||||
String effectExtraInformation = parseEffectExtraInformation(effectFieldTokens, isNonCodingGene);
|
||||
|
||||
String oldAndNewAA = tokens[16].isEmpty() ? null : tokens[16];
|
||||
String oldAndNewCodon = tokens[17].isEmpty() ? null : tokens[17];
|
||||
Integer codonNum = tokens[18].isEmpty() ? null : Integer.parseInt(tokens[18]);
|
||||
Integer cdsSize = tokens[19].isEmpty() ? null : Integer.parseInt(tokens[19]);
|
||||
String codonsAround = tokens[20].isEmpty() ? null : tokens[20];
|
||||
String aasAround = tokens[21].isEmpty() ? null : tokens[21];
|
||||
String customIntervalID = tokens[22].isEmpty() ? null : tokens[22];
|
||||
|
||||
return new SnpEffFeature(contig, position, reference, change, changeType, zygosity, quality, coverage,
|
||||
warnings, geneID, geneName, bioType, transcriptID, exonID, exonRank, isNonCodingGene,
|
||||
effect, effectExtraInformation, oldAndNewAA, oldAndNewCodon, codonNum, cdsSize,
|
||||
codonsAround, aasAround, customIntervalID);
|
||||
}
|
||||
catch ( NumberFormatException e ) {
|
||||
throw new TribbleException.InvalidDecodeLine("Error parsing a numeric field : " + e.getMessage(), line);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
throw new TribbleException.InvalidDecodeLine("Illegal value in field: " + e.getMessage(), line);
|
||||
}
|
||||
}
|
||||
|
||||
private void trimAllFields ( String[] tokens ) {
|
||||
for ( int i = 0; i < tokens.length; i++ ) {
|
||||
tokens[i] = tokens[i].trim();
|
||||
}
|
||||
}
|
||||
|
||||
private void checkForRequiredFields ( String[] tokens, String line ) {
|
||||
for ( int requiredFieldIndex : REQUIRED_FIELDS ) {
|
||||
if ( tokens[requiredFieldIndex].isEmpty() ) {
|
||||
throw new TribbleException.InvalidDecodeLine("Line is missing required field \"" +
|
||||
HEADER_FIELD_NAMES[requiredFieldIndex] + "\"",
|
||||
line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isNonCodingGene ( String effectField ) {
|
||||
return effectField.startsWith(NON_CODING_GENE_FLAG);
|
||||
}
|
||||
|
||||
private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) {
|
||||
String effectName = "";
|
||||
|
||||
// If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield,
|
||||
// otherwise it will be in the first subfield:
|
||||
|
||||
if ( effectFieldTokens.length > 1 && isNonCodingGene ) {
|
||||
effectName = effectFieldTokens[1].trim();
|
||||
}
|
||||
else {
|
||||
effectName = effectFieldTokens[0].trim();
|
||||
}
|
||||
|
||||
return EffectType.valueOf(effectName);
|
||||
}
|
||||
|
||||
private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) {
|
||||
|
||||
// The extra effect-related information, if present, will always be the last subfield:
|
||||
|
||||
if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) {
|
||||
return effectFieldTokens[effectFieldTokens.length - 1].trim();
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public Class<SnpEffFeature> getFeatureType() {
|
||||
return SnpEffFeature.class;
|
||||
}
|
||||
|
||||
public Object readHeader ( LineReader reader ) {
|
||||
String headerLine = "";
|
||||
|
||||
try {
|
||||
headerLine = reader.readLine();
|
||||
}
|
||||
catch ( IOException e ) {
|
||||
throw new TribbleException("Unable to read header line from input file.");
|
||||
}
|
||||
|
||||
validateHeaderLine(headerLine);
|
||||
return headerLine;
|
||||
}
|
||||
|
||||
private void validateHeaderLine ( String headerLine ) {
|
||||
if ( headerLine == null || ! headerLine.startsWith(HEADER_LINE_START) ) {
|
||||
throw new TribbleException.InvalidHeader("Header line does not start with " + HEADER_LINE_START);
|
||||
}
|
||||
|
||||
String[] headerTokens = headerLine.substring(HEADER_LINE_START.length()).split(FIELD_DELIMITER_PATTERN);
|
||||
|
||||
if ( headerTokens.length != EXPECTED_NUMBER_OF_FIELDS ) {
|
||||
throw new TribbleException.InvalidHeader("Header line does not contain headings for the expected number (" +
|
||||
EXPECTED_NUMBER_OF_FIELDS + ") of columns.");
|
||||
}
|
||||
|
||||
for ( int columnIndex = 0; columnIndex < headerTokens.length; columnIndex++ ) {
|
||||
if ( ! HEADER_FIELD_NAMES[columnIndex].equals(headerTokens[columnIndex]) ) {
|
||||
throw new TribbleException.InvalidHeader("Header field #" + columnIndex + ": Expected \"" +
|
||||
HEADER_FIELD_NAMES[columnIndex] + "\" but found \"" +
|
||||
headerTokens[columnIndex] + "\"");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean canDecode ( final File potentialInput ) {
|
||||
try {
|
||||
LineReader reader = new AsciiLineReader(new FileInputStream(potentialInput));
|
||||
readHeader(reader);
|
||||
}
|
||||
catch ( Exception e ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,115 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.snpEff;
|
||||
|
||||
/**
|
||||
* A set of constants associated with the SnpEff codec.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SnpEffConstants {
|
||||
|
||||
// Possible SnpEff biological effects and their associated impacts:
|
||||
public enum EffectType {
|
||||
START_GAINED (EffectImpact.HIGH),
|
||||
START_LOST (EffectImpact.HIGH),
|
||||
EXON_DELETED (EffectImpact.HIGH),
|
||||
FRAME_SHIFT (EffectImpact.HIGH),
|
||||
STOP_GAINED (EffectImpact.HIGH),
|
||||
STOP_LOST (EffectImpact.HIGH),
|
||||
SPLICE_SITE_ACCEPTOR (EffectImpact.HIGH),
|
||||
SPLICE_SITE_DONOR (EffectImpact.HIGH),
|
||||
|
||||
NON_SYNONYMOUS_CODING (EffectImpact.MODERATE),
|
||||
UTR_5_DELETED (EffectImpact.MODERATE),
|
||||
UTR_3_DELETED (EffectImpact.MODERATE),
|
||||
CODON_INSERTION (EffectImpact.MODERATE),
|
||||
CODON_CHANGE_PLUS_CODON_INSERTION (EffectImpact.MODERATE),
|
||||
CODON_DELETION (EffectImpact.MODERATE),
|
||||
CODON_CHANGE_PLUS_CODON_DELETION (EffectImpact.MODERATE),
|
||||
|
||||
NONE (EffectImpact.LOW),
|
||||
CHROMOSOME (EffectImpact.LOW),
|
||||
INTERGENIC (EffectImpact.LOW),
|
||||
UPSTREAM (EffectImpact.LOW),
|
||||
UTR_5_PRIME (EffectImpact.LOW),
|
||||
SYNONYMOUS_START (EffectImpact.LOW),
|
||||
NON_SYNONYMOUS_START (EffectImpact.LOW),
|
||||
CDS (EffectImpact.LOW),
|
||||
GENE (EffectImpact.LOW),
|
||||
TRANSCRIPT (EffectImpact.LOW),
|
||||
EXON (EffectImpact.LOW),
|
||||
SYNONYMOUS_CODING (EffectImpact.LOW),
|
||||
CODON_CHANGE (EffectImpact.LOW),
|
||||
SYNONYMOUS_STOP (EffectImpact.LOW),
|
||||
NON_SYNONYMOUS_STOP (EffectImpact.LOW),
|
||||
INTRON (EffectImpact.LOW),
|
||||
UTR_3_PRIME (EffectImpact.LOW),
|
||||
DOWNSTREAM (EffectImpact.LOW),
|
||||
INTRON_CONSERVED (EffectImpact.LOW),
|
||||
INTERGENIC_CONSERVED (EffectImpact.LOW),
|
||||
CUSTOM (EffectImpact.LOW);
|
||||
|
||||
private final EffectImpact impact;
|
||||
|
||||
EffectType ( EffectImpact impact ) {
|
||||
this.impact = impact;
|
||||
}
|
||||
|
||||
public EffectImpact getImpact() {
|
||||
return impact;
|
||||
}
|
||||
}
|
||||
|
||||
public enum EffectImpact {
|
||||
LOW (1),
|
||||
MODERATE (2),
|
||||
HIGH (3);
|
||||
|
||||
private final int severityRating;
|
||||
|
||||
EffectImpact ( int severityRating ) {
|
||||
this.severityRating = severityRating;
|
||||
}
|
||||
|
||||
public boolean isHigherImpactThan ( EffectImpact other ) {
|
||||
return this.severityRating > other.severityRating;
|
||||
}
|
||||
}
|
||||
|
||||
// The kinds of variants supported by the SnpEff output format:
|
||||
public enum ChangeType {
|
||||
SNP,
|
||||
MNP,
|
||||
INS,
|
||||
DEL
|
||||
}
|
||||
|
||||
// Possible zygosities of SnpEff variants:
|
||||
public enum Zygosity {
|
||||
Hom,
|
||||
Het
|
||||
}
|
||||
}
|
||||
|
|
@ -1,423 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.snpEff;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
|
||||
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact;
|
||||
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
|
||||
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
|
||||
|
||||
/**
|
||||
* Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output.
|
||||
*
|
||||
* Many fields are optional, and missing values are represented by nulls. You should always call the
|
||||
* hasX() method before calling the corresponding getX() method. Required fields can never be null
|
||||
* and do not have a hasX() method.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SnpEffFeature implements Feature {
|
||||
|
||||
private String contig; // REQUIRED FIELD
|
||||
private long position; // REQUIRED FIELD
|
||||
private String reference;
|
||||
private String change;
|
||||
private ChangeType changeType;
|
||||
private Zygosity zygosity;
|
||||
private Double quality;
|
||||
private Long coverage;
|
||||
private String warnings;
|
||||
private String geneID;
|
||||
private String geneName;
|
||||
private String bioType;
|
||||
private String transcriptID;
|
||||
private String exonID;
|
||||
private Integer exonRank;
|
||||
private boolean isNonCodingGene; // REQUIRED FIELD
|
||||
private EffectType effect; // REQUIRED FIELD
|
||||
private String effectExtraInformation;
|
||||
private String oldAndNewAA;
|
||||
private String oldAndNewCodon;
|
||||
private Integer codonNum;
|
||||
private Integer cdsSize;
|
||||
private String codonsAround;
|
||||
private String aasAround;
|
||||
private String customIntervalID;
|
||||
|
||||
public SnpEffFeature ( String contig,
|
||||
long position,
|
||||
String reference,
|
||||
String change,
|
||||
ChangeType changeType,
|
||||
Zygosity zygosity,
|
||||
Double quality,
|
||||
Long coverage,
|
||||
String warnings,
|
||||
String geneID,
|
||||
String geneName,
|
||||
String bioType,
|
||||
String transcriptID,
|
||||
String exonID,
|
||||
Integer exonRank,
|
||||
boolean isNonCodingGene,
|
||||
EffectType effect,
|
||||
String effectExtraInformation,
|
||||
String oldAndNewAA,
|
||||
String oldAndNewCodon,
|
||||
Integer codonNum,
|
||||
Integer cdsSize,
|
||||
String codonsAround,
|
||||
String aasAround,
|
||||
String customIntervalID ) {
|
||||
|
||||
if ( contig == null || effect == null ) {
|
||||
throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields");
|
||||
}
|
||||
|
||||
this.contig = contig;
|
||||
this.position = position;
|
||||
this.reference = reference;
|
||||
this.change = change;
|
||||
this.changeType = changeType;
|
||||
this.zygosity = zygosity;
|
||||
this.quality = quality;
|
||||
this.coverage = coverage;
|
||||
this.warnings = warnings;
|
||||
this.geneID = geneID;
|
||||
this.geneName = geneName;
|
||||
this.bioType = bioType;
|
||||
this.transcriptID = transcriptID;
|
||||
this.exonID = exonID;
|
||||
this.exonRank = exonRank;
|
||||
this.isNonCodingGene = isNonCodingGene;
|
||||
this.effect = effect;
|
||||
this.effectExtraInformation = effectExtraInformation;
|
||||
this.oldAndNewAA = oldAndNewAA;
|
||||
this.oldAndNewCodon = oldAndNewCodon;
|
||||
this.codonNum = codonNum;
|
||||
this.cdsSize = cdsSize;
|
||||
this.codonsAround = codonsAround;
|
||||
this.aasAround = aasAround;
|
||||
this.customIntervalID = customIntervalID;
|
||||
}
|
||||
|
||||
public boolean isHigherImpactThan ( SnpEffFeature other ) {
|
||||
|
||||
// If one effect is in a non-coding gene and the other is not, the effect NOT in the
|
||||
// non-coding gene has higher impact:
|
||||
|
||||
if ( ! isNonCodingGene() && other.isNonCodingGene() ) {
|
||||
return true;
|
||||
}
|
||||
else if ( isNonCodingGene() && ! other.isNonCodingGene() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts
|
||||
// of the effects themselves as defined in the SnpEffConstants class:
|
||||
|
||||
return getEffectImpact().isHigherImpactThan(other.getEffectImpact());
|
||||
}
|
||||
|
||||
public String getChr() {
|
||||
return contig;
|
||||
}
|
||||
|
||||
public int getStart() {
|
||||
return (int)position;
|
||||
}
|
||||
|
||||
public int getEnd() {
|
||||
return (int)position;
|
||||
}
|
||||
|
||||
public boolean hasReference() {
|
||||
return reference != null;
|
||||
}
|
||||
|
||||
public String getReference() {
|
||||
if ( reference == null ) throw new NoSuchElementException("This feature has no reference field");
|
||||
return reference;
|
||||
}
|
||||
|
||||
public boolean hasChange() {
|
||||
return change != null;
|
||||
}
|
||||
|
||||
public String getChange() {
|
||||
if ( change == null ) throw new NoSuchElementException("This feature has no change field");
|
||||
return change;
|
||||
}
|
||||
|
||||
public boolean hasChangeType() {
|
||||
return changeType != null;
|
||||
}
|
||||
|
||||
public ChangeType getChangeType() {
|
||||
if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field");
|
||||
return changeType;
|
||||
}
|
||||
|
||||
public boolean hasZygosity() {
|
||||
return zygosity != null;
|
||||
}
|
||||
|
||||
public Zygosity getZygosity() {
|
||||
if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field");
|
||||
return zygosity;
|
||||
}
|
||||
|
||||
public boolean hasQuality() {
|
||||
return quality != null;
|
||||
}
|
||||
|
||||
public Double getQuality() {
|
||||
if ( quality == null ) throw new NoSuchElementException("This feature has no quality field");
|
||||
return quality;
|
||||
}
|
||||
|
||||
public boolean hasCoverage() {
|
||||
return coverage != null;
|
||||
}
|
||||
|
||||
public Long getCoverage() {
|
||||
if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field");
|
||||
return coverage;
|
||||
}
|
||||
|
||||
public boolean hasWarnings() {
|
||||
return warnings != null;
|
||||
}
|
||||
|
||||
public String getWarnings() {
|
||||
if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field");
|
||||
return warnings;
|
||||
}
|
||||
|
||||
public boolean hasGeneID() {
|
||||
return geneID != null;
|
||||
}
|
||||
|
||||
public String getGeneID() {
|
||||
if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field");
|
||||
return geneID;
|
||||
}
|
||||
|
||||
public boolean hasGeneName() {
|
||||
return geneName != null;
|
||||
}
|
||||
|
||||
public String getGeneName() {
|
||||
if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field");
|
||||
return geneName;
|
||||
}
|
||||
|
||||
public boolean hasBioType() {
|
||||
return bioType != null;
|
||||
}
|
||||
|
||||
public String getBioType() {
|
||||
if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field");
|
||||
return bioType;
|
||||
}
|
||||
|
||||
public boolean hasTranscriptID() {
|
||||
return transcriptID != null;
|
||||
}
|
||||
|
||||
public String getTranscriptID() {
|
||||
if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field");
|
||||
return transcriptID;
|
||||
}
|
||||
|
||||
public boolean hasExonID() {
|
||||
return exonID != null;
|
||||
}
|
||||
|
||||
public String getExonID() {
|
||||
if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field");
|
||||
return exonID;
|
||||
}
|
||||
|
||||
public boolean hasExonRank() {
|
||||
return exonRank != null;
|
||||
}
|
||||
|
||||
public Integer getExonRank() {
|
||||
if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field");
|
||||
return exonRank;
|
||||
}
|
||||
|
||||
public boolean isNonCodingGene() {
|
||||
return isNonCodingGene;
|
||||
}
|
||||
|
||||
public EffectType getEffect() {
|
||||
return effect;
|
||||
}
|
||||
|
||||
public EffectImpact getEffectImpact() {
|
||||
return effect.getImpact();
|
||||
}
|
||||
|
||||
public boolean hasEffectExtraInformation() {
|
||||
return effectExtraInformation != null;
|
||||
}
|
||||
|
||||
public String getEffectExtraInformation() {
|
||||
if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field");
|
||||
return effectExtraInformation;
|
||||
}
|
||||
|
||||
public boolean hasOldAndNewAA() {
|
||||
return oldAndNewAA != null;
|
||||
}
|
||||
|
||||
public String getOldAndNewAA() {
|
||||
if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field");
|
||||
return oldAndNewAA;
|
||||
}
|
||||
|
||||
public boolean hasOldAndNewCodon() {
|
||||
return oldAndNewCodon != null;
|
||||
}
|
||||
|
||||
public String getOldAndNewCodon() {
|
||||
if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field");
|
||||
return oldAndNewCodon;
|
||||
}
|
||||
|
||||
public boolean hasCodonNum() {
|
||||
return codonNum != null;
|
||||
}
|
||||
|
||||
public Integer getCodonNum() {
|
||||
if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field");
|
||||
return codonNum;
|
||||
}
|
||||
|
||||
public boolean hasCdsSize() {
|
||||
return cdsSize != null;
|
||||
}
|
||||
|
||||
public Integer getCdsSize() {
|
||||
if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field");
|
||||
return cdsSize;
|
||||
}
|
||||
|
||||
public boolean hasCodonsAround() {
|
||||
return codonsAround != null;
|
||||
}
|
||||
|
||||
public String getCodonsAround() {
|
||||
if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field");
|
||||
return codonsAround;
|
||||
}
|
||||
|
||||
public boolean hadAasAround() {
|
||||
return aasAround != null;
|
||||
}
|
||||
|
||||
public String getAasAround() {
|
||||
if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field");
|
||||
return aasAround;
|
||||
}
|
||||
|
||||
public boolean hasCustomIntervalID() {
|
||||
return customIntervalID != null;
|
||||
}
|
||||
|
||||
public String getCustomIntervalID() {
|
||||
if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field");
|
||||
return customIntervalID;
|
||||
}
|
||||
|
||||
public boolean equals ( Object o ) {
|
||||
if ( o == null || ! (o instanceof SnpEffFeature) ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
SnpEffFeature other = (SnpEffFeature)o;
|
||||
|
||||
return contig.equals(other.contig) &&
|
||||
position == other.position &&
|
||||
(reference == null ? other.reference == null : reference.equals(other.reference)) &&
|
||||
(change == null ? other.change == null : change.equals(other.change)) &&
|
||||
changeType == other.changeType &&
|
||||
zygosity == other.zygosity &&
|
||||
(quality == null ? other.quality == null : quality.equals(other.quality)) &&
|
||||
(coverage == null ? other.coverage == null : coverage.equals(other.coverage)) &&
|
||||
(warnings == null ? other.warnings == null : warnings.equals(other.warnings)) &&
|
||||
(geneID == null ? other.geneID == null : geneID.equals(other.geneID)) &&
|
||||
(geneName == null ? other.geneName == null : geneName.equals(other.geneName)) &&
|
||||
(bioType == null ? other.bioType == null : bioType.equals(other.bioType)) &&
|
||||
(transcriptID == null ? other.transcriptID == null : transcriptID.equals(other.transcriptID)) &&
|
||||
(exonID == null ? other.exonID == null : exonID.equals(other.exonID)) &&
|
||||
(exonRank == null ? other.exonRank == null : exonRank.equals(other.exonRank)) &&
|
||||
isNonCodingGene == other.isNonCodingGene &&
|
||||
effect == other.effect &&
|
||||
(effectExtraInformation == null ? other.effectExtraInformation == null : effectExtraInformation.equals(other.effectExtraInformation)) &&
|
||||
(oldAndNewAA == null ? other.oldAndNewAA == null : oldAndNewAA.equals(other.oldAndNewAA)) &&
|
||||
(oldAndNewCodon == null ? other.oldAndNewCodon == null : oldAndNewCodon.equals(other.oldAndNewCodon)) &&
|
||||
(codonNum == null ? other.codonNum == null : codonNum.equals(other.codonNum)) &&
|
||||
(cdsSize == null ? other.cdsSize == null : cdsSize.equals(other.cdsSize)) &&
|
||||
(codonsAround == null ? other.codonsAround == null : codonsAround.equals(other.codonsAround)) &&
|
||||
(aasAround == null ? other.aasAround == null : aasAround.equals(other.aasAround)) &&
|
||||
(customIntervalID == null ? other.customIntervalID == null : customIntervalID.equals(other.customIntervalID));
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "[Contig: " + contig +
|
||||
" Position: " + position +
|
||||
" Reference: " + reference +
|
||||
" Change: " + change +
|
||||
" Change Type: " + changeType +
|
||||
" Zygosity: " + zygosity +
|
||||
" Quality: " + quality +
|
||||
" Coverage: " + coverage +
|
||||
" Warnings: " + warnings +
|
||||
" Gene ID: " + geneID +
|
||||
" Gene Name: " + geneName +
|
||||
" Bio Type: " + bioType +
|
||||
" Transcript ID: " + transcriptID +
|
||||
" Exon ID: " + exonID +
|
||||
" Exon Rank: " + exonRank +
|
||||
" Non-Coding Gene: " + isNonCodingGene +
|
||||
" Effect: " + effect +
|
||||
" Effect Extra Information: " + effectExtraInformation +
|
||||
" Old/New AA: " + oldAndNewAA +
|
||||
" Old/New Codon: " + oldAndNewCodon +
|
||||
" Codon Num: " + codonNum +
|
||||
" CDS Size: " + cdsSize +
|
||||
" Codons Around: " + codonsAround +
|
||||
" AAs Around: " + aasAround +
|
||||
" Custom Interval ID: " + customIntervalID +
|
||||
"]";
|
||||
}
|
||||
}
|
||||
|
|
@ -227,7 +227,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
|||
throw new UserException.MalformedVCF(message, lineNo);
|
||||
}
|
||||
|
||||
private static void generateException(String message, int lineNo) {
|
||||
protected static void generateException(String message, int lineNo) {
|
||||
throw new UserException.MalformedVCF(message, lineNo);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,144 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.index.DynamicIndexCreator;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broad.tribble.index.IndexFactory;
|
||||
import org.broad.tribble.util.LittleEndianOutputStream;
|
||||
import org.broad.tribble.util.PositionalStream;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* this class writes VCF files
|
||||
*/
|
||||
public abstract class IndexingVCFWriter implements VCFWriter {
|
||||
final private String name;
|
||||
private final SAMSequenceDictionary refDict;
|
||||
|
||||
private OutputStream outputStream;
|
||||
private PositionalStream positionalStream = null;
|
||||
private DynamicIndexCreator indexer = null;
|
||||
private LittleEndianOutputStream idxStream = null;
|
||||
|
||||
@Requires({"name != null",
|
||||
"! ( location == null && output == null )",
|
||||
"! ( enableOnTheFlyIndexing && location == null )"})
|
||||
protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) {
|
||||
outputStream = output;
|
||||
this.name = name;
|
||||
this.refDict = refDict;
|
||||
|
||||
if ( enableOnTheFlyIndexing ) {
|
||||
try {
|
||||
idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location)));
|
||||
//System.out.println("Creating index on the fly for " + location);
|
||||
indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
|
||||
indexer.initialize(location, indexer.defaultBinSize());
|
||||
positionalStream = new PositionalStream(output);
|
||||
outputStream = positionalStream;
|
||||
} catch ( IOException ex ) {
|
||||
// No matter what we keep going, since we don't care if we can't create the index file
|
||||
idxStream = null;
|
||||
indexer = null;
|
||||
positionalStream = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public OutputStream getOutputStream() {
|
||||
return outputStream;
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public String getStreamName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public abstract void writeHeader(VCFHeader header);
|
||||
|
||||
/**
|
||||
* attempt to close the VCF file
|
||||
*/
|
||||
public void close() {
|
||||
// try to close the index stream (keep it separate to help debugging efforts)
|
||||
if ( indexer != null ) {
|
||||
try {
|
||||
Index index = indexer.finalizeIndex(positionalStream.getPosition());
|
||||
IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict);
|
||||
index.write(idxStream);
|
||||
idxStream.close();
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Unable to close index for " + getStreamName(), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* add a record to the file
|
||||
*
|
||||
* @param vc the Variant Context object
|
||||
*/
|
||||
public void add(VariantContext vc) {
|
||||
// if we are doing on the fly indexing, add the record ***before*** we write any bytes
|
||||
if ( indexer != null )
|
||||
indexer.addFeature(vc, positionalStream.getPosition());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a reasonable "name" for this writer, to display to the user if something goes wrong
|
||||
*
|
||||
* @param location
|
||||
* @param stream
|
||||
* @return
|
||||
*/
|
||||
protected static final String writerName(final File location, final OutputStream stream) {
|
||||
return location == null ? stream.toString() : location.getAbsolutePath();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a output stream writing to location, or throws a UserException if this fails
|
||||
* @param location
|
||||
* @return
|
||||
*/
|
||||
protected static OutputStream openOutputStream(final File location) {
|
||||
try {
|
||||
return new FileOutputStream(location);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.index.DynamicIndexCreator;
|
||||
|
|
@ -44,46 +45,30 @@ import java.util.*;
|
|||
/**
|
||||
* this class writes VCF files
|
||||
*/
|
||||
public class StandardVCFWriter implements VCFWriter {
|
||||
public class StandardVCFWriter extends IndexingVCFWriter {
|
||||
// the print stream we're writing to
|
||||
final protected BufferedWriter mWriter;
|
||||
|
||||
// should we write genotypes or just sites?
|
||||
final protected boolean doNotWriteGenotypes;
|
||||
|
||||
// the VCF header we're storing
|
||||
protected VCFHeader mHeader = null;
|
||||
|
||||
// the print stream we're writing to
|
||||
protected BufferedWriter mWriter;
|
||||
protected PositionalStream positionalStream = null;
|
||||
|
||||
// were filters applied?
|
||||
protected boolean filtersWereAppliedToContext = false;
|
||||
|
||||
// should we write genotypes or just sites?
|
||||
protected boolean doNotWriteGenotypes = false;
|
||||
|
||||
protected DynamicIndexCreator indexer = null;
|
||||
protected File indexFile = null;
|
||||
LittleEndianOutputStream idxStream = null;
|
||||
File location = null;
|
||||
|
||||
/**
|
||||
* create a VCF writer, given a file to write to
|
||||
*
|
||||
* @param location the file location to write to
|
||||
*/
|
||||
public StandardVCFWriter(File location) {
|
||||
this(location, openOutputStream(location), true, false);
|
||||
public StandardVCFWriter(final File location, final SAMSequenceDictionary refDict) {
|
||||
this(location, openOutputStream(location), refDict, true, false);
|
||||
}
|
||||
|
||||
public StandardVCFWriter(File location, boolean enableOnTheFlyIndexing) {
|
||||
this(location, openOutputStream(location), enableOnTheFlyIndexing, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF writer, given a stream to write to
|
||||
*
|
||||
* @param output the file location to write to
|
||||
*/
|
||||
public StandardVCFWriter(OutputStream output) {
|
||||
this(output, false);
|
||||
public StandardVCFWriter(File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing) {
|
||||
this(location, openOutputStream(location), refDict, enableOnTheFlyIndexing, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -92,33 +77,23 @@ public class StandardVCFWriter implements VCFWriter {
|
|||
* @param output the file location to write to
|
||||
* @param doNotWriteGenotypes do not write genotypes
|
||||
*/
|
||||
public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) {
|
||||
mWriter = new BufferedWriter(new OutputStreamWriter(output));
|
||||
public StandardVCFWriter(final OutputStream output, final SAMSequenceDictionary refDict, final boolean doNotWriteGenotypes) {
|
||||
this(null, output, refDict, false, doNotWriteGenotypes);
|
||||
}
|
||||
|
||||
public StandardVCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
|
||||
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
|
||||
mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
|
||||
this.doNotWriteGenotypes = doNotWriteGenotypes;
|
||||
}
|
||||
|
||||
public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
|
||||
this.location = location;
|
||||
|
||||
if ( enableOnTheFlyIndexing ) {
|
||||
indexFile = Tribble.indexFile(location);
|
||||
try {
|
||||
idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
|
||||
//System.out.println("Creating index on the fly for " + location);
|
||||
indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
|
||||
indexer.initialize(location, indexer.defaultBinSize());
|
||||
positionalStream = new PositionalStream(output);
|
||||
output = positionalStream;
|
||||
} catch ( IOException ex ) {
|
||||
// No matter what we keep going, since we don't care if we can't create the index file
|
||||
}
|
||||
}
|
||||
|
||||
//mWriter = new BufferedWriter(new OutputStreamWriter(new PositionalStream(output)));
|
||||
mWriter = new BufferedWriter(new OutputStreamWriter(output));
|
||||
this.doNotWriteGenotypes = doNotWriteGenotypes;
|
||||
}
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// VCFWriter interface functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public void writeHeader(VCFHeader header) {
|
||||
mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header;
|
||||
|
||||
|
|
@ -158,44 +133,24 @@ public class StandardVCFWriter implements VCFWriter {
|
|||
mWriter.flush(); // necessary so that writing to an output stream will work
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new TribbleException("IOException writing the VCF header to " + locationString(), e);
|
||||
throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private String locationString() {
|
||||
return location == null ? mWriter.toString() : location.getAbsolutePath();
|
||||
}
|
||||
|
||||
/**
|
||||
* attempt to close the VCF file
|
||||
*/
|
||||
@Override
|
||||
public void close() {
|
||||
// try to close the vcf stream
|
||||
try {
|
||||
mWriter.flush();
|
||||
mWriter.close();
|
||||
} catch (IOException e) {
|
||||
throw new TribbleException("Unable to close " + locationString() + " because of " + e.getMessage());
|
||||
throw new ReviewedStingException("Unable to close " + getStreamName(), e);
|
||||
}
|
||||
|
||||
// try to close the index stream (keep it separate to help debugging efforts)
|
||||
if ( indexer != null ) {
|
||||
try {
|
||||
Index index = indexer.finalizeIndex(positionalStream.getPosition());
|
||||
index.write(idxStream);
|
||||
idxStream.close();
|
||||
} catch (IOException e) {
|
||||
throw new TribbleException("Unable to close index for " + locationString() + " because of " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static OutputStream openOutputStream(File location) {
|
||||
try {
|
||||
return new FileOutputStream(location);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new TribbleException("Unable to create VCF file at location: " + location);
|
||||
}
|
||||
super.close();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -203,28 +158,17 @@ public class StandardVCFWriter implements VCFWriter {
|
|||
*
|
||||
* @param vc the Variant Context object
|
||||
*/
|
||||
@Override
|
||||
public void add(VariantContext vc) {
|
||||
add(vc, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* add a record to the file
|
||||
*
|
||||
* @param vc the Variant Context object
|
||||
* @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD)
|
||||
*/
|
||||
public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) {
|
||||
if ( mHeader == null )
|
||||
throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString());
|
||||
throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName());
|
||||
|
||||
if ( doNotWriteGenotypes )
|
||||
vc = VariantContext.modifyGenotypes(vc, null);
|
||||
|
||||
try {
|
||||
vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles);
|
||||
|
||||
// if we are doing on the fly indexing, add the record ***before*** we write any bytes
|
||||
if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition());
|
||||
vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false);
|
||||
super.add(vc);
|
||||
|
||||
Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
|
||||
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
|
||||
|
|
@ -275,7 +219,7 @@ public class StandardVCFWriter implements VCFWriter {
|
|||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// FILTER
|
||||
String filters = vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (filtersWereAppliedToContext || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
|
||||
String filters = getFilterString(vc, filtersWereAppliedToContext);
|
||||
mWriter.write(filters);
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
|
|
@ -317,9 +261,22 @@ public class StandardVCFWriter implements VCFWriter {
|
|||
mWriter.write("\n");
|
||||
mWriter.flush(); // necessary so that writing to an output stream will work
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to write the VCF object to " + locationString());
|
||||
throw new RuntimeException("Unable to write the VCF object to " + getStreamName());
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// implementation functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
public static final String getFilterString(final VariantContext vc) {
|
||||
return getFilterString(vc, false);
|
||||
}
|
||||
|
||||
public static final String getFilterString(final VariantContext vc, boolean forcePASS) {
|
||||
return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
|
||||
}
|
||||
|
||||
private String getQualValue(double qual) {
|
||||
|
|
@ -462,7 +419,7 @@ public class StandardVCFWriter implements VCFWriter {
|
|||
mWriter.write(encoding);
|
||||
}
|
||||
|
||||
private static String formatVCFField(Object val) {
|
||||
public static String formatVCFField(Object val) {
|
||||
String result;
|
||||
if ( val == null )
|
||||
result = VCFConstants.MISSING_VALUE_v4;
|
||||
|
|
@ -524,12 +481,11 @@ public class StandardVCFWriter implements VCFWriter {
|
|||
}
|
||||
|
||||
|
||||
public static int countOccurrences(char c, String s) {
|
||||
private static int countOccurrences(char c, String s) {
|
||||
int count = 0;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
count += s.charAt(i) == c ? 1 : 0;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -105,34 +105,37 @@ public class VCFCodec extends AbstractVCFCodec {
|
|||
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
|
||||
*/
|
||||
protected Set<String> parseFilters(String filterString) {
|
||||
return parseFilters(filterHash, lineNo, filterString);
|
||||
}
|
||||
|
||||
public static Set<String> parseFilters(final Map<String, LinkedHashSet<String>> cache, final int lineNo, final String filterString) {
|
||||
// null for unfiltered
|
||||
if ( filterString.equals(VCFConstants.UNFILTERED) )
|
||||
return null;
|
||||
|
||||
// empty set for passes filters
|
||||
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
|
||||
|
||||
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
|
||||
return fFields;
|
||||
return Collections.emptySet();
|
||||
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
|
||||
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4");
|
||||
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
|
||||
if ( filterString.length() == 0 )
|
||||
generateException("The VCF specification requires a valid filter status");
|
||||
generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
|
||||
|
||||
// do we have the filter string cached?
|
||||
if ( filterHash.containsKey(filterString) )
|
||||
return filterHash.get(filterString);
|
||||
if ( cache != null && cache.containsKey(filterString) )
|
||||
return Collections.unmodifiableSet(cache.get(filterString));
|
||||
|
||||
// empty set for passes filters
|
||||
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
|
||||
// otherwise we have to parse and cache the value
|
||||
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
|
||||
fFields.add(filterString);
|
||||
else
|
||||
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
|
||||
|
||||
filterHash.put(filterString, fFields);
|
||||
fFields = fFields;
|
||||
if ( cache != null ) cache.put(filterString, fFields);
|
||||
|
||||
return fFields;
|
||||
return Collections.unmodifiableSet(fFields);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ public class VCFHeader {
|
|||
private final Set<VCFHeaderLine> mMetaData;
|
||||
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
|
||||
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
|
||||
private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();
|
||||
|
||||
// the list of auxillary tags
|
||||
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
|
||||
|
|
@ -110,6 +111,9 @@ public class VCFHeader {
|
|||
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
|
||||
mFormatMetaData.put(formatLine.getName(), formatLine);
|
||||
}
|
||||
else {
|
||||
mOtherMetaData.put(line.getKey(), line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -185,6 +189,14 @@ public class VCFHeader {
|
|||
public VCFFormatHeaderLine getFormatHeaderLine(String key) {
|
||||
return mFormatMetaData.get(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param key the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFHeaderLine getOtherHeaderLine(String key) {
|
||||
return mOtherMetaData.get(key);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,256 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.gcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* GATK binary VCF record
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class GCF {
|
||||
private final static int RECORD_TERMINATOR = 123456789;
|
||||
private int chromOffset;
|
||||
private int start, stop;
|
||||
private String id;
|
||||
private List<Allele> alleleMap;
|
||||
private int alleleOffsets[];
|
||||
private float qual;
|
||||
private byte refPad;
|
||||
private String info;
|
||||
private int filterOffset;
|
||||
|
||||
private List<GCFGenotype> genotypes = Collections.emptyList();
|
||||
|
||||
public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boolean skipGenotypes) {
|
||||
chromOffset = GCFHeaderBuilder.encodeString(vc.getChr());
|
||||
start = vc.getStart();
|
||||
stop = vc.getEnd();
|
||||
refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0;
|
||||
id = vc.getID();
|
||||
|
||||
// encode alleles
|
||||
alleleMap = new ArrayList<Allele>(vc.getNAlleles());
|
||||
alleleOffsets = new int[vc.getNAlleles()];
|
||||
alleleMap.add(vc.getReference());
|
||||
alleleOffsets[0] = GCFHeaderBuilder.encodeAllele(vc.getReference());
|
||||
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
||||
alleleMap.add(vc.getAlternateAllele(i));
|
||||
alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i));
|
||||
}
|
||||
|
||||
qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual());
|
||||
info = infoFieldString(vc, GCFHeaderBuilder);
|
||||
filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc));
|
||||
|
||||
if ( ! skipGenotypes ) {
|
||||
genotypes = encodeGenotypes(GCFHeaderBuilder, vc);
|
||||
}
|
||||
}
|
||||
|
||||
public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException {
|
||||
chromOffset = inputStream.readInt();
|
||||
|
||||
// have we reached the footer?
|
||||
if ( chromOffset == GCFHeader.FOOTER_START_MARKER )
|
||||
throw new EOFException();
|
||||
|
||||
start = inputStream.readInt();
|
||||
stop = inputStream.readInt();
|
||||
id = inputStream.readUTF();
|
||||
refPad = inputStream.readByte();
|
||||
alleleOffsets = readIntArray(inputStream);
|
||||
qual = inputStream.readFloat();
|
||||
info = inputStream.readUTF();
|
||||
filterOffset = inputStream.readInt();
|
||||
|
||||
int nGenotypes = inputStream.readInt();
|
||||
int sizeOfGenotypes = inputStream.readInt();
|
||||
if ( skipGenotypes ) {
|
||||
genotypes = Collections.emptyList();
|
||||
inputStream.skipBytes(sizeOfGenotypes);
|
||||
} else {
|
||||
genotypes = new ArrayList<GCFGenotype>(nGenotypes);
|
||||
for ( int i = 0; i < nGenotypes; i++ )
|
||||
genotypes.add(new GCFGenotype(this, inputStream));
|
||||
}
|
||||
|
||||
int recordDone = inputStream.readInt();
|
||||
if ( recordDone != RECORD_TERMINATOR )
|
||||
throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key");
|
||||
}
|
||||
|
||||
public int write(DataOutputStream outputStream) throws IOException {
|
||||
int startSize = outputStream.size();
|
||||
outputStream.writeInt(chromOffset);
|
||||
outputStream.writeInt(start);
|
||||
outputStream.writeInt(stop);
|
||||
outputStream.writeUTF(id);
|
||||
outputStream.writeByte(refPad);
|
||||
writeIntArray(alleleOffsets, outputStream, true);
|
||||
outputStream.writeFloat(qual);
|
||||
outputStream.writeUTF(info);
|
||||
outputStream.writeInt(filterOffset);
|
||||
|
||||
int nGenotypes = genotypes.size();
|
||||
int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes;
|
||||
outputStream.writeInt(nGenotypes);
|
||||
outputStream.writeInt(expectedSizeOfGenotypes);
|
||||
int obsSizeOfGenotypes = 0;
|
||||
for ( GCFGenotype g : genotypes )
|
||||
obsSizeOfGenotypes += g.write(outputStream);
|
||||
if ( obsSizeOfGenotypes != expectedSizeOfGenotypes )
|
||||
throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes);
|
||||
|
||||
outputStream.writeInt(RECORD_TERMINATOR);
|
||||
return outputStream.size() - startSize;
|
||||
}
|
||||
|
||||
public VariantContext decode(final String source, final GCFHeader header) {
|
||||
final String contig = header.getString(chromOffset);
|
||||
alleleMap = header.getAlleles(alleleOffsets);
|
||||
double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual);
|
||||
Set<String> filters = header.getFilters(filterOffset);
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
attributes.put("INFO", info);
|
||||
Byte refPadByte = refPad == 0 ? null : refPad;
|
||||
Map<String, Genotype> genotypes = decodeGenotypes(header);
|
||||
|
||||
return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte);
|
||||
}
|
||||
|
||||
private Map<String, Genotype> decodeGenotypes(final GCFHeader header) {
|
||||
if ( genotypes.isEmpty() )
|
||||
return VariantContext.NO_GENOTYPES;
|
||||
else {
|
||||
Map<String, Genotype> map = new TreeMap<String, Genotype>();
|
||||
|
||||
for ( int i = 0; i < genotypes.size(); i++ ) {
|
||||
final String sampleName = header.getSample(i);
|
||||
final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap);
|
||||
map.put(sampleName, g);
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
}
|
||||
|
||||
private List<GCFGenotype> encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc) {
|
||||
int nGenotypes = vc.getNSamples();
|
||||
if ( nGenotypes > 0 ) {
|
||||
List<GCFGenotype> genotypes = new ArrayList<GCFGenotype>(nGenotypes);
|
||||
for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null);
|
||||
|
||||
for ( Genotype g : vc.getGenotypes().values() ) {
|
||||
int i = GCFHeaderBuilder.encodeSample(g.getSampleName());
|
||||
genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g));
|
||||
}
|
||||
|
||||
return genotypes;
|
||||
} else {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
}
|
||||
|
||||
public int getNAlleles() { return alleleOffsets.length; }
|
||||
|
||||
|
||||
private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) {
|
||||
StringBuilder s = new StringBuilder();
|
||||
|
||||
boolean first = true;
|
||||
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
|
||||
String key = field.getKey();
|
||||
if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) )
|
||||
continue;
|
||||
int stringIndex = GCFHeaderBuilder.encodeString(key);
|
||||
String outputValue = StandardVCFWriter.formatVCFField(field.getValue());
|
||||
if ( outputValue != null ) {
|
||||
if ( ! first ) s.append(";");
|
||||
s.append(stringIndex).append("=").append(outputValue);
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
protected final static int BUFFER_SIZE = 1048576; // 2**20
|
||||
|
||||
public static DataInputStream createDataInputStream(final InputStream stream) {
|
||||
return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE));
|
||||
}
|
||||
|
||||
public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException {
|
||||
return new FileInputStream(file);
|
||||
}
|
||||
|
||||
protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException {
|
||||
return readIntArray(inputStream, inputStream.readInt());
|
||||
}
|
||||
|
||||
protected final static int[] readIntArray(final DataInputStream inputStream, int size) throws IOException {
|
||||
int[] array = new int[size];
|
||||
for ( int i = 0; i < array.length; i++ )
|
||||
array[i] = inputStream.readInt();
|
||||
return array;
|
||||
}
|
||||
|
||||
protected final static void writeIntArray(int[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
|
||||
if ( writeSize ) outputStream.writeInt(array.length);
|
||||
for ( int i : array )
|
||||
outputStream.writeInt(i);
|
||||
}
|
||||
|
||||
protected final static byte[] readByteArray(final DataInputStream inputStream) throws IOException {
|
||||
return readByteArray(inputStream, inputStream.readInt());
|
||||
}
|
||||
|
||||
protected final static byte[] readByteArray(final DataInputStream inputStream, int size) throws IOException {
|
||||
byte[] array = new byte[size];
|
||||
for ( int i = 0; i < array.length; i++ )
|
||||
array[i] = inputStream.readByte();
|
||||
return array;
|
||||
}
|
||||
|
||||
protected final static void writeByteArray(byte[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
|
||||
if ( writeSize ) outputStream.writeInt(array.length);
|
||||
for ( byte i : array )
|
||||
outputStream.writeByte(i);
|
||||
}
|
||||
|
||||
protected final static byte qualToByte(double phredScaledQual) {
|
||||
return (byte)Math.round(Math.min(phredScaledQual, 255));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,147 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.gcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* GATK binary VCF record
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class GCFGenotype {
|
||||
private byte gq;
|
||||
private int gt;
|
||||
private int dp;
|
||||
private int ad[];
|
||||
private byte[] pl;
|
||||
|
||||
// todo -- what to do about phasing? Perhaps we shouldn't support it
|
||||
// todo -- is the FL field generic or just a flag? Should we even support per sample filtering?
|
||||
|
||||
public GCFGenotype(final GCFHeaderBuilder GCFHeaderBuilder, final List<Allele> allAlleles, Genotype genotype) {
|
||||
gq = GCF.qualToByte(genotype.getPhredScaledQual());
|
||||
gt = encodeAlleles(genotype.getAlleles(), allAlleles);
|
||||
|
||||
dp = genotype.getAttributeAsInt("DP", 0);
|
||||
|
||||
int nAlleles = allAlleles.size();
|
||||
ad = new int[nAlleles];
|
||||
|
||||
int npls = nAllelesToNPls(nAlleles);
|
||||
pl = new byte[npls];
|
||||
}
|
||||
|
||||
private int nAllelesToNPls( int nAlleles ) {
|
||||
return nAlleles*(nAlleles+1) / 2;
|
||||
}
|
||||
|
||||
public GCFGenotype(GCF GCF, DataInputStream inputStream) throws IOException {
|
||||
int gqInt = inputStream.readUnsignedByte();
|
||||
gq = (byte)gqInt;
|
||||
gt = inputStream.readInt();
|
||||
dp = inputStream.readInt();
|
||||
ad = GCF.readIntArray(inputStream, GCF.getNAlleles());
|
||||
pl = GCF.readByteArray(inputStream, nAllelesToNPls(GCF.getNAlleles()));
|
||||
}
|
||||
|
||||
// 2 alleles => 1 + 8 + 8 + 3 => 20
|
||||
protected int sizeInBytes() {
|
||||
return 1 // gq
|
||||
+ 4 * 2 // gt + dp
|
||||
+ 4 * ad.length // ad
|
||||
+ 1 * pl.length; // pl
|
||||
}
|
||||
|
||||
public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List<Allele> alleleIndex) {
|
||||
final List<Allele> alleles = decodeAlleles(gt, alleleIndex);
|
||||
final double negLog10PError = gq / 10.0;
|
||||
final Set<String> filters = Collections.emptySet();
|
||||
final Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
attributes.put("DP", dp);
|
||||
attributes.put("AD", ad);
|
||||
attributes.put("PL", pl);
|
||||
|
||||
return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false);
|
||||
}
|
||||
|
||||
private static int encodeAlleles(List<Allele> gtList, List<Allele> allAlleles) {
|
||||
final int nAlleles = gtList.size();
|
||||
if ( nAlleles > 4 )
|
||||
throw new IllegalArgumentException("encodeAlleles doesn't support more than 4 alt alleles, but I saw " + gtList);
|
||||
|
||||
int gtInt = 0;
|
||||
for ( int i = 0; i < nAlleles ; i++ ) {
|
||||
final int bitOffset = i * 8;
|
||||
final int allelei = getAlleleIndex(gtList.get(i), allAlleles);
|
||||
final int gti = (allelei + 1) << bitOffset;
|
||||
gtInt = gtInt | gti;
|
||||
}
|
||||
|
||||
return gtInt;
|
||||
}
|
||||
|
||||
private static int getAlleleIndex(Allele q, List<Allele> allAlleles) {
|
||||
if ( q.isNoCall() )
|
||||
return 254;
|
||||
for ( int i = 0; i < allAlleles.size(); i++ )
|
||||
if ( q.equals(allAlleles.get(i)) )
|
||||
return i;
|
||||
throw new IllegalStateException("getAlleleIndex passed allele not in map! allele " + q + " allAlleles " + allAlleles);
|
||||
}
|
||||
|
||||
private static List<Allele> decodeAlleles(int gtInt, List<Allele> alleleIndex) {
|
||||
List<Allele> alleles = new ArrayList<Allele>(4);
|
||||
|
||||
for ( int i = 0; i < 32; i += 8 ) {
|
||||
final int gi = (gtInt & (0x000000FF << i)) >> i;
|
||||
if ( gi != 0 ) {
|
||||
final int allelei = gi - 1;
|
||||
alleles.add( allelei == 254 ? Allele.NO_CALL : alleleIndex.get(allelei) );
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
public int write(DataOutputStream outputStream) throws IOException {
|
||||
int startSize = outputStream.size();
|
||||
outputStream.writeByte(gq);
|
||||
outputStream.writeInt(gt);
|
||||
outputStream.writeInt(dp);
|
||||
GCF.writeIntArray(ad, outputStream, false);
|
||||
GCF.writeByteArray(pl, outputStream, false);
|
||||
return outputStream.size() - startSize;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,205 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.gcf;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* [Short one sentence description of this walker]
|
||||
* <p/>
|
||||
* <p>
|
||||
* [Functionality of this walker]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* [Input description]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Output description]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T $WalkerName
|
||||
* </pre>
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class GCFHeader {
|
||||
final protected static Logger logger = Logger.getLogger(GCFHeader.class);
|
||||
|
||||
public final static int GCF_VERSION = 1;
|
||||
public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes();
|
||||
public final static int FOOTER_START_MARKER = -1;
|
||||
public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version
|
||||
|
||||
final int version;
|
||||
long footerPosition;
|
||||
final List<Allele> alleles;
|
||||
final List<String> strings;
|
||||
final List<String> samples;
|
||||
final List<Set<String>> filters;
|
||||
|
||||
public GCFHeader(final Map<Allele, Integer> allelesIn, final Map<String, Integer> stringIn, final Map<String, Integer> samplesIn) {
|
||||
version = GCF_VERSION;
|
||||
footerPosition = 0;
|
||||
this.alleles = linearize(allelesIn);
|
||||
this.strings = linearize(stringIn);
|
||||
this.samples = linearize(samplesIn);
|
||||
this.filters = null; // not used with this constructor
|
||||
}
|
||||
|
||||
public GCFHeader(FileInputStream fileInputStream) throws IOException {
|
||||
DataInputStream inputStream = new DataInputStream(fileInputStream);
|
||||
byte[] headerTest = new byte[GCF_FILE_START_MARKER.length];
|
||||
inputStream.read(headerTest);
|
||||
if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) {
|
||||
throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest));
|
||||
} else {
|
||||
version = inputStream.readInt();
|
||||
logger.info("Read GCF version " + version);
|
||||
footerPosition = inputStream.readLong();
|
||||
logger.info("Read footer position of " + footerPosition);
|
||||
long lastPos = fileInputStream.getChannel().position();
|
||||
logger.info(" Last position is " + lastPos);
|
||||
|
||||
// seek to the footer
|
||||
fileInputStream.getChannel().position(footerPosition);
|
||||
if ( inputStream.readInt() != FOOTER_START_MARKER )
|
||||
throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker");
|
||||
alleles = stringsToAlleles(readStrings(inputStream));
|
||||
strings = readStrings(inputStream);
|
||||
samples = readStrings(inputStream);
|
||||
logger.info(String.format("Allele map of %d elements", alleles.size()));
|
||||
logger.info(String.format("String map of %d elements", strings.size()));
|
||||
logger.info(String.format("Sample map of %d elements", samples.size()));
|
||||
filters = initializeFilterCache();
|
||||
fileInputStream.getChannel().position(lastPos);
|
||||
}
|
||||
}
|
||||
|
||||
public static int writeHeader(final DataOutputStream outputStream) throws IOException {
|
||||
int startBytes = outputStream.size();
|
||||
outputStream.write(GCF_FILE_START_MARKER);
|
||||
outputStream.writeInt(GCF_VERSION);
|
||||
outputStream.writeLong(0);
|
||||
return outputStream.size() - startBytes;
|
||||
}
|
||||
|
||||
public int writeFooter(final DataOutputStream outputStream) throws IOException {
|
||||
int startBytes = outputStream.size();
|
||||
outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding
|
||||
write(outputStream, allelesToStrings(alleles));
|
||||
write(outputStream, strings);
|
||||
write(outputStream, samples);
|
||||
return outputStream.size() - startBytes;
|
||||
}
|
||||
|
||||
private void write(DataOutputStream outputStream, List<String> l) throws IOException {
|
||||
outputStream.writeInt(l.size());
|
||||
for ( String elt : l ) outputStream.writeUTF(elt);
|
||||
}
|
||||
|
||||
private List<String> allelesToStrings(List<Allele> alleles) {
|
||||
List<String> strings = new ArrayList<String>(alleles.size());
|
||||
for ( Allele allele : alleles ) strings.add(allele.toString());
|
||||
return strings;
|
||||
}
|
||||
|
||||
private List<Set<String>> initializeFilterCache() {
|
||||
// required to allow offset -> set lookup
|
||||
List<Set<String>> l = new ArrayList<Set<String>>(strings.size());
|
||||
for ( int i = 0; i < strings.size(); i++ ) l.add(null);
|
||||
return l;
|
||||
}
|
||||
|
||||
private static List<Allele> stringsToAlleles(final List<String> strings) {
|
||||
final List<Allele> alleles = new ArrayList<Allele>(strings.size());
|
||||
for ( String string : strings ) {
|
||||
boolean isRef = string.endsWith("*");
|
||||
if ( isRef ) string = string.substring(0, string.length() - 1);
|
||||
alleles.add(Allele.create(string, isRef));
|
||||
}
|
||||
return alleles;
|
||||
}
|
||||
|
||||
private static List<String> readStrings(final DataInputStream inputStream) throws IOException {
|
||||
final int nStrings = inputStream.readInt();
|
||||
|
||||
final List<String> strings = new ArrayList<String>(nStrings);
|
||||
for ( int i = 0; i < nStrings; i++ ) {
|
||||
strings.add(inputStream.readUTF());
|
||||
}
|
||||
|
||||
return strings;
|
||||
}
|
||||
|
||||
private static <T> List<T> linearize(final Map<T, Integer> map) {
|
||||
final ArrayList<T> l = new ArrayList<T>(map.size());
|
||||
for ( int i = 0; i < map.size(); i++ ) l.add(null);
|
||||
for ( final Map.Entry<T, Integer> elt : map.entrySet() )
|
||||
l.set(elt.getValue(), elt.getKey());
|
||||
return l;
|
||||
}
|
||||
|
||||
public String getSample(final int offset) { return samples.get(offset); }
|
||||
public String getString(final int offset) { return strings.get(offset); }
|
||||
public Allele getAllele(final int offset) { return alleles.get(offset); }
|
||||
public List<Allele> getAlleles(final int[] offsets) {
|
||||
final List<Allele> alleles = new ArrayList<Allele>(offsets.length);
|
||||
for ( int i : offsets ) alleles.add(getAllele(i));
|
||||
return alleles;
|
||||
}
|
||||
|
||||
public Set<String> getFilters(final int offset) {
|
||||
Set<String> cached = filters.get(offset);
|
||||
|
||||
if ( cached != null )
|
||||
return cached;
|
||||
else {
|
||||
final String filterString = getString(offset);
|
||||
if ( filterString.equals(VCFConstants.UNFILTERED) )
|
||||
return null; // UNFILTERED records are represented by null
|
||||
else {
|
||||
Set<String> set = VCFCodec.parseFilters(null, -1, filterString);
|
||||
filters.set(offset, set); // remember the result
|
||||
return set;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.gcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* [Short one sentence description of this walker]
|
||||
* <p/>
|
||||
* <p>
|
||||
* [Functionality of this walker]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* [Input description]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Output description]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T $WalkerName
|
||||
* </pre>
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class GCFHeaderBuilder {
|
||||
Map<Allele, Integer> alleles = new HashMap<Allele, Integer>();
|
||||
Map<String, Integer> strings = new HashMap<String, Integer>();
|
||||
Map<String, Integer> samples = new HashMap<String, Integer>();
|
||||
|
||||
public GCFHeader createHeader() {
|
||||
return new GCFHeader(alleles, strings, samples);
|
||||
}
|
||||
|
||||
public int encodeString(final String chr) { return encode(strings, chr); }
|
||||
public int encodeAllele(final Allele allele) { return encode(alleles, allele); }
|
||||
public int encodeSample(final String sampleName) { return encode(samples, sampleName); }
|
||||
|
||||
private <T> int encode(Map<T, Integer> map, T key) {
|
||||
Integer v = map.get(key);
|
||||
if ( v == null ) {
|
||||
v = map.size();
|
||||
map.put(key, v);
|
||||
}
|
||||
return v;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.gcf;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* GCFWriter implementing the VCFWriter interface
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class GCFWriter extends IndexingVCFWriter {
|
||||
final boolean skipGenotypes;
|
||||
final FileOutputStream fileOutputStream;
|
||||
final DataOutputStream dataOutputStream;
|
||||
final GCFHeaderBuilder gcfHeaderBuilder;
|
||||
int nbytes = 0;
|
||||
VCFHeader header = null;
|
||||
File location;
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
public GCFWriter(final File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
|
||||
super(writerName(location, null), location, null, refDict, enableOnTheFlyIndexing);
|
||||
this.location = location;
|
||||
this.skipGenotypes = doNotWriteGenotypes;
|
||||
|
||||
// write the output
|
||||
try {
|
||||
fileOutputStream = new FileOutputStream(location);
|
||||
dataOutputStream = createDataOutputStream(fileOutputStream);
|
||||
gcfHeaderBuilder = new GCFHeaderBuilder();
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(location, e);
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// VCFWriter interface functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public void writeHeader(VCFHeader header) {
|
||||
this.header = header;
|
||||
try {
|
||||
nbytes += GCFHeader.writeHeader(dataOutputStream);
|
||||
} catch ( IOException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(VariantContext vc) {
|
||||
super.add(vc);
|
||||
GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes);
|
||||
try {
|
||||
nbytes += gcf.write(dataOutputStream);
|
||||
} catch ( IOException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
// todo -- write out VCF header lines
|
||||
GCFHeader gcfHeader = gcfHeaderBuilder.createHeader();
|
||||
try {
|
||||
long headerPosition = nbytes;
|
||||
nbytes += gcfHeader.writeFooter(dataOutputStream);
|
||||
dataOutputStream.close();
|
||||
//System.out.println("Writing forward reference to " + headerPosition);
|
||||
|
||||
RandomAccessFile raFile = new RandomAccessFile(location, "rw");
|
||||
raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET);
|
||||
raFile.writeLong(headerPosition);
|
||||
raFile.close();
|
||||
} catch ( IOException e ) {
|
||||
throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e);
|
||||
}
|
||||
|
||||
super.close();
|
||||
}
|
||||
|
||||
private static final DataOutputStream createDataOutputStream(final OutputStream stream) {
|
||||
return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -334,24 +334,44 @@ public class IntervalUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Splits an interval list into multiple files.
|
||||
* @param fileHeader The sam file header.
|
||||
* Splits an interval list into multiple sublists.
|
||||
* @param locs The genome locs to split.
|
||||
* @param splits The stop points for the genome locs returned by splitFixedIntervals.
|
||||
* @param scatterParts The output interval lists to write to.
|
||||
* @return A list of lists of genome locs, split according to splits
|
||||
*/
|
||||
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<Integer> splits, List<File> scatterParts) {
|
||||
if (splits.size() != scatterParts.size())
|
||||
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
|
||||
int fileIndex = 0;
|
||||
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
|
||||
int locIndex = 1;
|
||||
int start = 0;
|
||||
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
|
||||
for (Integer stop: splits) {
|
||||
IntervalList intervalList = new IntervalList(fileHeader);
|
||||
List<GenomeLoc> curList = new ArrayList<GenomeLoc>();
|
||||
for (int i = start; i < stop; i++)
|
||||
intervalList.add(toInterval(locs.get(i), locIndex++));
|
||||
intervalList.write(scatterParts.get(fileIndex++));
|
||||
curList.add(locs.get(i));
|
||||
start = stop;
|
||||
sublists.add(curList);
|
||||
}
|
||||
|
||||
return sublists;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Splits an interval list into multiple files.
|
||||
* @param fileHeader The sam file header.
|
||||
* @param splits Pre-divided genome locs returned by splitFixedIntervals.
|
||||
* @param scatterParts The output interval lists to write to.
|
||||
*/
|
||||
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<List<GenomeLoc>> splits, List<File> scatterParts) {
|
||||
if (splits.size() != scatterParts.size())
|
||||
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
|
||||
|
||||
int fileIndex = 0;
|
||||
int locIndex = 1;
|
||||
for (final List<GenomeLoc> split : splits) {
|
||||
IntervalList intervalList = new IntervalList(fileHeader);
|
||||
for (final GenomeLoc loc : split)
|
||||
intervalList.add(toInterval(loc, locIndex++));
|
||||
intervalList.write(scatterParts.get(fileIndex++));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -361,17 +381,15 @@ public class IntervalUtils {
|
|||
* @param numParts Number of parts to split the locs into.
|
||||
* @return The stop points to split the genome locs.
|
||||
*/
|
||||
public static List<Integer> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
|
||||
public static List<List<GenomeLoc>> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
|
||||
if (locs.size() < numParts)
|
||||
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
|
||||
long locsSize = 0;
|
||||
for (GenomeLoc loc: locs)
|
||||
locsSize += loc.size();
|
||||
List<Integer> splitPoints = new ArrayList<Integer>();
|
||||
final long locsSize = intervalSize(locs);
|
||||
final List<Integer> splitPoints = new ArrayList<Integer>();
|
||||
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
|
||||
Collections.sort(splitPoints);
|
||||
splitPoints.add(locs.size());
|
||||
return splitPoints;
|
||||
return splitIntervalsToSubLists(locs, splitPoints);
|
||||
}
|
||||
|
||||
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
|
||||
|
|
@ -441,4 +459,11 @@ public class IntervalUtils {
|
|||
return merged;
|
||||
}
|
||||
}
|
||||
|
||||
public static final long intervalSize(final List<GenomeLoc> locs) {
|
||||
long size = 0;
|
||||
for ( final GenomeLoc loc : locs )
|
||||
size += loc.size();
|
||||
return size;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -118,31 +118,40 @@ public class ReadUtils {
|
|||
/**
|
||||
* This enum represents all the different ways in which a read can overlap an interval.
|
||||
*
|
||||
* NO_OVERLAP:
|
||||
* NO_OVERLAP_CONTIG:
|
||||
* read and interval are in different contigs.
|
||||
*
|
||||
* NO_OVERLAP_LEFT:
|
||||
* the read does not overlap the interval.
|
||||
*
|
||||
* |----------------| (interval)
|
||||
* <----------------> (read)
|
||||
*
|
||||
* NO_OVERLAP_RIGHT:
|
||||
* the read does not overlap the interval.
|
||||
*
|
||||
* |----------------| (interval)
|
||||
* <----------------> (read)
|
||||
*
|
||||
* LEFT_OVERLAP:
|
||||
* OVERLAP_LEFT:
|
||||
* the read starts before the beginning of the interval but ends inside of it
|
||||
*
|
||||
* |----------------| (interval)
|
||||
* <----------------> (read)
|
||||
*
|
||||
* RIGHT_OVERLAP:
|
||||
* OVERLAP_RIGHT:
|
||||
* the read starts inside the interval but ends outside of it
|
||||
*
|
||||
* |----------------| (interval)
|
||||
* <----------------> (read)
|
||||
*
|
||||
* FULL_OVERLAP:
|
||||
* OVERLAP_LEFT_AND_RIGHT:
|
||||
* the read starts before the interval and ends after the interval
|
||||
*
|
||||
* |-----------| (interval)
|
||||
* <-------------------> (read)
|
||||
*
|
||||
* CONTAINED:
|
||||
* OVERLAP_CONTAINED:
|
||||
* the read starts and ends inside the interval
|
||||
*
|
||||
* |----------------| (interval)
|
||||
|
|
@ -658,7 +667,7 @@ public class ReadUtils {
|
|||
return ReadAndIntervalOverlap.OVERLAP_RIGHT;
|
||||
}
|
||||
|
||||
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"})
|
||||
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"})
|
||||
public static int getRefCoordSoftUnclippedStart(SAMRecord read) {
|
||||
int start = read.getUnclippedStart();
|
||||
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
|
||||
|
|
@ -670,9 +679,13 @@ public class ReadUtils {
|
|||
return start;
|
||||
}
|
||||
|
||||
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"})
|
||||
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"})
|
||||
public static int getRefCoordSoftUnclippedEnd(SAMRecord read) {
|
||||
int stop = read.getUnclippedStart();
|
||||
|
||||
if (readIsEntirelyInsertion(read))
|
||||
return stop;
|
||||
|
||||
int shift = 0;
|
||||
CigarOperator lastOperator = null;
|
||||
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
|
||||
|
|
@ -686,6 +699,14 @@ public class ReadUtils {
|
|||
return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ;
|
||||
}
|
||||
|
||||
private static boolean readIsEntirelyInsertion(SAMRecord read) {
|
||||
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
|
||||
if (cigarElement.getOperator() != CigarOperator.INSERTION)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before
|
||||
* the alignment start of the read.
|
||||
|
|
|
|||
|
|
@ -293,17 +293,8 @@ public class Genotype {
|
|||
return commonInfo.getAttribute(key, defaultValue);
|
||||
}
|
||||
|
||||
public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); }
|
||||
public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
|
||||
public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); }
|
||||
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
|
||||
public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); }
|
||||
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
|
||||
public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); }
|
||||
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
|
||||
|
||||
public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); }
|
||||
public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); }
|
||||
public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); }
|
||||
public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); }
|
||||
}
|
||||
|
|
@ -1,6 +1,8 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext;
|
||||
|
||||
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
|
|
@ -204,27 +206,40 @@ public final class InferredGeneticContext {
|
|||
return defaultValue;
|
||||
}
|
||||
|
||||
// public AttributedObject getAttributes(Collection<Object> keys) {
|
||||
// AttributedObject selected = new AttributedObject();
|
||||
//
|
||||
// for ( Object key : keys )
|
||||
// selected.putAttribute(key, this.getAttribute(key));
|
||||
//
|
||||
// return selected;
|
||||
// }
|
||||
public String getAttributeAsString(String key, String defaultValue) {
|
||||
Object x = getAttribute(key);
|
||||
if ( x == null ) return defaultValue;
|
||||
if ( x instanceof String ) return (String)x;
|
||||
return String.valueOf(x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
|
||||
public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
|
||||
public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
|
||||
public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
|
||||
public int getAttributeAsInt(String key, int defaultValue) {
|
||||
Object x = getAttribute(key);
|
||||
if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue;
|
||||
if ( x instanceof Integer ) return (Integer)x;
|
||||
return Integer.valueOf((String)x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
public String getAttributeAsString(String key, String defaultValue) { return (String)getAttribute(key, defaultValue); }
|
||||
public int getAttributeAsInt(String key, int defaultValue) { return (Integer)getAttribute(key, defaultValue); }
|
||||
public double getAttributeAsDouble(String key, double defaultValue) { return (Double)getAttribute(key, defaultValue); }
|
||||
public boolean getAttributeAsBoolean(String key, boolean defaultValue){ return (Boolean)getAttribute(key, defaultValue); }
|
||||
public double getAttributeAsDouble(String key, double defaultValue) {
|
||||
Object x = getAttribute(key);
|
||||
if ( x == null ) return defaultValue;
|
||||
if ( x instanceof Double ) return (Double)x;
|
||||
return Double.valueOf((String)x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
|
||||
public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
|
||||
public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); }
|
||||
public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
|
||||
public boolean getAttributeAsBoolean(String key, boolean defaultValue) {
|
||||
Object x = getAttribute(key);
|
||||
if ( x == null ) return defaultValue;
|
||||
if ( x instanceof Boolean ) return (Boolean)x;
|
||||
return Boolean.valueOf((String)x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
|
||||
// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
|
||||
// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
|
||||
// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
|
||||
// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
|
||||
// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
|
||||
// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); }
|
||||
// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
|
||||
}
|
||||
|
|
@ -666,21 +666,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
return commonInfo.getAttribute(key, defaultValue);
|
||||
}
|
||||
|
||||
public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); }
|
||||
public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
|
||||
public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); }
|
||||
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
|
||||
public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); }
|
||||
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
|
||||
public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); }
|
||||
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
|
||||
|
||||
public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); }
|
||||
public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); }
|
||||
public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); }
|
||||
public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); }
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Working with alleles
|
||||
|
|
@ -817,6 +807,28 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
throw new IllegalArgumentException("Requested " + i + " alternative allele but there are only " + n + " alternative alleles " + this);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param other VariantContext whose alternate alleles to compare against
|
||||
* @return true if this VariantContext has the same alternate alleles as other,
|
||||
* regardless of ordering. Otherwise returns false.
|
||||
*/
|
||||
public boolean hasSameAlternateAllelesAs ( VariantContext other ) {
|
||||
Set<Allele> thisAlternateAlleles = getAlternateAlleles();
|
||||
Set<Allele> otherAlternateAlleles = other.getAlternateAlleles();
|
||||
|
||||
if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for ( Allele allele : thisAlternateAlleles ) {
|
||||
if ( ! otherAlternateAlleles.contains(allele) ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Working with genotypes
|
||||
|
|
@ -983,7 +995,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
* @return true if it's monomorphic
|
||||
*/
|
||||
public boolean isMonomorphic() {
|
||||
return ! isVariant() || getChromosomeCount(getReference()) == getChromosomeCount();
|
||||
return ! isVariant() || (hasGenotypes() && getHomRefCount() + getNoCallCount() == getNSamples());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -1085,14 +1097,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
}
|
||||
|
||||
public void validateReferenceBases(Allele reference, Byte paddedRefBase) {
|
||||
// don't validate if we're an insertion or complex event
|
||||
if ( !reference.isNull() && getReference().length() == 1 && !reference.basesMatch(getReference()) ) {
|
||||
throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
|
||||
// don't validate if we're a complex event
|
||||
if ( !isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference()) ) {
|
||||
throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
|
||||
}
|
||||
|
||||
// we also need to validate the padding base for simple indels
|
||||
if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) )
|
||||
throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), (char)getReferenceBaseForIndel().byteValue(), (char)paddedRefBase.byteValue()));
|
||||
if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) {
|
||||
throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), (char)paddedRefBase.byteValue(), (char)getReferenceBaseForIndel().byteValue()));
|
||||
}
|
||||
}
|
||||
|
||||
public void validateRSIDs(Set<String> rsIDs) {
|
||||
|
|
|
|||
|
|
@ -565,11 +565,11 @@ public class VariantContextUtils {
|
|||
// special case DP (add it up) and ID (just preserve it)
|
||||
//
|
||||
if (vc.hasAttribute(VCFConstants.DEPTH_KEY))
|
||||
depth += Integer.valueOf(vc.getAttributeAsString(VCFConstants.DEPTH_KEY));
|
||||
depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
|
||||
if (rsID == null && vc.hasID())
|
||||
rsID = vc.getID();
|
||||
if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
|
||||
String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY);
|
||||
String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null);
|
||||
// lets see if the string contains a , separator
|
||||
if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) {
|
||||
List<String> alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
|
||||
|
|
@ -663,6 +663,18 @@ public class VariantContextUtils {
|
|||
return merged;
|
||||
}
|
||||
|
||||
public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) {
|
||||
// if all alleles of vc1 are a contained in alleles of vc2, return true
|
||||
if (!vc1.getReference().equals(vc2.getReference()))
|
||||
return false;
|
||||
|
||||
for (Allele a :vc1.getAlternateAlleles()) {
|
||||
if (!vc2.getAlternateAlleles().contains(a))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
|
||||
// see if we need to trim common reference base from all alleles
|
||||
boolean trimVC;
|
||||
|
|
@ -1147,9 +1159,7 @@ public class VariantContextUtils {
|
|||
for (String orAttrib : MERGE_OR_ATTRIBS) {
|
||||
boolean attribVal = false;
|
||||
for (VariantContext vc : vcList) {
|
||||
Boolean val = vc.getAttributeAsBooleanNoException(orAttrib);
|
||||
if (val != null)
|
||||
attribVal = (attribVal || val);
|
||||
attribVal = vc.getAttributeAsBoolean(orAttrib, false);
|
||||
if (attribVal) // already true, so no reason to continue:
|
||||
break;
|
||||
}
|
||||
|
|
@ -1159,7 +1169,7 @@ public class VariantContextUtils {
|
|||
// Merge ID fields:
|
||||
String iDVal = null;
|
||||
for (VariantContext vc : vcList) {
|
||||
String val = vc.getAttributeAsStringNoException(VariantContext.ID_KEY);
|
||||
String val = vc.getAttributeAsString(VariantContext.ID_KEY, null);
|
||||
if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) {
|
||||
if (iDVal == null)
|
||||
iDVal = val;
|
||||
|
|
@ -1239,8 +1249,10 @@ public class VariantContextUtils {
|
|||
|
||||
public PhaseAndQuality(Genotype gt) {
|
||||
this.isPhased = gt.isPhased();
|
||||
if (this.isPhased)
|
||||
this.PQ = gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY);
|
||||
if (this.isPhased) {
|
||||
this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
|
||||
if ( this.PQ == -1 ) this.PQ = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ public class WalkerTest extends BaseTest {
|
|||
Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec());
|
||||
Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath());
|
||||
|
||||
if ( ! indexFromOutputFile.equalsIgnoreTimestamp(dynamicIndex) ) {
|
||||
if ( ! indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) {
|
||||
Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n",
|
||||
indexFromOutputFile.getProperties(),
|
||||
dynamicIndex.getProperties()));
|
||||
|
|
|
|||
|
|
@ -29,7 +29,6 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
|
|||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -45,7 +44,6 @@ import org.testng.annotations.Test;
|
|||
|
||||
import java.io.*;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -164,7 +162,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
|
|||
try {
|
||||
Index idx = builder.loadIndex(vcfFile, new VCFCodec());
|
||||
// catch any exception; this call should pass correctly
|
||||
SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx);
|
||||
SAMSequenceDictionary dict = IndexDictionaryUtils.getSequenceDictionaryFromProperties(idx);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Assert.fail("IO exception unexpected" + e.getMessage());
|
||||
|
|
|
|||
|
|
@ -127,6 +127,7 @@ public class TraverseReadsUnitTest extends BaseTest {
|
|||
Object accumulator = countReadWalker.reduceInit();
|
||||
|
||||
while (shardStrategy.hasNext()) {
|
||||
traversalEngine.startTimersIfNecessary();
|
||||
Shard shard = shardStrategy.next();
|
||||
|
||||
if (shard == null) {
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue