Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Matt Hanna 2011-09-27 11:07:01 -04:00
commit e5ce5e265a
203 changed files with 5112 additions and 4149 deletions

299
build.xml
View File

@ -163,6 +163,14 @@
<!-- Remove old versions of ivy jars AFTER the ivy:retrieve has been class loaded. -->
<delete file="${ivy.jar.dir}/ivy-2.0.0.jar"/>
<delete file="${ivy.jar.dir}/ivy-2.2.0-rc1.jar"/>
<!--
An old version of the ivy-1.4.1.xml does not contain /ivy-module/configuration/conf/@name="compile".
Easier to upgrade to 1.4.4 than try to deal with xmlproperty and conditional deletion in ant.
Just in case we remove explicit 1.4.4 and go back to 1.4.1, try to clean out the file for now.
-->
<delete file="${ivy.home}/cache/javax.mail/mail/ivy-1.4.1.xml"/>
<delete file="${ivy.home}/cache/javax.mail/mail/ivydata-1.4.1.properties"/>
<delete file="${ivy.home}/cache/javax.mail/mail/jars/mail-1.4.1.jar"/>
</target>
<target name="init.buildall">
@ -709,53 +717,6 @@
</antcall>
</target>
<target name="test.init.compile">
<mkdir dir="${java.test.classes}"/>
<mkdir dir="${scala.test.classes}"/>
<antcall target="resolve">
<param name="ivy.conf" value="test"/>
</antcall>
</target>
<target name="test.java.compile" depends="init.buildall,dist,test.init.compile">
<echo message="Sting: Compiling test cases!"/>
<javac fork="true" memoryMaximumSize="512m" destdir="${java.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
<src path="${java.public.test.sources}"/>
<src path="${java.private.test.sources}"/>
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.classes}"/>
<pathelement location="${java.contracts}"/>
<pathelement location="${lib.dir}/testng-5.14.1.jar"/>
</classpath>
<compilerarg value="-proc:none"/>
<!--
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
-->
</javac>
</target>
<target name="test.scala.compile" depends="test.java.compile,scala.compile" if="scala.include">
<echo message="Scala: Compiling test cases!"/>
<antcall target="resolve">
<param name="ivy.conf" value="test"/>
</antcall>
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.test.classes}" deprecation="yes" unchecked="yes">
<src path="${scala.public.test.sources}" />
<src path="${scala.private.test.sources}" />
<include name="**/*.scala"/>
<classpath>
<path refid="scala.dependencies"/>
<pathelement location="${scala.test.classes}"/>
<pathelement location="${java.test.classes}"/>
<pathelement location="${lib.dir}/testng-5.14.1.jar"/>
</classpath>
</scalac>
</target>
<target name="test.compile" depends="init.usecontracts,test.java.compile,test.scala.compile" />
<!-- new scala target -->
<target name="scala" description="build the scala directory">
@ -769,20 +730,113 @@
<!-- ***************************************************************************** -->
<!-- where to put reports and tests-->
<property name="report" value="${build.dir}/report"/>
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
<property name="test.output" value="${dist.dir}/test"/>
<property name="java.public.test.sources" value="public/java/test"/>
<property name="java.private.test.sources" value="private/java/test"/>
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
<property name="java.public.test.classes" value="${java.test.classes}/public"/>
<property name="java.private.test.classes" value="${java.test.classes}/private"/>
<property name="java.public.test.sources" value="${public.dir}/java/test"/>
<property name="java.private.test.sources" value="${private.dir}/java/test"/>
<property name="scala.test.classes" value="${build.dir}/scala/testclasses"/>
<property name="scala.public.test.sources" value="public/scala/test"/>
<property name="scala.private.test.sources" value="private/scala/test"/>
<property name="scala.public.test.classes" value="${scala.test.classes}/public"/>
<property name="scala.private.test.classes" value="${scala.test.classes}/private"/>
<property name="scala.public.test.sources" value="${public.dir}/scala/test"/>
<property name="scala.private.test.sources" value="${private.dir}/scala/test"/>
<property name="testng.jar" value="${lib.dir}/testng-5.14.1.jar"/>
<!-- provide a ceiling on the memory that unit/integration tests can consume. -->
<property name="test.maxmemory" value="4g"/>
<target name="test.init.compile">
<mkdir dir="${java.test.classes}"/>
<mkdir dir="${scala.test.classes}"/>
<antcall target="resolve">
<param name="ivy.conf" value="test"/>
</antcall>
</target>
<target name="test.java.public.compile" depends="dist,test.init.compile">
<mkdir dir="${java.public.test.classes}"/>
<echo message="Sting: Compiling public test cases!"/>
<javac fork="true" memoryMaximumSize="512m" destdir="${java.public.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
<src path="${java.public.test.sources}"/>
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.classes}"/>
<pathelement location="${java.contracts}"/>
<pathelement location="${testng.jar}"/>
</classpath>
<compilerarg value="-proc:none"/>
<!--
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
-->
</javac>
</target>
<target name="test.java.private.compile" depends="dist,test.init.compile,test.java.public.compile" if="include.private">
<mkdir dir="${java.private.test.classes}"/>
<echo message="Sting: Compiling private test cases!"/>
<javac fork="true" memoryMaximumSize="512m" destdir="${java.private.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
<src path="${java.private.test.sources}"/>
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.public.test.classes}"/>
<pathelement location="${java.classes}"/>
<pathelement location="${java.contracts}"/>
<pathelement location="${testng.jar}"/>
</classpath>
<compilerarg value="-proc:none"/>
<!--
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
-->
</javac>
</target>
<target name="test.java.compile" depends="test.java.public.compile, test.java.private.compile"/>
<target name="test.scala.public.compile" depends="test.java.compile,scala.compile" if="scala.include">
<mkdir dir="${scala.public.test.classes}"/>
<echo message="Scala: Compiling public test cases!"/>
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.public.test.classes}" deprecation="yes" unchecked="yes">
<src path="${scala.public.test.sources}" />
<classpath>
<path refid="scala.dependencies"/>
<pathelement location="${java.public.test.classes}"/>
<pathelement location="${testng.jar}"/>
</classpath>
</scalac>
</target>
<target name="test.scala.private.compile" depends="test.java.compile,scala.compile,test.scala.public.compile" if="include.scala.private">
<mkdir dir="${scala.private.test.classes}"/>
<echo message="Scala: Compiling private test cases!"/>
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.private.test.classes}" deprecation="yes" unchecked="yes">
<src path="${scala.private.test.sources}" />
<classpath>
<path refid="scala.dependencies"/>
<pathelement location="${scala.public.test.classes}"/>
<pathelement location="${java.public.test.classes}"/>
<pathelement location="${java.private.test.classes}"/>
<pathelement location="${testng.jar}"/>
</classpath>
</scalac>
</target>
<target name="test.scala.compile" depends="test.scala.public.compile,test.scala.private.compile"/>
<target name="test.compile" depends="init.usecontracts,test.java.compile,test.scala.compile" />
<!-- TEST -->
<macrodef name="run-test">
<attribute name="testtype"/>
<attribute name="outputdir"/>
<attribute name="runfailed"/>
<sequential>
<condition property="run.failed.tests">
<equals arg1="@{runfailed}" arg2="true"/>
</condition>
<!-- Get the pipeline run type. Default to dry. -->
<condition property="pipeline.run" value="dry" else="${pipeline.run}">
<equals arg1="${pipeline.run}" arg2="$${pipeline.run}" />
@ -792,10 +846,10 @@
<isset property="include.contracts" />
</condition>
<mkdir dir="${report}/@{testtype}"/>
<mkdir dir="@{outputdir}"/>
<echo message="Sting: Running @{testtype} test cases!"/>
<taskdef resource="testngtasks" classpath="${lib.dir}/testng-5.14.1.jar"/>
<testng outputDir="${report}/@{testtype}"
<taskdef resource="testngtasks" classpath="${testng.jar}"/>
<testng outputDir="@{outputdir}"
haltOnFailure="false" failureProperty="test.failure"
verbose="2"
workingDir="${basedir}"
@ -813,117 +867,108 @@
<pathelement location="${java.classes}" />
<pathelement location="${scala.classes}" />
<pathelement location="${java.contracts}" />
<pathelement location="${java.test.classes}" />
<pathelement location="${scala.test.classes}" />
<pathelement location="${java.public.test.classes}" />
<pathelement location="${java.private.test.classes}" />
<pathelement location="${scala.public.test.classes}" />
<pathelement location="${scala.private.test.classes}" />
</classpath>
<classfileset dir="${java.test.classes}" includes="**/@{testtype}.class"/>
<classfileset dir="${scala.test.classes}" includes="**/@{testtype}*.class" />
<classfileset dir="${java.public.test.classes}" includes="**/@{testtype}.class"/>
<classfileset dir="${java.private.test.classes}" erroronmissingdir="false">
<include name="**/@{testtype}.class" if="include.private"/>
</classfileset>
<classfileset dir="${scala.public.test.classes}" erroronmissingdir="false">
<include name="**/@{testtype}*.class" if="scala.include"/>
</classfileset>
<classfileset dir="${scala.private.test.classes}" erroronmissingdir="false">
<include name="**/@{testtype}*.class" if="include.scala.private"/>
</classfileset>
<xmlfileset dir="${basedir}">
<include name="@{testtype}" if="run.failed.tests"/>
</xmlfileset>
</testng>
<!-- generate a report for Bamboo or Hudson to read in -->
<junitreport todir="${report}/@{testtype}">
<fileset dir="${report}/@{testtype}">
<junitreport todir="@{outputdir}">
<fileset dir="@{outputdir}">
<include name="*/*.xml"/>
</fileset>
<report format="noframes" todir="${report}/@{testtype}"/>
<report format="noframes" todir="@{outputdir}"/>
</junitreport>
<fail message="test failed" if="test.failure" />
</sequential>
</macrodef>
<!-- FAILED-TEST -->
<macrodef name="run-failed-test">
<attribute name="xmlfailedtestfile" />
<sequential>
<!-- Get the pipeline run type. Default to dry. -->
<condition property="pipeline.run" value="dry" else="${pipeline.run}">
<equals arg1="${pipeline.run}" arg2="$${pipeline.run}" />
</condition>
<condition property="cofoja.jvm.args" value="-javaagent:${cofoja.jar} -Dcom.google.java.contract.log.contract=false" else="">
<isset property="include.contracts" />
</condition>
<mkdir dir="${report}/failed_rerun" />
<echo message="Sting: Running @{xmlfailedtestfile} test cases!"/>
<taskdef resource="testngtasks" classpath="${lib.dir}/testng-5.14.1.jar"/>
<testng outputDir="${report}/failed_rerun"
haltOnFailure="false" failureProperty="test.failure"
verbose="2"
workingDir="${basedir}"
useDefaultListeners="false"
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.StingTextReporter">
<jvmarg value="-Xmx${test.maxmemory}" />
<jvmarg value="-Djava.awt.headless=true" />
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
<jvmarg line="${cofoja.jvm.args}"/>
<!-- <jvmarg value="-Xdebug"/> -->
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/> -->
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.classes}" />
<pathelement location="${scala.classes}" />
<pathelement location="${java.contracts}" />
<pathelement location="${java.test.classes}" />
<pathelement location="${scala.test.classes}" />
</classpath>
<xmlfileset dir="${basedir}" includes="@{xmlfailedtestfile}" />
</testng>
<fail message="test failed" if="test.failure" />
</sequential>
</macrodef>
<!-- our three different test conditions: Test, IntegrationTest, PerformanceTest -->
<target name="test" depends="test.compile" description="Run unit tests">
<target name="alltests">
<antcall target="test" inheritAll="false"/>
<antcall target="integrationtest" inheritAll="false"/>
<antcall target="pipelinetest" inheritAll="false"/>
</target>
<target name="alltests.public">
<antcall target="test.public" inheritAll="false"/>
<antcall target="integrationtest.public" inheritAll="false"/>
<antcall target="pipelinetest.public" inheritAll="false"/>
</target>
<!-- Our four different test conditions: Test, IntegrationTest, PerformanceTest, PipelineTest -->
<target name="test" depends="init.buildall,test.compile" description="Run unit tests">
<condition property="ttype" value="*UnitTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${ttype}"/>
<run-test testtype="${ttype}" outputdir="${report}/${ttype}" runfailed="false"/>
</target>
<target name="integrationtest" depends="test.compile" description="Run integration tests">
<target name="test.public" depends="init.buildpublic,test"/>
<target name="integrationtest" depends="init.buildall,test.compile" description="Run integration tests">
<condition property="itype" value="*IntegrationTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${itype}"/>
<run-test testtype="${itype}" outputdir="${report}/${itype}" runfailed="false"/>
</target>
<target name="performancetest" depends="test.compile" description="Run performance tests">
<target name="integrationtest.public" depends="init.buildpublic,integrationtest"/>
<target name="performancetest" depends="init.buildall,test.compile" description="Run performance tests">
<condition property="ptype" value="*PerformanceTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${ptype}"/>
<run-test testtype="${ptype}" outputdir="${report}/${ptype}" runfailed="false"/>
</target>
<target name="pipelinetest" depends="test.compile" description="Run pipeline tests">
<target name="performancetest.public" depends="init.buildpublic,performancetest" />
<target name="pipelinetest" depends="init.buildall,test.compile" description="Run pipeline tests">
<condition property="pipetype" value="*PipelineTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${pipetype}"/>
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
</target>
<target name="pipelinetestrun" depends="test.compile" description="Run pipeline tests">
<target name="pipelinetest.public" depends="init.buildpublic,pipelinetest" />
<target name="pipelinetestrun" depends="init.buildall,test.compile" description="Run pipeline tests">
<property name="pipeline.run" value="run"/>
<condition property="pipetype" value="*PipelineTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${pipetype}"/>
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
</target>
<target name="pipelinetestrun.public" depends="init.buildpublic,pipelinetestrun" />
<target name="failed-test" depends="init.buildall,test.compile">
<run-test testtype="${report}/*UnitTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
</target>
<target name="failed-test" depends="test.compile">
<run-failed-test xmlfailedtestfile="${report}/*UnitTest/testng-failed.xml" />
<target name="failed-integration" depends="init.buildall,test.compile">
<run-test testtype="${report}/*IntegrationTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
</target>
<target name="failed-integration" depends="test.compile">
<run-failed-test xmlfailedtestfile="${report}/*IntegrationTest/testng-failed.xml" />
<target name="failed-performance" depends="init.buildall,test.compile">
<run-test testtype="${report}/*PerformanceTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
</target>
<target name="failed-performance" depends="test.compile">
<run-failed-test xmlfailedtestfile="${report}/*PerformanceTest/testng-failed.xml" />
</target>
<target name="failed-pipeline" depends="test.compile">
<run-failed-test xmlfailedtestfile="${report}/*PipelineTest/testng-failed.xml" />
<target name="failed-pipeline" depends="init.buildall,test.compile">
<run-test testtype="${report}/*PipelineTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
</target>
<!-- ******************************************************************************** -->

View File

@ -15,10 +15,8 @@
<!-- Tribble -->
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
<dependency org="log4j" name="log4j" rev="1.2.15">
<!-- Don't include javax.mail here in default, only used in scala->default by commons-email -->
<exclude org="javax.mail" />
</dependency>
<dependency org="log4j" name="log4j" rev="1.2.15"/>
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
<dependency org="colt" name="colt" rev="1.2.0"/>
<dependency org="jboss" name="javassist" rev="3.7.ga"/>
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>

View File

@ -12,14 +12,14 @@ if ( onCMDLine ) {
inputFileName = args[1]
outputPDF = args[2]
} else {
#inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
outputPDF = NA
}
RUNTIME_UNITS = "(sec)"
ORIGINAL_UNITS_TO_SECONDS = 1/1000
RUNTIME_UNITS = "(hours)"
ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60
#
# Helper function to aggregate all of the jobs in the report across all tables
@ -33,7 +33,7 @@ allJobsFromReport <- function(report) {
#
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
#
plotJobsGantt <- function(gatkReport, sortOverall) {
plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
allJobs = allJobsFromReport(gatkReport)
if ( sortOverall ) {
title = "All jobs, by analysis, by start time"
@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) {
}
allJobs$index = 1:nrow(allJobs)
minTime = min(allJobs$startTime)
allJobs$relStartTime = allJobs$startTime - minTime
allJobs$relDoneTime = allJobs$doneTime - minTime
allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
maxRelTime = max(allJobs$relDoneTime)
p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + theme_bw()
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
if ( includeText )
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + xlim(0, maxRelTime * 1.1)
p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
p <- p + ylab("Job")
p <- p + ylab("Job number")
p <- p + opts(title=title)
print(p)
}
@ -140,6 +142,8 @@ print(paste("Project :", inputFileName))
convertUnits <- function(gatkReportData) {
convertGroup <- function(g) {
g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS
g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS
g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS
g
}
lapply(gatkReportData, convertGroup)
@ -155,8 +159,8 @@ if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}
plotJobsGantt(gatkReportData, T)
plotJobsGantt(gatkReportData, F)
plotJobsGantt(gatkReportData, T, F)
plotJobsGantt(gatkReportData, F, F)
plotProgressByTime(gatkReportData)
for ( group in gatkReportData ) {
plotGroup(group)

View File

@ -114,7 +114,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
private String OUTPUT_DIR = "analyzeCovariates/";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/software/free/Linux/redhat_5_x86_64/pkgs/r_2.12.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "public/R/";

View File

@ -379,7 +379,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
if ( tribbleType == null )
if ( ! file.canRead() | !! file.isFile() ) {
if ( ! file.canRead() | ! file.isFile() ) {
throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
} else {
throw new UserException.CommandLineException(

View File

@ -929,6 +929,14 @@ public class GenomeAnalysisEngine {
return readsDataSource.getHeader(reader);
}
/**
 * Gets the master sequence dictionary for this GATK engine instance.
 * The dictionary is taken from the engine's reference data source, so it reflects
 * the reference the engine was configured with.
 * @return a never-null dictionary listing all of the contigs known to this engine instance
 */
public SAMSequenceDictionary getMasterSequenceDictionary() {
return getReferenceDataSource().getReference().getSequenceDictionary();
}
/**
* Returns data source object encapsulating all essential info and handlers used to traverse
* reads; header merger, individual file readers etc can be accessed through the returned data source object.

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@ -84,7 +85,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
if(currentLocus == GenomeLoc.UNMAPPED) {
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan());
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
currentLocus = null;
continue;
}

View File

@ -215,6 +215,45 @@ public class GATKBAMIndex {
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
}
/**
 * Used to get close to the unmapped reads at the end of a BAM file.
 * Walks the on-disk BAM index sequentially — for each reference sequence it skips
 * over all bin/chunk data, then records the last linear-index entry, keeping the
 * final one seen across all sequences.
 * @return The file offset of the first record in the last linear bin, or -1
 * if there are no elements in linear bins (i.e. no mapped reads).
 */
public long getStartOfLastLinearBin() {
openIndexFile();
// Seek past the 4-byte header at the start of the index; the sequence count follows it.
seek(4);
final int sequenceCount = readInteger();
// Because no reads may align to the last sequence in the sequence dictionary,
// grab the last element of the linear index for each sequence, and return
// the last one from the last sequence that has one.
long lastLinearIndexPointer = -1;
for (int i = 0; i < sequenceCount; i++) {
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
// System.out.println("# nBins: " + nBins);
for (int j1 = 0; j1 < nBins; j1++) {
// Skip bin #
skipBytes(4);
final int nChunks = readInteger();
// Skip chunks (each chunk is a 16-byte begin/end offset pair)
skipBytes(16 * nChunks);
}
final int nLinearBins = readInteger();
if (nLinearBins > 0) {
// Skip to last element of list of linear bins (8 bytes per entry)
skipBytes(8 * (nLinearBins - 1));
lastLinearIndexPointer = readLongs(1)[0];
}
}
closeIndexFile();
return lastLinearIndexPointer;
}
/**
* Gets the possible number of bins for a given reference sequence.
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.

View File

@ -59,7 +59,7 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
*/
public FilePointer next() {
FilePointer current = wrappedIterator.next();
while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
current = current.combine(parser,wrappedIterator.next());
return current;
}

View File

@ -134,24 +134,11 @@ public class ReadShardStrategy implements ShardStrategy {
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
while(selectedReaders.size() == 0 && currentFilePointer != null) {
shardPosition = currentFilePointer.fileSpans;
for(SAMReaderID id: shardPosition.keySet()) {
// If the region contains location information (in other words, it is not at
// the start of the unmapped region), add the region.
if(currentFilePointer.isRegionUnmapped) {
// If the region is unmapped and no location data exists, add a null as an indicator to
// start at the next unmapped region.
if(!isIntoUnmappedRegion) {
selectedReaders.put(id,null);
isIntoUnmappedRegion = true;
}
else
selectedReaders.put(id,position.get(id));
}
else {
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
if(!fileSpan.isEmpty())
selectedReaders.put(id,fileSpan);
}
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
if(!fileSpan.isEmpty())
selectedReaders.put(id,fileSpan);
}
if(selectedReaders.size() > 0) {

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.examples;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -59,6 +60,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
* @author Your Name
* @since Date created
*/
@Hidden
public class GATKDocsExample extends RodWalker<Integer, Integer> {
/**
* Put detailed documentation about the argument here. No need to duplicate the summary information

View File

@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
* @version 0.1
*/
public class PlatformFilter extends ReadFilter {
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false)
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false)
protected String[] PLFilterNames;
public boolean filterOut(SAMRecord rec) {

View File

@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
else if ( stub.getOutputStream() != null ) {
this.file = null;
this.stream = stub.getOutputStream();
writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes());
writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes());
}
else
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
}
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.io.stubs;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
return isCompressed;
}
/**
 * Gets the master sequence dictionary from the engine associated with this stub.
 * @see GenomeAnalysisEngine#getMasterSequenceDictionary()
 * @return the engine's master sequence dictionary
 */
public SAMSequenceDictionary getMasterSequenceDictionary() {
return engine.getMasterSequenceDictionary();
}
/**
* Should we tell the VCF writer not to write genotypes?
* @return true if the writer should not write genotypes.

View File

@ -293,15 +293,16 @@ public class GATKRunReport {
* That is, postReport() is guaranteed not to fail for any reason.
*/
private File postReportToLocalDisk(File rootDir) {
String filename = getID() + ".report.xml.gz";
File file = new File(rootDir, filename);
try {
String filename = getID() + ".report.xml.gz";
File file = new File(rootDir, filename);
postReportToFile(file);
logger.debug("Wrote report to " + file);
return file;
} catch ( Exception e ) {
// we catch everything, and no matter what eat the error
exceptDuringRunReport("Couldn't read report file", e);
file.delete();
return null;
}
}
@ -312,6 +313,7 @@ public class GATKRunReport {
File localFile = postReportToLocalDisk(new File("./"));
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
if ( localFile != null ) { // we succeeded in creating the local file
localFile.deleteOnExit();
try {
// stop us from printing the annoying, and meaningless, mime types warning
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
@ -336,14 +338,13 @@ public class GATKRunReport {
//logger.info("Uploading " + localFile + " to AWS bucket");
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
logger.debug("Uploaded to AWS: " + s3Object);
logger.info("Uploaded run statistics report to AWS S3");
} catch ( S3ServiceException e ) {
exceptDuringRunReport("S3 exception occurred", e);
} catch ( NoSuchAlgorithmException e ) {
exceptDuringRunReport("Couldn't calculate MD5", e);
} catch ( IOException e ) {
exceptDuringRunReport("Couldn't read report file", e);
} finally {
localFile.delete();
}
}
}

View File

@ -101,7 +101,7 @@ public class RMDIndexer extends CommandLineProgram {
Index index = IndexFactory.createIndex(inputFileSource, codec, approach);
// add writing of the sequence dictionary, if supplied
builder.setIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary(), indexFile, false);
builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary());
// create the output stream, and write the index
LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.refdata.tracks;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.index.Index;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
 * Utilities for working with sequence dictionaries embedded in Tribble indices.
 *
 * Dictionary entries are stored in the index property list as
 * "DICT:&lt;contigName&gt;" -&gt; "&lt;contigLength&gt;" pairs.
 */
public class IndexDictionaryUtils {
    private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class);

    // a constant we use for marking sequence dictionary entries in the Tribble index property list
    public static final String SequenceDictionaryPropertyPredicate = "DICT:";

    /**
     * Get the sequence dictionary from the index property list, if available.
     *
     * @param index the index to read properties from
     * @return a SAMSequenceDictionary built from the "DICT:" properties; empty (never null)
     *         when the index carries no dictionary entries
     */
    public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
        final SAMSequenceDictionary dict = new SAMSequenceDictionary();
        for (Map.Entry<String, String> entry : index.getProperties().entrySet()) {
            if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) {
                // strip the "DICT:" prefix to recover the contig name; the property value is its length
                final String contig = entry.getKey().substring(SequenceDictionaryPropertyPredicate.length());
                dict.addSequence(new SAMSequenceRecord(contig, Integer.valueOf(entry.getValue())));
            }
        }
        return dict;
    }

    /**
     * Create a sequence dictionary from the contig list that is always present in the index;
     * a backup approach when no explicit dictionary is stored. Contig lengths are unknown
     * from the contig list alone, so each sequence is recorded with length 0.
     *
     * @param index the index to read contig names from
     * @param dict  the sequence dictionary to add contigs to
     * @return the filled-in sequence dictionary (the same object passed in)
     */
    static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
        final LinkedHashSet<String> seqNames = index.getSequenceNames();
        if (seqNames == null) {
            return dict;
        }
        for (String name : seqNames) {
            dict.addSequence(new SAMSequenceRecord(name, 0));
        }
        return dict;
    }

    /**
     * Store the given sequence dictionary in the index property list, one
     * "DICT:&lt;name&gt;" -&gt; "&lt;length&gt;" property per contig.
     *
     * @param index the index to update
     * @param dict  the sequence dictionary to record
     */
    public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) {
        for ( SAMSequenceRecord seq : dict.getSequences() ) {
            final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName();
            final String length = String.valueOf(seq.getSequenceLength());
            index.addProperty(contig, length);
        }
    }

    /**
     * Validate the track's sequence dictionary against the reference's.
     * Tracks without an embedded dictionary (null or empty) are skipped with an info message.
     *
     * @param trackName               name of the track, for log/error messages
     * @param trackDict               the track's dictionary; may be null or empty
     * @param referenceDict           the reference dictionary to validate against
     * @param validationExclusionType exclusions that may relax validation
     */
    public static void validateTrackSequenceDictionary(final String trackName,
                                                       final SAMSequenceDictionary trackDict,
                                                       final SAMSequenceDictionary referenceDict,
                                                       final ValidationExclusion.TYPE validationExclusionType ) {
        // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
        if (trackDict == null || trackDict.size() == 0)
            logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
        else {
            // dead code removed: a TreeSet of track sequence names was built here but never used
            SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
        }
    }
}

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.refdata.tracks;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureSource;
@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
*
* @author aaron
*
* @author aaron
*
* Class RMDTrackBuilder
*
@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false;
// a constant we use for marking sequence dictionary entries in the Tribble index property list
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
// private sequence dictionary we use to set our tracks with
private SAMSequenceDictionary dict = null;
@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); }
catch (ReviewedStingException e) { }
sequenceDictionary = getSequenceDictionaryFromProperties(index);
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
if (sequenceDictionary.size() == 0 && dict != null) {
File indexFile = Tribble.indexFile(inputFile);
setIndexSequenceDictionary(inputFile,index,dict,indexFile,true);
sequenceDictionary = getSequenceDictionaryFromProperties(index);
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
try { // re-write the index
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
} catch (IOException e) {
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
}
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
}
if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE )
@ -363,88 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
// this can take a while, let them know what we're doing
logger.info("Creating Tribble index in memory for file " + inputFile);
Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
setIndexSequenceDictionary(inputFile, idx, dict, null, false);
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict);
return idx;
}
// ---------------------------------------------------------------------------------------------------------
// static functions to work with the sequence dictionaries of indexes
// ---------------------------------------------------------------------------------------------------------
/**
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
* @param index the index file to use
* @return a SAMSequenceDictionary if available, null if unavailable
*/
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
SAMSequenceDictionary dict = new SAMSequenceDictionary();
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
Integer.valueOf(entry.getValue())));
}
return dict;
}
/**
* create the sequence dictionary with the contig list; a backup approach
* @param index the index file to use
* @param dict the sequence dictionary to add contigs to
* @return the filled-in sequence dictionary
*/
private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
LinkedHashSet<String> seqNames = index.getSequenceNames();
if (seqNames == null) {
return dict;
}
for (String name : seqNames) {
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
dict.addSequence(seq);
}
return dict;
}
/**
* set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible.
* (that each contig in the index is in the sequence dictionary).
* @param inputFile for proper error message formatting.
* @param dict the sequence dictionary
* @param index the index file
* @param indexFile the index file
* @param rewriteIndex should we rewrite the index when we're done?
*
*/
public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) {
if (dict == null) return;
SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict);
public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) {
if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null");
// check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set
for (SAMSequenceRecord seq : currentDict.getSequences()) {
if (dict.getSequence(seq.getSequenceName()) == null)
continue;
index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength()));
}
// re-write the index
if (rewriteIndex) try {
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
} catch (IOException e) {
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
}
final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict);
// actually update the dictionary in the index
IndexDictionaryUtils.setIndexSequenceDictionary(index, dict);
}
public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) {
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
if (trackDict == null || trackDict.size() == 0)
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
else {
Set<String> trackSequences = new TreeSet<String>();
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
trackSequences.add(dictionaryEntry.getSequenceName());
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
}
public void validateTrackSequenceDictionary(final String trackName,
final SAMSequenceDictionary trackDict,
final SAMSequenceDictionary referenceDict ) {
IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType);
}
}

View File

@ -24,12 +24,14 @@
package org.broadinstitute.sting.gatk.report;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.*;
/**
* Tracks a linked list of GATKReportColumn in order by name.
*/
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> {
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> implements Iterable<GATKReportColumn> {
private List<String> columnNames = new ArrayList<String>();
/**
@ -52,4 +54,14 @@ public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> {
columnNames.add(key);
return super.put(key, value);
}
/**
 * Returns an iterator over the columns in the order recorded in columnNames,
 * delegating element lookup to getByIndex.
 *
 * @return a read-only iterator over the columns; remove() always throws
 *         UnsupportedOperationException
 */
@Override
public Iterator<GATKReportColumn> iterator() {
    return new Iterator<GATKReportColumn>() {
        int offset = 0; // index of the next column to return
        public boolean hasNext() { return offset < columnNames.size() ; }
        public GATKReportColumn next() { return getByIndex(offset++); }
        public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKReportColumn iterator"); }
    };
}
}

View File

@ -286,6 +286,10 @@ public class GATKReportTable {
}
}
/**
 * Checks whether a row with the given primary key exists in this table.
 *
 * @param primaryKey the primary key value to look up
 * @return true if the primary key column contains the given key
 */
public boolean containsKey(Object primaryKey) {
    return primaryKeyColumn.contains(primaryKey);
}
/**
* Set the value for a given position in the table
*

View File

@ -358,7 +358,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
public void printOnTraversalDone() {
printProgress(null, null, true);
final double elapsed = timer.getElapsedTime();
final double elapsed = timer == null ? 0 : timer.getElapsedTime();
ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics();

View File

@ -26,21 +26,23 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import java.io.PrintStream;
import java.util.Iterator;
/**
* Prints out all of the RODs in the input data set. Data is rendered using the toString() method
* of the given ROD.
*/
public class PrintRODsWalker extends RodWalker<Integer, Integer> {
@Input(fullName="input", shortName = "input", doc="The input ROD which should be printed out.", required=true)
public RodBinding<Feature> input;
@Output
PrintStream out;
@ -62,7 +64,7 @@ public class PrintRODsWalker extends RodWalker<Integer, Integer> {
if ( tracker == null )
return 0;
for ( Feature feature : tracker.getValues(Feature.class) ) {
for ( Feature feature : tracker.getValues(Feature.class, context.getLocation()) ) {
out.println(feature.toString());
}

View File

@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
* -I input1.bam \
* -I input2.bam \
* --read_filter MappingQualityZero
*
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T PrintReads \
* -o output.bam \
* -I input.bam \
* -n 2000
* </pre>
*
*/

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -77,6 +78,15 @@ public abstract class Walker<MapType, ReduceType> {
return toolkit;
}
/**
 * Gets the master sequence dictionary for this walker.
 *
 * @see GenomeAnalysisEngine#getMasterSequenceDictionary
 * @return the engine's master sequence dictionary
 */
protected SAMSequenceDictionary getMasterSequenceDictionary() {
    return getToolkit().getMasterSequenceDictionary();
}
/**
* (conceptual static) method that states whether you want to see reads piling up at a locus
* that contain a deletion at the locus.

View File

@ -43,6 +43,9 @@ import java.util.List;
import java.util.Map;
/**
 * The allele balance (fraction of ref bases over ref + alt bases) across all biallelic het-called samples
*/
public class AlleleBalance extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -16,6 +16,9 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
 * The allele balance (fraction of ref bases over ref + alt bases) separately for each biallelic het-called sample
*/
public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {

View File

@ -6,8 +6,9 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
import java.util.Map;
/**
* Abstract base class for all annotations that are normalized by depth
*/
public abstract class AnnotationByDepth extends InfoFieldAnnotation {

View File

@ -47,6 +47,9 @@ import java.util.List;
import java.util.Map;
/**
* Count of A, C, G, T bases across all samples
*/
public class BaseCounts extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -13,6 +13,9 @@ import java.util.LinkedHashMap;
import java.util.List;
/**
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele)
*/
public class BaseQualityRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }

View File

@ -44,6 +44,11 @@ import java.util.List;
import java.util.Map;
/**
* Allele count in genotypes, for each ALT allele, in the same order as listed;
* allele Frequency, for each ALT allele, in the same order as listed; total number
* of alleles in called genotypes.
*/
public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation {
private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY };

View File

@ -16,7 +16,23 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Total (unfiltered) depth over all samples.
*
* This and AD are complementary fields that are two important ways of thinking about the depth of the data for this sample
 * at this site. The DP field describes the total depth of reads that passed the Unified Genotyper's internal
* quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
* The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
 * the reads. If, for example, I believe there really is an A/T polymorphism at a site, then I would like
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
* normally be excluded from the statistical calculations going into GQ and QUAL.
*
* Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with
* -dcov D is N * D
*/
public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -23,6 +23,25 @@ import java.util.List;
import java.util.Map;
/**
* The depth of coverage of each VCF allele in this sample.
*
* This and DP are complementary fields that are two important ways of thinking about the depth of the data for this sample
 * at this site. The DP field describes the total depth of reads that passed the Unified Genotyper's internal
* quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
* The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
 * the reads. If, for example, I believe there really is an A/T polymorphism at a site, then I would like
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
* normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
* the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that
* are actually present and correctly left-aligned in the alignments themselves). Because of this fact and
* because the AD includes reads and bases that were filtered by the Unified Genotyper, <b>one should not base
* assumptions about the underlying genotype based on it</b>; instead, the genotype likelihoods (PLs) are what
* determine the genotype calls (see below).
*/
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
private static String REF_ALLELE = "REF";

View File

@ -43,6 +43,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
 * being seen on only the forward or only the reverse strand) in the reads. More bias is
* indicative of false positive calls.
*/
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation {
private static final String FS = "FS";
private static final double MIN_PVALUE = 1E-320;

View File

@ -17,6 +17,9 @@ import java.util.List;
import java.util.Map;
/**
* The GC content (# GC bases / # all bases) of the reference within 50 bp +/- this site
*/
public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -34,12 +34,12 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
@ -49,6 +49,10 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* Consistency of the site with two (and only two) segregating haplotypes. Higher scores
* are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls.
*/
public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation {
private final static boolean DEBUG = false;
private final static int MIN_CONTEXT_WING_SIZE = 10;

View File

@ -19,6 +19,9 @@ import java.util.List;
import java.util.Map;
/**
* Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium
*/
public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation {
private static final int MIN_SAMPLES = 10;

View File

@ -16,7 +16,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Largest contiguous homopolymer run of the variant allele in either direction on the reference.
*/
public class HomopolymerRun extends InfoFieldAnnotation implements StandardAnnotation {
private boolean ANNOTATE_INDELS = true;

View File

@ -17,14 +17,15 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 5/16/11
*/
// A set of annotations calculated directly from the GLs
public class GLstats extends InfoFieldAnnotation implements StandardAnnotation {
/**
* Likelihood-based (using PL field) test for the inbreeding among samples.
*
* A continuous generalization of the Hardy-Weinberg test for disequilibrium that works
* well with limited coverage per sample. See the 1000 Genomes Phase I release for
* more information.
*/
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation {
private static final int MIN_SAMPLES = 10;

View File

@ -14,11 +14,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: Mar 11, 2011
* Time: 11:47:33 AM
* To change this template use File | Settings | File Templates.
* Rough category of indel type (insertion, deletion, multi-allelic, other)
*/
public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnotation {

View File

@ -17,6 +17,9 @@ import java.util.List;
import java.util.Map;
/**
 * Triplet annotation: fraction of MAPQ == 0, MAPQ < 10, and count of all mapped reads
*/
public class LowMQ extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -14,6 +14,9 @@ import java.util.LinkedHashMap;
import java.util.List;
/**
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
*/
public class MappingQualityRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("MQRankSum"); }

View File

@ -19,6 +19,9 @@ import java.util.List;
import java.util.Map;
/**
* Total count across all samples of mapping quality zero reads
*/
public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -1,85 +1,81 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Per-sample count of reads in the pileup with mapping quality zero,
 * emitted under the VCFConstants.MAPPING_QUALITY_ZERO_KEY format key.
 */
public class MappingQualityZeroBySample extends GenotypeAnnotation {

    /**
     * Compute the per-sample MQ0 count for the given genotype.
     *
     * @param tracker reference metadata tracker (not used by this annotation)
     * @param walker  the calling walker
     * @param ref     the reference context
     * @param context the per-sample alignment context supplying the pileup
     * @param vc      the variant context; used to choose the indel vs. base pileup
     * @param g       the genotype being annotated; uncalled or missing genotypes are skipped
     * @return a map from the MQ0 key to the formatted count, or null when the genotype
     *         is uncalled or no usable pileup is available
     */
    public Map<String, Object> annotate(RefMetaDataTracker tracker,
            AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) {
        if ( g == null || !g.isCalled() )
            return null;

        int mq0 = 0;
        ReadBackedPileup pileup = null;
        // indels use the extended-event pileup; otherwise fall back to the base pileup
        if (vc.isIndel() && context.hasExtendedEventPileup())
            pileup = context.getExtendedEventPileup();
        else if (context.hasBasePileup())
            pileup = context.getBasePileup();
        else return null;

        if (pileup != null) {
            for (PileupElement p : pileup ) {
                if ( p.getMappingQual() == 0 )
                    mq0++;
            }
        }
        Map<String, Object> map = new HashMap<String, Object>();
        map.put(getKeyNames().get(0), String.format("%d", mq0));
        return map;
    }

    public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); }

    public List<VCFFormatHeaderLine> getDescriptions() { return Arrays.asList(
            new VCFFormatHeaderLine(getKeyNames().get(0), 1,
                    VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample")); }
}
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Count for each sample of mapping quality zero reads
*/
public class MappingQualityZeroBySample extends GenotypeAnnotation {

    /**
     * Counts the reads with mapping quality zero in this sample's pileup.
     *
     * @return a single-entry map { MQ0 key -> count }, or null when the genotype
     *         is missing/uncalled or no usable pileup is available
     */
    public Map<String, Object> annotate(RefMetaDataTracker tracker,
                                        AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) {
        if ( g == null || !g.isCalled() )
            return null;

        // Indels are counted from the extended-event pileup; everything else
        // from the regular base pileup.
        final ReadBackedPileup pileup;
        if ( vc.isIndel() && context.hasExtendedEventPileup() )
            pileup = context.getExtendedEventPileup();
        else if ( context.hasBasePileup() )
            pileup = context.getBasePileup();
        else
            return null;

        int mq0 = 0;
        for ( PileupElement p : pileup ) {
            if ( p.getMappingQual() == 0 )
                mq0++;
        }

        Map<String, Object> map = new HashMap<String, Object>();
        // BUGFIX (type consistency): report the count as an Integer. The FORMAT
        // header line declared in getDescriptions() has Type=Integer, but the
        // previous code stored String.format("%d", mq0), a String value.
        map.put(getKeyNames().get(0), mq0);
        return map;
    }

    public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); }

    public List<VCFFormatHeaderLine> getDescriptions() {
        return Arrays.asList(
            new VCFFormatHeaderLine(getKeyNames().get(0), 1,
                VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample"));
    }
}

View File

@ -17,8 +17,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Fraction of all reads across samples that have mapping quality zero
*/
public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -17,11 +17,8 @@ import java.util.List;
import java.util.Map;
/**
 * The number of N bases, counting only SOLiD data.
 *
 * @author rpoplin
 */
public class NBaseCount extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if( stratifiedContexts.size() == 0 )

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -15,7 +16,11 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Variant confidence (given as (AB+BB)/AA from the PLs) / unfiltered depth.
*
* Low scores are indicative of false positive calls and artifacts.
*/
public class QualByDepth extends AnnotationByDepth implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -21,6 +21,9 @@ import java.util.List;
import java.util.Map;
/**
* Root Mean Square of the mapping quality of the reads across all samples.
*/
public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -21,7 +21,9 @@ import java.util.List;
import java.util.Map;
/**
* Abstract root for all RankSum based annotations
*/
public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation {
static final double INDEL_LIKELIHOOD_THRESH = 0.1;
static final boolean DEBUG = false;

View File

@ -1,209 +1,207 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Per-sample read depth (DP, including MQ0 reads) and allelic fraction (FA,
 * computed over non-MQ0 reads) for SNP and indel calls.
 *
 * @author asivache
 */
public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation {

    // NOTE(review): this constant is never referenced in the visible class body.
    private static String REF_ALLELE = "REF";

    private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time

    /**
     * Computes the per-sample DP (total read depth, including MQ0 reads) and FA
     * (fraction of non-MQ0 reads supporting each alternate allele) annotations.
     *
     * @return annotation map (DP always; FA only when computable), or null for
     *         uncalled/missing genotypes and variant types other than SNP/indel
     */
    public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref,
                                        AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
        if ( g == null || !g.isCalled() )
            return null;

        if ( vc.isSNP() )
            return annotateSNP(stratifiedContext, vc);
        if ( vc.isIndel() )
            return annotateIndel(stratifiedContext, vc);

        return null;
    }

    // Computes DP/FA from the base pileup at a SNP site.
    private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {

        if ( ! stratifiedContext.hasBasePileup() ) return null;

        // Keyed by the first (only) base of each alternate allele.
        HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
        for ( Allele allele : vc.getAlternateAlleles() )
            alleleCounts.put(allele.getBases()[0], 0);

        ReadBackedPileup pileup = stratifiedContext.getBasePileup();
        int totalDepth = pileup.size();

        Map<String, Object> map = new HashMap<String, Object>();
        map.put(getKeyNames().get(0), totalDepth); // put total depth in right away

        if ( totalDepth == 0 ) return map; // done, can not compute FA at 0 coverage!!

        int mq0 = 0; // number of reads that are actually mq0
        for ( PileupElement p : pileup ) {
            if ( p.getMappingQual() == 0 ) {
                mq0++;
                continue;
            }
            if ( alleleCounts.containsKey(p.getBase()) ) // non-mq0 read and it's an alt
                alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
        }

        if ( mq0 == totalDepth ) return map; // if all reads are mq0, there is nothing left to do

        // we need to add counts in the correct order (alternate-allele order)
        String[] fracs = new String[alleleCounts.size()];
        for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
            fracs[i] = String.format("%.3f", ((float)alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))/(totalDepth-mq0));
        }

        map.put(getKeyNames().get(1), fracs);
        return map;
    }

    // Computes DP/FA from the extended-event pileup at an indel site.
    private Map<String,Object> annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) {
        if ( ! stratifiedContext.hasExtendedEventPileup() ) {
            return null;
        }

        ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup();
        if ( pileup == null )
            return null;

        int totalDepth = pileup.size();
        Map<String, Object> map = new HashMap<String, Object>();
        map.put(getKeyNames().get(0), totalDepth); // put total depth in right away

        if ( totalDepth == 0 ) return map;

        int mq0 = 0; // number of reads that are actually mq0
        HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
        Allele refAllele = vc.getReference();

        for ( Allele allele : vc.getAlternateAlleles() ) {
            if ( allele.isNoCall() ) {
                continue; // this does not look so good, should we die???
            }
            alleleCounts.put(getAlleleRepresentation(allele), 0);
        }

        for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) {
            if ( e.getMappingQual() == 0 ) {
                mq0++;
                continue;
            }
            if ( e.isInsertion() ) {
                final String b = e.getEventBases();
                if ( alleleCounts.containsKey(b) ) {
                    alleleCounts.put(b, alleleCounts.get(b)+1);
                }
            } else {
                if ( e.isDeletion() ) {
                    if ( e.getEventLength() == refAllele.length() ) {
                        // this is indeed the deletion allele recorded in VC
                        final String b = DEL;
                        if ( alleleCounts.containsKey(b) ) {
                            alleleCounts.put(b, alleleCounts.get(b)+1);
                        }
                    }
                    // else {
                    //    System.out.print(" deletion of WRONG length found");
                    // }
                }
            }
        }

        if ( mq0 == totalDepth ) return map;

        // NOTE(review): if any alternate allele was a no-call, it was skipped
        // above, so fracs (sized by alleleCounts.size()) is shorter than the
        // alt-allele list and alleleCounts.get(...) below returns null —
        // potential AIOOBE/NPE. Confirm whether no-call alts can reach here.
        String[] fracs = new String[alleleCounts.size()];
        for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
            fracs[i] = String.format("%.3f",
                    ((float)alleleCounts.get(getAlleleRepresentation(vc.getAlternateAllele(i))))/(totalDepth-mq0));

        map.put(getKeyNames().get(1), fracs);
        //map.put(getKeyNames().get(0), counts);
        return map;
    }

    // Maps an allele to its key in the count map: deletions (null wrt the ref)
    // share the DEL key; insertions are keyed by their inserted bases.
    private String getAlleleRepresentation(Allele allele) {
        if ( allele.isNull() ) { // deletion wrt the ref
            return DEL;
        } else { // insertion, pass actual bases
            return allele.getBaseString();
        }
    }

    //  public String getIndelBases()

    public List<String> getKeyNames() { return Arrays.asList("DP","FA"); }

    public List<VCFFormatHeaderLine> getDescriptions() {
        return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0),
                                                     1,
                                                     VCFHeaderLineType.Integer,
                                                     "Total read depth per sample, including MQ0"),
                             new VCFFormatHeaderLine(getKeyNames().get(1),
                                                     VCFHeaderLineCount.UNBOUNDED,
                                                     VCFHeaderLineType.Float,
                                                     "Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample"));
    }
}
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Unsupported.
 *
 * Per-sample read depth (DP, including MQ0 reads) and allelic fraction (FA,
 * computed over non-MQ0 reads) for SNP and indel calls.
 */
@Hidden
public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation {

    // Constant, for speed: no need to create a key string for the deletion allele every time.
    // (The unused REF_ALLELE constant from the previous revision has been removed.)
    private static String DEL = "DEL";

    /**
     * Entry point: dispatches to the SNP or indel implementation.
     *
     * @return annotation map (DP always; FA only when computable), or null for
     *         uncalled/missing genotypes and variant types other than SNP/indel
     */
    public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref,
                                        AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
        if ( g == null || !g.isCalled() )
            return null;

        if ( vc.isSNP() )
            return annotateSNP(stratifiedContext, vc);
        if ( vc.isIndel() )
            return annotateIndel(stratifiedContext, vc);

        return null;
    }

    // Computes DP/FA from the base pileup at a SNP site.
    private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {

        if ( ! stratifiedContext.hasBasePileup() ) return null;

        // Keyed by the first (only) base of each alternate allele.
        HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
        for ( Allele allele : vc.getAlternateAlleles() )
            alleleCounts.put(allele.getBases()[0], 0);

        ReadBackedPileup pileup = stratifiedContext.getBasePileup();
        int totalDepth = pileup.size();

        Map<String, Object> map = new HashMap<String, Object>();
        map.put(getKeyNames().get(0), totalDepth); // put total depth in right away

        if ( totalDepth == 0 ) return map; // done, cannot compute FA at 0 coverage!!

        int mq0 = 0; // number of reads that are actually mq0
        for ( PileupElement p : pileup ) {
            if ( p.getMappingQual() == 0 ) {
                mq0++;
                continue;
            }
            if ( alleleCounts.containsKey(p.getBase()) ) // non-mq0 read and it's an alt
                alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
        }

        if ( mq0 == totalDepth ) return map; // if all reads are mq0, there is nothing left to do

        // We need to add the fractions in alternate-allele order. Size by the
        // alt-allele count so the array always lines up with the loop below.
        String[] fracs = new String[vc.getAlternateAlleles().size()];
        for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
            fracs[i] = String.format("%.3f", ((float)alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))/(totalDepth-mq0));
        }

        map.put(getKeyNames().get(1), fracs);
        return map;
    }

    // Computes DP/FA from the extended-event pileup at an indel site.
    private Map<String,Object> annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) {
        if ( ! stratifiedContext.hasExtendedEventPileup() ) {
            return null;
        }

        ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup();
        if ( pileup == null )
            return null;

        int totalDepth = pileup.size();
        Map<String, Object> map = new HashMap<String, Object>();
        map.put(getKeyNames().get(0), totalDepth); // put total depth in right away

        if ( totalDepth == 0 ) return map;

        int mq0 = 0; // number of reads that are actually mq0
        HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
        Allele refAllele = vc.getReference();

        for ( Allele allele : vc.getAlternateAlleles() ) {
            if ( allele.isNoCall() ) {
                continue; // no-call alts are not counted; guarded below when reading back
            }
            alleleCounts.put(getAlleleRepresentation(allele), 0);
        }

        for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) {
            if ( e.getMappingQual() == 0 ) {
                mq0++;
                continue;
            }
            if ( e.isInsertion() ) {
                final String b = e.getEventBases();
                if ( alleleCounts.containsKey(b) ) {
                    alleleCounts.put(b, alleleCounts.get(b)+1);
                }
            } else if ( e.isDeletion() ) {
                // Only count deletions whose length matches the deletion allele recorded in the VC.
                if ( e.getEventLength() == refAllele.length() && alleleCounts.containsKey(DEL) ) {
                    alleleCounts.put(DEL, alleleCounts.get(DEL)+1);
                }
            }
        }

        if ( mq0 == totalDepth ) return map;

        // BUGFIX: size the array by the alt-allele count and guard against alleles
        // that were skipped above (no-calls). The previous code sized the array by
        // alleleCounts.size() and unboxed a possibly-null count, which could throw
        // ArrayIndexOutOfBoundsException or NullPointerException when a no-call
        // alternate allele was present.
        String[] fracs = new String[vc.getAlternateAlleles().size()];
        for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
            final Integer count = alleleCounts.get(getAlleleRepresentation(vc.getAlternateAllele(i)));
            fracs[i] = String.format("%.3f", (count == null ? 0 : count)/(float)(totalDepth-mq0));
        }

        map.put(getKeyNames().get(1), fracs);
        return map;
    }

    // Maps an allele to its key in the count map: deletions (null wrt the ref)
    // share the DEL key; insertions are keyed by their inserted bases.
    private String getAlleleRepresentation(Allele allele) {
        if ( allele.isNull() ) { // deletion wrt the ref
            return DEL;
        } else { // insertion, pass actual bases
            return allele.getBaseString();
        }
    }

    public List<String> getKeyNames() { return Arrays.asList("DP","FA"); }

    public List<VCFFormatHeaderLine> getDescriptions() {
        return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0),
                                                     1,
                                                     VCFHeaderLineType.Integer,
                                                     "Total read depth per sample, including MQ0"),
                             new VCFFormatHeaderLine(getKeyNames().get(1),
                                                     VCFHeaderLineCount.UNBOUNDED,
                                                     VCFHeaderLineType.Float,
                                                     "Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample"));
    }
}

View File

@ -19,11 +19,8 @@ import java.util.LinkedHashMap;
import java.util.List;
/**
 * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test
 * for the distance from the end of the read for reads with the alternate allele
 * (if the alternate allele is only seen near the ends of reads, this is indicative of error).
 *
 * @author rpoplin
 */
public class ReadPosRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("ReadPosRankSum"); }

View File

@ -15,8 +15,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* SB annotation value by depth of alt containing samples
*/
public class SBByDepth extends AnnotationByDepth {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
@ -26,7 +27,7 @@ public class SBByDepth extends AnnotationByDepth {
if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY))
return null;
double sBias = Double.valueOf(vc.getAttributeAsString(VCFConstants.STRAND_BIAS_KEY));
double sBias = vc.getAttributeAsDouble(VCFConstants.STRAND_BIAS_KEY, -1);
final Map<String, Genotype> genotypes = vc.getGenotypes();
if ( genotypes == null || genotypes.size() == 0 )

View File

@ -41,7 +41,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* List all of the samples in the info field
*/
public class SampleList extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -24,7 +24,9 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -32,10 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -46,134 +45,522 @@ import java.util.*;
* (http://snpeff.sourceforge.net/).
*
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
* output file (which must be provided on the command line via --snpEffFile:SnpEff <filename>),
* output file (which must be provided on the command line via --snpEffFile filename.vcf),
* and adds annotations on that effect.
*
* The possible biological effects and their associated impacts are defined in the class:
* org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants
*
* @author David Roazen
*/
public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation {
// SnpEff annotation key names:
public static final String GENE_ID_KEY = "GENE_ID";
public static final String GENE_NAME_KEY = "GENE_NAME";
public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID";
public static final String EXON_ID_KEY = "EXON_ID";
public static final String EXON_RANK_KEY = "EXON_RANK";
public static final String WITHIN_NON_CODING_GENE_KEY = "WITHIN_NON_CODING_GENE";
public static final String EFFECT_KEY = "EFFECT";
public static final String EFFECT_IMPACT_KEY = "EFFECT_IMPACT";
public static final String EFFECT_EXTRA_INFORMATION_KEY = "EFFECT_EXTRA_INFORMATION";
public static final String OLD_NEW_AA_KEY = "OLD_NEW_AA";
public static final String OLD_NEW_CODON_KEY = "OLD_NEW_CODON";
public static final String CODON_NUM_KEY = "CODON_NUM";
public static final String CDS_SIZE_KEY = "CDS_SIZE";
private static Logger logger = Logger.getLogger(SnpEff.class);
// We refuse to parse SnpEff output files generated by unsupported versions, or
// lacking a SnpEff version number in the VCF header:
public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd";
// When we write the SnpEff version number and command line to the output VCF, we change
// the key name slightly so that the output VCF won't be confused in the future for an
// output file produced by SnpEff directly:
public static final String OUTPUT_VCF_HEADER_VERSION_LINE_KEY = "Original" + SNPEFF_VCF_HEADER_VERSION_LINE_KEY;
public static final String OUTPUT_VCF_HEADER_COMMAND_LINE_KEY = "Original" + SNPEFF_VCF_HEADER_COMMAND_LINE_KEY;
// SnpEff aggregates all effects (and effect metadata) together into a single INFO
// field annotation with the key EFF:
public static final String SNPEFF_INFO_FIELD_KEY = "EFF";
public static final String SNPEFF_EFFECT_METADATA_DELIMITER = "[()]";
public static final String SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER = "\\|";
// Key names for the INFO field annotations we will add to each record, along
// with parsing-related information:
public enum InfoFieldKey {
    // NOTE(review): a fieldIndex of -1 appears to mean the value is not read from
    // a positional EFF subfield (the effect name and functional class are derived
    // elsewhere) — the parsing code is not visible here; confirm against the parser.
    // Subfield index 5 is skipped (GENE_BIOTYPE=4 jumps to TRANSCRIPT_ID=6);
    // presumably that subfield is intentionally unused — verify.
    EFFECT_KEY            ("SNPEFF_EFFECT", -1),
    IMPACT_KEY            ("SNPEFF_IMPACT", 0),
    CODON_CHANGE_KEY      ("SNPEFF_CODON_CHANGE", 1),
    AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
    GENE_NAME_KEY         ("SNPEFF_GENE_NAME", 3),
    GENE_BIOTYPE_KEY      ("SNPEFF_GENE_BIOTYPE", 4),
    TRANSCRIPT_ID_KEY     ("SNPEFF_TRANSCRIPT_ID", 6),
    EXON_ID_KEY           ("SNPEFF_EXON_ID", 7),
    FUNCTIONAL_CLASS_KEY  ("SNPEFF_FUNCTIONAL_CLASS", -1);

    // Actual text of the key as written into the output VCF INFO field.
    private final String keyName;

    // Index within the effect metadata subfields from the SnpEff EFF annotation
    // where each key's associated value can be found during parsing.
    private final int fieldIndex;

    InfoFieldKey ( String keyName, int fieldIndex ) {
        this.keyName = keyName;
        this.fieldIndex = fieldIndex;
    }

    /** The INFO-field key text for this annotation. */
    public String getKeyName() {
        return keyName;
    }

    /** The EFF-subfield index this key's value is parsed from, or -1 (see note above). */
    public int getFieldIndex() {
        return fieldIndex;
    }
}
// Possible SnpEff biological effects. All effect names found in the SnpEff input file
// are validated against this list.
public enum EffectType {
    // Each constant pairs the functional class it implies with a flag marking
    // whether the effect is a mere MODIFIER (non-impactful context annotation).

    // High-impact effects:
    FRAME_SHIFT (EffectFunctionalClass.NONE, false),
    STOP_GAINED (EffectFunctionalClass.NONSENSE, false),
    START_LOST (EffectFunctionalClass.NONE, false),
    SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false),
    SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false),
    EXON_DELETED (EffectFunctionalClass.NONE, false),
    STOP_LOST (EffectFunctionalClass.NONE, false),

    // Moderate-impact effects:
    NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false),
    CODON_CHANGE (EffectFunctionalClass.NONE, false),
    CODON_INSERTION (EffectFunctionalClass.NONE, false),
    CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false),
    CODON_DELETION (EffectFunctionalClass.NONE, false),
    CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false),
    UTR_5_DELETED (EffectFunctionalClass.NONE, false),
    UTR_3_DELETED (EffectFunctionalClass.NONE, false),

    // Low-impact effects:
    SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false),
    SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
    // NOTE(review): NON_SYNONYMOUS_START classified as SILENT looks inconsistent
    // with its name — confirm this is intentional.
    NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
    SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
    NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
    START_GAINED (EffectFunctionalClass.NONE, false),

    // Modifiers:
    NONE (EffectFunctionalClass.NONE, true),
    CHROMOSOME (EffectFunctionalClass.NONE, true),
    INTERGENIC (EffectFunctionalClass.NONE, true),
    UPSTREAM (EffectFunctionalClass.NONE, true),
    UTR_5_PRIME (EffectFunctionalClass.NONE, true),
    CDS (EffectFunctionalClass.NONE, true),
    GENE (EffectFunctionalClass.NONE, true),
    TRANSCRIPT (EffectFunctionalClass.NONE, true),
    EXON (EffectFunctionalClass.NONE, true),
    INTRON (EffectFunctionalClass.NONE, true),
    UTR_3_PRIME (EffectFunctionalClass.NONE, true),
    DOWNSTREAM (EffectFunctionalClass.NONE, true),
    INTRON_CONSERVED (EffectFunctionalClass.NONE, true),
    INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true),
    REGULATION (EffectFunctionalClass.NONE, true),
    CUSTOM (EffectFunctionalClass.NONE, true),
    WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true);

    // Functional class implied by this effect (NONE when not applicable).
    private final EffectFunctionalClass functionalClass;

    // True when the effect only modifies/contextualizes, rather than impacts, the variant.
    private final boolean isModifier;

    EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) {
        this.functionalClass = functionalClass;
        this.isModifier = isModifier;
    }

    /** The functional class (SILENT/MISSENSE/NONSENSE/NONE) implied by this effect. */
    public EffectFunctionalClass getFunctionalClass() {
        return functionalClass;
    }

    /** True for MODIFIER-level (context-only) effects. */
    public boolean isModifier() {
        return isModifier;
    }
}
// SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of
// classifying some of the LOW impact effects as MODIFIERs.
public enum EffectImpact {
    MODIFIER (0),
    LOW      (1),
    MODERATE (2),
    HIGH     (3);

    /** Relative severity; larger values are more severe. */
    private final int severity;

    EffectImpact ( final int severity ) {
        this.severity = severity;
    }

    /** True when this impact is strictly more severe than {@code other}. */
    public boolean isHigherImpactThan ( EffectImpact other ) {
        return Integer.compare(severity, other.severity) > 0;
    }

    /** True when this impact carries the same severity rating as {@code other}. */
    public boolean isSameImpactAs ( EffectImpact other ) {
        return Integer.compare(severity, other.severity) == 0;
    }
}
// SnpEff labels most effects as either CODING or NON_CODING, but sometimes omits this information.
public enum EffectCoding {
    CODING,
    NON_CODING,
    UNKNOWN    // used when SnpEff omits the coding status entirely
}
// We assign a functional class to each SnpEff effect.
public enum EffectFunctionalClass {
    NONE     (0),
    SILENT   (1),
    MISSENSE (2),
    NONSENSE (3);

    /** Ordering value; larger means higher priority. */
    private final int rank;

    EffectFunctionalClass ( final int rank ) {
        this.rank = rank;
    }

    /** True when this functional class strictly outranks {@code other}. */
    public boolean isHigherPriorityThan ( EffectFunctionalClass other ) {
        return Integer.compare(rank, other.rank) > 0;
    }
}
/**
 * One-time setup: validates the SnpEff rod binding, verifies the SnpEff input
 * VCF carries a supported version and a command-line header entry, and copies
 * both entries (under renamed keys) into the output VCF header.
 *
 * @param walker      the walker providing the --snpEffFile rod binding
 * @param toolkit     engine used to fetch the rod's VCF header
 * @param headerLines output VCF header lines; mutated to include the renamed
 *                    SnpEff version/command entries
 */
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
    // Make sure that we actually have a valid SnpEff rod binding (just in case the user specified -A SnpEff
    // without providing a SnpEff rod via --snpEffFile):
    validateRodBinding(walker.getSnpEffRodBinding());
    RodBinding<VariantContext> snpEffRodBinding = walker.getSnpEffRodBinding();

    // Make sure that the SnpEff version number and command-line header lines are present in the VCF header of
    // the SnpEff rod, and that the file was generated by a supported version of SnpEff:
    VCFHeader snpEffVCFHeader = VCFUtils.getVCFHeadersFromRods(toolkit, Arrays.asList(snpEffRodBinding.getName())).get(snpEffRodBinding.getName());
    VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY);
    VCFHeaderLine snpEffCommandLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_COMMAND_LINE_KEY);
    checkSnpEffVersion(snpEffVersionLine);
    checkSnpEffCommandLine(snpEffCommandLine);

    // If everything looks ok, add the SnpEff version number and command-line header lines to the
    // header of the VCF output file, changing the key names so that our output file won't be
    // mistaken in the future for a SnpEff output file:
    headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_VERSION_LINE_KEY, snpEffVersionLine.getValue()));
    headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_COMMAND_LINE_KEY, snpEffCommandLine.getValue()));
}
public Map<String, Object> annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) {
RodBinding<SnpEffFeature> snpEffRodBinding = walker.getSnpEffRodBinding();
validateRodBinding(snpEffRodBinding);
RodBinding<VariantContext> snpEffRodBinding = walker.getSnpEffRodBinding();
List<SnpEffFeature> features = tracker.getValues(snpEffRodBinding, ref.getLocus());
// Get only SnpEff records that start at this locus, not merely span it:
List<VariantContext> snpEffRecords = tracker.getValues(snpEffRodBinding, ref.getLocus());
// Add only annotations for one of the most biologically-significant effects as defined in
// the SnpEffConstants class:
SnpEffFeature mostSignificantEffect = getMostSignificantEffect(features);
if ( mostSignificantEffect == null ) {
// Within this set, look for a SnpEff record whose ref/alt alleles match the record to annotate.
// If there is more than one such record, we only need to pick the first one, since the biological
// effects will be the same across all such records:
VariantContext matchingRecord = getMatchingSnpEffRecord(snpEffRecords, vc);
if ( matchingRecord == null ) {
return null;
}
return generateAnnotations(mostSignificantEffect);
// Parse the SnpEff INFO field annotation from the matching record into individual effect objects:
List<SnpEffEffect> effects = parseSnpEffRecord(matchingRecord);
if ( effects.size() == 0 ) {
return null;
}
// Add only annotations for one of the most biologically-significant effects from this set:
SnpEffEffect mostSignificantEffect = getMostSignificantEffect(effects);
return mostSignificantEffect.getAnnotations();
}
private void validateRodBinding ( RodBinding<SnpEffFeature> snpEffRodBinding ) {
private void validateRodBinding ( RodBinding<VariantContext> snpEffRodBinding ) {
if ( snpEffRodBinding == null || ! snpEffRodBinding.isBound() ) {
throw new UserException("The SnpEff annotator requires that a SnpEff output file be provided " +
"as a rodbinding on the command line, but no SnpEff rodbinding was found.");
throw new UserException("The SnpEff annotator requires that a SnpEff VCF output file be provided " +
"as a rodbinding on the command line via the --snpEffFile option, but " +
"no SnpEff rodbinding was found.");
}
}
private SnpEffFeature getMostSignificantEffect ( List<SnpEffFeature> snpEffFeatures ) {
SnpEffFeature mostSignificantEffect = null;
private void checkSnpEffVersion ( VCFHeaderLine snpEffVersionLine ) {
if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) {
throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " +
"input file, and so could not verify that the file was generated by a supported version of SnpEff (" +
Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")");
}
for ( SnpEffFeature snpEffFeature : snpEffFeatures ) {
String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0];
if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) {
throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " +
"is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS));
}
}
/**
 * Verifies that the SnpEff input file's VCF header records the SnpEff command line,
 * which every supported SnpEff version adds when generating output.
 *
 * @param snpEffCommandLine the header line keyed by SNPEFF_VCF_HEADER_COMMAND_LINE_KEY, or null if absent
 * @throws UserException when the header line is missing or its value is null/blank
 */
private void checkSnpEffCommandLine ( VCFHeaderLine snpEffCommandLine ) {
    final boolean commandLineAbsent = snpEffCommandLine == null
                                      || snpEffCommandLine.getValue() == null
                                      || snpEffCommandLine.getValue().trim().length() == 0;

    if ( commandLineAbsent ) {
        throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_COMMAND_LINE_KEY + " entry in the VCF header for the SnpEff " +
                                "input file, which should be added by all supported versions of SnpEff (" +
                                Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")");
    }
}
/**
 * Tests whether the given SnpEff version string is one we support.
 *
 * @param versionString version token extracted from the SnpEff VCF header
 * @return true if versionString exactly matches an entry in SUPPORTED_SNPEFF_VERSIONS
 */
private boolean isSupportedSnpEffVersion ( String versionString ) {
    return Arrays.asList(SUPPORTED_SNPEFF_VERSIONS).contains(versionString);
}
/**
 * Finds the first SnpEff record whose ref and alt alleles match the variant being annotated.
 *
 * @param snpEffRecords candidate SnpEff records spanning the current locus
 * @param vc            the variant context being annotated
 * @return the first record with identical alternate alleles and reference allele, or null if none match
 */
private VariantContext getMatchingSnpEffRecord ( List<VariantContext> snpEffRecords, VariantContext vc ) {
    for ( VariantContext candidate : snpEffRecords ) {
        // Alt-allele comparison first, mirroring the short-circuit order of the original check:
        if ( ! candidate.hasSameAlternateAllelesAs(vc) ) {
            continue;
        }
        if ( candidate.getReference().equals(vc.getReference()) ) {
            return candidate;
        }
    }

    return null;
}
/**
 * Parses the SnpEff EFF INFO field of the given record into individual SnpEffEffect objects.
 * Malformed entries are logged and skipped rather than aborting the whole record.
 *
 * @param snpEffRecord a VariantContext read from the SnpEff output file
 * @return the well-formed effects parsed from the record's EFF field (empty if the field is absent)
 */
private List<SnpEffEffect> parseSnpEffRecord ( VariantContext snpEffRecord ) {
    List<SnpEffEffect> parsedEffects = new ArrayList<SnpEffEffect>();

    Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY);
    if ( effectFieldValue == null ) {
        return parsedEffects;
    }

    // The VCF codec stores multi-valued fields as a List<String>, and single-valued fields as a String.
    // We can have either in the case of SnpEff, since there may be one or more than one effect in this record.
    List<String> individualEffects = effectFieldValue instanceof List
                                     ? (List<String>)effectFieldValue
                                     : Arrays.asList((String)effectFieldValue);

    for ( String effectString : individualEffects ) {
        String[] nameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER);

        if ( nameAndMetadata.length != 2 ) {
            logger.warn(String.format("Malformed SnpEff effect field at %s:%d, skipping: %s",
                                      snpEffRecord.getChr(), snpEffRecord.getStart(), effectString));
            continue;
        }

        // Keep trailing empty subfields (limit -1) so field indices stay stable:
        String[] metadataSubfields = nameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1);
        SnpEffEffect candidate = new SnpEffEffect(nameAndMetadata[0], metadataSubfields);

        if ( candidate.isWellFormed() ) {
            parsedEffects.add(candidate);
        }
        else {
            logger.warn(String.format("Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"",
                                      snpEffRecord.getChr(), snpEffRecord.getStart(), candidate.getParseError(), effectString));
        }
    }

    return parsedEffects;
}
private SnpEffEffect getMostSignificantEffect ( List<SnpEffEffect> effects ) {
SnpEffEffect mostSignificantEffect = null;
for ( SnpEffEffect effect : effects ) {
if ( mostSignificantEffect == null ||
snpEffFeature.isHigherImpactThan(mostSignificantEffect) ) {
effect.isHigherImpactThan(mostSignificantEffect) ) {
mostSignificantEffect = snpEffFeature;
mostSignificantEffect = effect;
}
}
return mostSignificantEffect;
}
/**
 * Builds the INFO-field annotation map for the given most-significant effect.
 * Only annotations whose corresponding value is present on the effect are emitted;
 * insertion order (and therefore output order) follows the getKeyNames() layout.
 *
 * @param mostSignificantEffect the effect selected as most biologically significant
 * @return annotation key/value pairs for the VCF INFO field
 */
private Map<String, Object> generateAnnotations ( SnpEffFeature mostSignificantEffect ) {
    Map<String, Object> result = new LinkedHashMap<String, Object>(Utils.optimumHashSize(getKeyNames().size()));

    if ( mostSignificantEffect.hasGeneID() ) {
        result.put(GENE_ID_KEY, mostSignificantEffect.getGeneID());
    }
    if ( mostSignificantEffect.hasGeneName() ) {
        result.put(GENE_NAME_KEY, mostSignificantEffect.getGeneName());
    }
    if ( mostSignificantEffect.hasTranscriptID() ) {
        result.put(TRANSCRIPT_ID_KEY, mostSignificantEffect.getTranscriptID());
    }
    if ( mostSignificantEffect.hasExonID() ) {
        result.put(EXON_ID_KEY, mostSignificantEffect.getExonID());
    }
    if ( mostSignificantEffect.hasExonRank() ) {
        result.put(EXON_RANK_KEY, Integer.toString(mostSignificantEffect.getExonRank()));
    }
    // Flag-style annotation: presence of the key alone carries the information.
    if ( mostSignificantEffect.isNonCodingGene() ) {
        result.put(WITHIN_NON_CODING_GENE_KEY, null);
    }

    // Effect and impact are always present on an effect, so they are added unconditionally:
    result.put(EFFECT_KEY, mostSignificantEffect.getEffect().toString());
    result.put(EFFECT_IMPACT_KEY, mostSignificantEffect.getEffectImpact().toString());

    if ( mostSignificantEffect.hasEffectExtraInformation() ) {
        result.put(EFFECT_EXTRA_INFORMATION_KEY, mostSignificantEffect.getEffectExtraInformation());
    }
    if ( mostSignificantEffect.hasOldAndNewAA() ) {
        result.put(OLD_NEW_AA_KEY, mostSignificantEffect.getOldAndNewAA());
    }
    if ( mostSignificantEffect.hasOldAndNewCodon() ) {
        result.put(OLD_NEW_CODON_KEY, mostSignificantEffect.getOldAndNewCodon());
    }
    if ( mostSignificantEffect.hasCodonNum() ) {
        result.put(CODON_NUM_KEY, Integer.toString(mostSignificantEffect.getCodonNum()));
    }
    if ( mostSignificantEffect.hasCdsSize() ) {
        result.put(CDS_SIZE_KEY, Integer.toString(mostSignificantEffect.getCdsSize()));
    }

    return result;
}
public List<String> getKeyNames() {
return Arrays.asList( GENE_ID_KEY,
GENE_NAME_KEY,
TRANSCRIPT_ID_KEY,
EXON_ID_KEY,
EXON_RANK_KEY,
WITHIN_NON_CODING_GENE_KEY,
EFFECT_KEY,
EFFECT_IMPACT_KEY,
EFFECT_EXTRA_INFORMATION_KEY,
OLD_NEW_AA_KEY,
OLD_NEW_CODON_KEY,
CODON_NUM_KEY,
CDS_SIZE_KEY
return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
InfoFieldKey.IMPACT_KEY.getKeyName(),
InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
InfoFieldKey.GENE_NAME_KEY.getKeyName(),
InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
InfoFieldKey.EXON_ID_KEY.getKeyName(),
InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()
);
}
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(
new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"),
new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant")
new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values()))
);
}
/**
 * Helper class to parse, validate, and store a single SnpEff effect and its metadata.
 *
 * An instance is built from one entry of SnpEff's EFF INFO field: the effect name plus its
 * delimiter-separated metadata subfields. Parse failures never throw from the constructor;
 * instead the instance is marked not well-formed and the first error message is cached so
 * the caller can log it (see isWellFormed()/getParseError()).
 */
protected static class SnpEffEffect {
    private EffectType effect;            // parsed effect name; remains null if the name was unrecognized
    private EffectImpact impact;
    private String codonChange;
    private String aminoAcidChange;
    private String geneName;
    private String geneBiotype;
    private EffectCoding coding;          // CODING/NON_CODING per the metadata, or UNKNOWN when the field is blank
    private String transcriptID;
    private String exonID;

    private String parseError = null;     // first parse error encountered, or null if none
    private boolean isWellFormed = true;  // flipped to false by parseError()

    private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
    // SnpEff appends one extra trailing field when it emits a warning, and a further one for an error:
    private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
    private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;

    // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
    // errors come after warnings, not vice versa:
    private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
    private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;

    // Position of the coding/non-coding subfield within the metadata array.
    private static final int SNPEFF_CODING_FIELD_INDEX = 5;

    /**
     * Parses one SnpEff effect entry.
     *
     * @param effectName     the effect name portion of the EFF entry
     * @param effectMetadata the delimiter-separated metadata subfields of the EFF entry
     */
    public SnpEffEffect ( String effectName, String[] effectMetadata ) {
        parseEffectName(effectName);
        parseEffectMetadata(effectMetadata);
    }

    // Maps the textual effect name onto the EffectType enum; records a parse error if unrecognized.
    private void parseEffectName ( String effectName ) {
        try {
            effect = EffectType.valueOf(effectName);
        }
        catch ( IllegalArgumentException e ) {
            parseError(String.format("%s is not a recognized effect type", effectName));
        }
    }

    // Validates the metadata field count, then extracts each subfield at the index declared by
    // InfoFieldKey. Extra trailing fields carry a SnpEff warning/error, which we surface as a
    // parse error and stop (the remaining fields are left unparsed).
    private void parseEffectMetadata ( String[] effectMetadata ) {
        if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
            if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
                parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
            }
            else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
                parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
            }
            else {
                parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
                                         EXPECTED_NUMBER_OF_METADATA_FIELDS, effectMetadata.length));
            }
            return;
        }

        // Modifier-class effects carry an implicit MODIFIER impact rather than an explicit value:
        if ( effect != null && effect.isModifier() ) {
            impact = EffectImpact.MODIFIER;
        }
        else {
            try {
                impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
            }
            catch ( IllegalArgumentException e ) {
                parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
            }
        }

        codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
        aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
        geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
        geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];

        // The coding field may legitimately be blank, in which case coding status is UNKNOWN:
        if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
            try {
                coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
            }
            catch ( IllegalArgumentException e ) {
                parseError(String.format("Unrecognized value for effect coding: %s", effectMetadata[SNPEFF_CODING_FIELD_INDEX]));
            }
        }
        else {
            coding = EffectCoding.UNKNOWN;
        }

        transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()];
        exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()];
    }

    // Marks this effect as malformed and records the error message.
    private void parseError ( String message ) {
        isWellFormed = false;

        // Cache only the first error encountered:
        if ( parseError == null ) {
            parseError = message;
        }
    }

    /** @return true if both the effect name and its metadata parsed without error */
    public boolean isWellFormed() {
        return isWellFormed;
    }

    /** @return the first parse error encountered, or the empty string if there was none */
    public String getParseError() {
        return parseError == null ? "" : parseError;
    }

    /** @return true if this effect is within a coding gene (an UNKNOWN coding status counts as non-coding) */
    public boolean isCoding() {
        return coding == EffectCoding.CODING;
    }

    /**
     * Ranks this effect's biological significance against another parsed effect.
     * Coding-gene effects always outrank non-coding ones; otherwise impacts are
     * compared, with ties broken by the effect's functional class.
     *
     * NOTE(review): assumes both effects are well-formed (non-null effect/impact) —
     * only well-formed effects should be compared; verify against callers.
     *
     * @param other the effect to compare against
     * @return true if this effect is strictly higher-impact than other
     */
    public boolean isHigherImpactThan ( SnpEffEffect other ) {
        // If one effect is within a coding gene and the other is not, the effect that is
        // within the coding gene has higher impact:
        if ( isCoding() && ! other.isCoding() ) {
            return true;
        }
        else if ( ! isCoding() && other.isCoding() ) {
            return false;
        }

        // Otherwise, both effects are either in or not in a coding gene, so we compare the impacts
        // of the effects themselves. Effects with the same impact are tie-broken using the
        // functional class of the effect:
        if ( impact.isHigherImpactThan(other.impact) ) {
            return true;
        }
        else if ( impact.isSameImpactAs(other.impact) ) {
            return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass());
        }

        return false;
    }

    /**
     * Converts this effect into INFO-field annotations keyed by the InfoFieldKey names.
     * Keys whose values are null or blank are omitted; insertion order (a LinkedHashMap)
     * fixes the output order of the annotations.
     *
     * @return annotation key/value pairs for this effect
     */
    public Map<String, Object> getAnnotations() {
        Map<String, Object> annotations = new LinkedHashMap<String, Object>(Utils.optimumHashSize(InfoFieldKey.values().length));

        addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
        addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
        addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
        addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
        addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
        addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
        addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
        addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
        addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString());

        return annotations;
    }

    // Only add annotations for keys associated with non-empty values:
    private void addAnnotation ( Map<String, Object> annotations, String keyName, String keyValue ) {
        if ( keyValue != null && keyValue.trim().length() > 0 ) {
            annotations.put(keyName, keyValue);
        }
    }
}
}

View File

@ -17,6 +17,9 @@ import java.util.List;
import java.util.Map;
/**
* Fraction of reads containing spanning deletions at this site.
*/
public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -19,12 +20,9 @@ import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: 6/29/11
* Time: 3:14 PM
* To change this template use File | Settings | File Templates.
* Counts of bases from SLX, 454, and SOLiD at this site
*/
@Hidden
public class TechnologyComposition extends InfoFieldAnnotation implements ExperimentalAnnotation {
private String nSLX = "NumSLX";
private String n454 ="Num454";

View File

@ -40,7 +40,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@ -86,14 +85,15 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
public RodBinding<VariantContext> getVariantRodBinding() { return variantCollection.variants; }
/**
* The INFO field will be annotated with information on the most biologically-significant effect
* listed in the SnpEff output file for each variant.
*/
@Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false)
public RodBinding<SnpEffFeature> snpEffFile;
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return snpEffFile; }
public RodBinding<VariantContext> snpEffFile;
public RodBinding<VariantContext> getSnpEffRodBinding() { return snpEffFile; }
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
@ -162,6 +162,12 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;
@Argument(fullName="family_string",shortName="family",required=false,doc="A family string of the form mom+dad=child for use with the mendelian violation ratio annotation")
public String familyStr = null;
@Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio")
public double minGenotypeQualityP = 0.0;
private VariantAnnotatorEngine engine;
private Collection<VariantContext> indelBufferContext;
@ -203,9 +209,9 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
}
if ( USE_ALL_ANNOTATIONS )
engine = new VariantAnnotatorEngine(this);
engine = new VariantAnnotatorEngine(this, getToolkit());
else
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit());
engine.initializeExpressions(expressionsToUse);
// setup the header fields
@ -217,6 +223,8 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
hInfo.add(line);
}
engine.invokeAnnotationInitializationMethods(hInfo);
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);

View File

@ -26,13 +26,11 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
@ -49,6 +47,7 @@ public class VariantAnnotatorEngine {
private HashMap<RodBinding<VariantContext>, String> dbAnnotations = new HashMap<RodBinding<VariantContext>, String>();
private AnnotatorCompatibleWalker walker;
private GenomeAnalysisEngine toolkit;
private static class VAExpression {
@ -74,16 +73,18 @@ public class VariantAnnotatorEngine {
}
// use this constructor if you want all possible annotations
public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker) {
public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
this.walker = walker;
this.toolkit = toolkit;
requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations();
requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations();
initializeDBs();
}
// use this constructor if you want to select specific annotations (and/or interfaces)
public VariantAnnotatorEngine(List<String> annotationGroupsToUse, List<String> annotationsToUse, AnnotatorCompatibleWalker walker) {
public VariantAnnotatorEngine(List<String> annotationGroupsToUse, List<String> annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
this.walker = walker;
this.toolkit = toolkit;
initializeAnnotations(annotationGroupsToUse, annotationsToUse);
initializeDBs();
}
@ -113,6 +114,16 @@ public class VariantAnnotatorEngine {
dbAnnotations.put(rod, rod.getName());
}
public void invokeAnnotationInitializationMethods( Set<VCFHeaderLine> headerLines ) {
for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
annotation.initialize(walker, toolkit, headerLines);
}
for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.initialize(walker, toolkit, headerLines);
}
}
public Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
Set<VCFHeaderLine> descriptions = new HashSet<VCFHeaderLine>();

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
@ -9,8 +8,9 @@ import java.util.List;
public interface AnnotatorCompatibleWalker {
// getter methods for various used bindings
public abstract RodBinding<SnpEffFeature> getSnpEffRodBinding();
public abstract RodBinding<VariantContext> getVariantRodBinding();
public abstract RodBinding<VariantContext> getSnpEffRodBinding();
public abstract RodBinding<VariantContext> getDbsnpRodBinding();
public abstract List<RodBinding<VariantContext>> getCompRodBindings();
public abstract List<RodBinding<VariantContext>> getResourceRodBindings();
}
}

View File

@ -24,18 +24,18 @@
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
import java.util.Map;
import java.util.Set;
@DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations")
public abstract class VariantAnnotatorAnnotation {
// return the INFO keys
public abstract List<String> getKeyNames();
// initialization method (optional for subclasses, and therefore non-abstract)
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) { }
}

View File

@ -175,21 +175,16 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
}
BeagleFeature beagleR2Feature = tracker.getFirstValue(beagleR2);
// ignore places where we don't have a variant
if ( beagleR2Feature == null )
return 0;
BeagleFeature beagleProbsFeature = tracker.getFirstValue(beagleProbs);
// ignore places where we don't have a variant
if ( beagleProbsFeature == null )
return 0;
BeagleFeature beaglePhasedFeature = tracker.getFirstValue(beaglePhased);
// ignore places where we don't have a variant
if ( beaglePhasedFeature == null )
return 0;
if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null)
{
vcfWriter.add(vc_input);
return 1;
}
// get reference base for current position
byte refByte = ref.getBase();

View File

@ -63,20 +63,32 @@ import java.util.*;
* <h2>Input</h2>
* <p>
* One or more bam files (with proper headers) to be analyzed for coverage statistics
* (Optional) A REFSEQ Rod to aggregate coverage to the gene level
* </p>
*
* <p>
*(Optional) A REFSEQ Rod to aggregate coverage to the gene level
* <p>
* (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation)
*</p></p>
* <h2>Output</h2>
* <p>
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
* </p><p>
* - no suffix: per locus coverage
* </p><p>
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
* </p><p>
* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
* </p><p>
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
* </p><p>
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
* </p><p>
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
* </p><p>
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
* </p><p>
* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
* </p><p>
* - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases
* </p>
*
@ -84,7 +96,7 @@ import java.util.*;
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantEval \
* -T DepthOfCoverage \
* -o file_name_base \
* -I input_bams.list
* [-geneList refSeq.sorted.txt] \

View File

@ -43,8 +43,10 @@ import java.util.List;
* Generates an alternative reference sequence over the specified interval.
*
* <p>
* Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
* Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
* Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
* Note that if there are multiple variants at a site, it takes the first one seen.
* Reference bases for each interval will be output as a separate fasta sequence (named numerically in order).
*
* <h2>Input</h2>
* <p>

View File

@ -42,6 +42,9 @@ import java.io.PrintStream;
*
* <p>
* The output format can be partially controlled using the provided command-line arguments.
* Specify intervals with the usual -L argument to output only the reference bases within your intervals.
* Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
* separate fasta sequence (named numerically in order).
*
* <h2>Input</h2>
* <p>

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.genotype;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.BaseUtils;
@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.BaseUtils;
* Time: 6:46:09 PM
* To change this template use File | Settings | File Templates.
*/
public enum DiploidGenotype {
enum DiploidGenotype {
AA ('A', 'A'),
AC ('A', 'C'),
AG ('A', 'G'),

View File

@ -2,7 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
/**
* Created by IntelliJ IDEA.

View File

@ -30,7 +30,6 @@ import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.FragmentPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -276,8 +275,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
if ( elt.isReducedRead() ) {
// reduced read representation
byte qual = elt.getReducedQual();
add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
return elt.getReducedCount(); // we added nObs bases here
if ( BaseUtils.isRegularBase( elt.getBase() )) {
add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
return elt.getReducedCount(); // we added nObs bases here
} else // odd bases or deletions => don't use them
return 0;
} else {
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import java.util.Arrays;

View File

@ -48,27 +48,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// code for testing purposes
//
private final static boolean DEBUG = false;
private final static boolean PRINT_LIKELIHOODS = false;
private final static int N_CYCLES = 1;
private SimpleTimer timerExpt = new SimpleTimer("linearExactBanded");
private SimpleTimer timerGS = new SimpleTimer("linearExactGS");
private final static boolean COMPARE_TO_GS = false;
public enum ExactCalculation {
N2_GOLD_STANDARD,
LINEAR_EXPERIMENTAL
}
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final boolean SIMPLE_GREEDY_GENOTYPER = false;
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
private boolean SIMPLE_GREEDY_GENOTYPER = false;
final private ExactCalculation calcToUse;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
calcToUse = UAC.EXACT_CALCULATION_TYPE;
}
public void getLog10PNonRef(RefMetaDataTracker tracker,
@ -76,43 +61,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
Map<String, Genotype> GLs, Set<Allele>alleles,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
double[] gsPosteriors;
if ( COMPARE_TO_GS ) // due to annoying special values in incoming array, we have to clone up here
gsPosteriors = log10AlleleFrequencyPosteriors.clone();
int idxAA = GenotypeType.AA.ordinal();
int idxAB = GenotypeType.AB.ordinal();
int idxBB = GenotypeType.BB.ordinal();
// todo -- remove me after testing
if ( N_CYCLES > 1 ) {
for ( int i = 0; i < N_CYCLES; i++) {
timerGS.restart();
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone(), idxAA, idxAB, idxBB);
timerGS.stop();
timerExpt.restart();
linearExactBanded(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone());
timerExpt.stop();
}
System.out.printf("good = %.2f, expt = %.2f, delta = %.2f%n",
timerGS.getElapsedTime(), timerExpt.getElapsedTime(), timerExpt.getElapsedTime()-timerGS.getElapsedTime());
}
int lastK = -1;
int numAlleles = alleles.size();
final int numAlleles = alleles.size();
final double[][] posteriorCache = numAlleles > 2 ? new double[numAlleles-1][] : null;
final double[] bestAFguess = numAlleles > 2 ? new double[numAlleles-1] : null;
int idxDiag = numAlleles;
int incr = numAlleles - 1;
double[][] posteriorCache = new double[numAlleles-1][];
double[] bestAFguess = new double[numAlleles-1];
for (int k=1; k < numAlleles; k++) {
// multi-allelic approximation, part 1: Ideally
// for each alt allele compute marginal (suboptimal) posteriors -
@ -121,24 +75,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
idxAA = 0;
idxAB = k;
final int idxAA = 0;
final int idxAB = k;
// yy is always element on the diagonal.
// 2 alleles: BBelement 2
// 3 alleles: BB element 3. CC element 5
// 4 alleles:
idxBB = idxDiag;
final int idxBB = idxDiag;
idxDiag += incr--;
// todo - possible cleanup
switch ( calcToUse ) {
case N2_GOLD_STANDARD:
lastK = gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
break;
case LINEAR_EXPERIMENTAL:
lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
break;
}
final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
if (numAlleles > 2) {
posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone();
bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors);
@ -153,47 +100,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]);
}
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
if ( COMPARE_TO_GS ) {
gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors, idxAA, idxAB, idxBB);
double log10thisPVar = Math.log10(MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors)[0]);
double log10gsPVar = Math.log10(MathUtils.normalizeFromLog10(gsPosteriors)[0]);
boolean eq = (log10thisPVar == Double.NEGATIVE_INFINITY && log10gsPVar == Double.NEGATIVE_INFINITY) || MathUtils.compareDoubles(log10thisPVar, log10gsPVar, 1e-4) == 0;
if ( ! eq || PRINT_LIKELIHOODS ) {
System.out.printf("----------------------------------------%n");
for (int k=0; k < log10AlleleFrequencyPosteriors.length; k++) {
double x = log10AlleleFrequencyPosteriors[k];
System.out.printf(" %d\t%.2f\t%.2f\t%b%n", k,
x < -1e10 ? Double.NEGATIVE_INFINITY : x, gsPosteriors[k],
log10AlleleFrequencyPosteriors[k] == gsPosteriors[k]);
}
System.out.printf("MAD_AC\t%d\t%d\t%.2f\t%.2f\t%.6f%n",
ref.getLocus().getStart(), lastK, log10thisPVar, log10gsPVar, log10thisPVar - log10gsPVar);
}
}
}
private static final double[][] getGLs(Map<String, Genotype> GLs) {
double[][] genotypeLikelihoods = new double[GLs.size()+1][];
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
int j = 0;
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.values() ) {
j++;
if ( sample.hasLikelihoods() ) {
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector();
double[] gls = sample.getLikelihoods().getAsVector();
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
genotypeLikelihoods.add(gls);
}
}
return genotypeLikelihoods;
}
// -------------------------------------------------------------------------------------
//
// Linearized, ~O(N), implementation.
@ -237,90 +162,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
// now with banding
public int linearExactBanded(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
throw new NotImplementedException();
// final int numSamples = GLs.size();
// final int numChr = 2*numSamples;
// final double[][] genotypeLikelihoods = getGLs(GLs);
//
// final ExactACCache logY = new ExactACCache(numSamples+1);
// logY.getkMinus0()[0] = 0.0; // the zero case
//
// double maxLog10L = Double.NEGATIVE_INFINITY;
// boolean done = false;
// int lastK = -1;
// final int BAND_SIZE = 10;
//
// for (int k=0; k <= numChr && ! done; k++ ) {
// final double[] kMinus0 = logY.getkMinus0();
// int jStart = Math.max(k - BAND_SIZE, 1);
// int jStop = Math.min(k + BAND_SIZE, numSamples);
//
// if ( k == 0 ) { // special case for k = 0
// for ( int j=1; j <= numSamples; j++ ) {
// kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()];
// }
// } else { // k > 0
// final double[] kMinus1 = logY.getkMinus1();
// final double[] kMinus2 = logY.getkMinus2();
// Arrays.fill(kMinus0,0);
//
// for ( int j = jStart; j <= jStop; j++ ) {
// final double[] gl = genotypeLikelihoods[j];
// final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
//
// double aa = Double.NEGATIVE_INFINITY;
// double ab = Double.NEGATIVE_INFINITY;
// if (k < 2*j-1)
// aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
//
// if (k < 2*j)
// ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
//
// double log10Max;
// if (k > 1) {
// final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
// log10Max = approximateLog10SumLog10(aa, ab, bb);
// } else {
// // we know we aren't considering the BB case, so we can use an optimized log10 function
// log10Max = approximateLog10SumLog10(aa, ab);
// }
//
// // finally, update the L(j,k) value
// kMinus0[j] = log10Max - logDenominator;
//
// String offset = Utils.dupString(' ',k);
// System.out.printf("%s%3d %3d %.2f%n", offset, k, j, kMinus0[j]);
// }
// }
//
// // update the posteriors vector
// final double log10LofK = kMinus0[jStop];
// log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
//
// // can we abort early?
// lastK = k;
// maxLog10L = Math.max(maxLog10L, log10LofK);
// if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
// if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
// done = true;
// }
//
// logY.rotate();
// }
//
// return lastK;
}
public int linearExact(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
final int numSamples = GLs.size();
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
final double[][] genotypeLikelihoods = getGLs(GLs);
final ExactACCache logY = new ExactACCache(numSamples+1);
logY.getkMinus0()[0] = 0.0; // the zero case
@ -334,14 +181,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( k == 0 ) { // special case for k = 0
for ( int j=1; j <= numSamples; j++ ) {
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA];
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
}
} else { // k > 0
final double[] kMinus1 = logY.getkMinus1();
final double[] kMinus2 = logY.getkMinus2();
for ( int j=1; j <= numSamples; j++ ) {
final double[] gl = genotypeLikelihoods[j];
final double[] gl = genotypeLikelihoods.get(j);
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
double aa = Double.NEGATIVE_INFINITY;
@ -434,10 +281,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
boolean multiAllelicRecord = false;
if (vc.getAlternateAlleles().size() > 1)
multiAllelicRecord = true;
Map<String, Genotype> GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
@ -454,7 +297,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
pathMetricArray[0][0] = 0.0;
// todo = can't deal with optimal dynamic programming solution with multiallelic records
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) {
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
sampleIndices.addAll(GLs.keySet());
sampleIdx = GLs.size();
}
@ -465,6 +308,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
continue;
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
//System.out.print(sample.getKey()+":");
//for (int k=0; k < likelihoods.length; k++)
// System.out.format("%4.2f ",likelihoods[k]);
//System.out.println();
// all likelihoods are essentially the same: skip this sample and will later on force no call.
//sampleIdx++;
continue;
}
sampleIndices.add(sample.getKey());
for (int k=0; k <= AFofMaxLikelihood; k++) {
@ -504,22 +358,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord)
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
else {
int newIdx = tracebackArray[k][startIdx];
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
// and will add no-call genotype to GL's in a second pass
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double qual = Double.NEGATIVE_INFINITY;
double[] likelihoods = g.getLikelihoods().getAsVector();
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
}
else {
int newIdx = tracebackArray[k][startIdx];;
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
/* System.out.format("Sample: %s GL:",sample);
for (int i=0; i < likelihoods.length; i++)
System.out.format("%1.4f ",likelihoods[i]);
System.out.format("%1.4f, ",likelihoods[i]);
*/
for (int i=0; i < likelihoods.length; i++) {
@ -570,83 +427,26 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
return calls;
}
// -------------------------------------------------------------------------------------
//
// Gold standard, but O(N^2), implementation.
//
// TODO -- remove me for clarity in this code
//
// -------------------------------------------------------------------------------------
public int gdaN2GoldStandard(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
int numSamples = GLs.size();
int numChr = 2*numSamples;
double[][] logYMatrix = new double[1+numSamples][1+numChr];
for (int i=0; i <=numSamples; i++)
for (int j=0; j <=numChr; j++)
logYMatrix[i][j] = Double.NEGATIVE_INFINITY;
//YMatrix[0][0] = 1.0;
logYMatrix[0][0] = 0.0;
int j=0;
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
j++;
if ( !sample.getValue().hasLikelihoods() )
continue;
Genotype g = GLs.get(sample.getKey());
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
// special treatment for k=0: iteration reduces to:
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA];
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
continue; // regular likelihoods
for (int k=1; k <= 2*j; k++ ) {
//double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
double logNumerator[];
logNumerator = new double[3];
if (k < 2*j-1)
logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
genotypeLikelihoods[idxAA];
else
logNumerator[0] = Double.NEGATIVE_INFINITY;
if (k < 2*j)
logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
genotypeLikelihoods[idxAB];
else
logNumerator[1] = Double.NEGATIVE_INFINITY;
if (k > 1)
logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] +
genotypeLikelihoods[idxBB];
else
logNumerator[2] = Double.NEGATIVE_INFINITY;
double logNum = MathUtils.softMax(logNumerator);
//YMatrix[j][k] = num/den;
logYMatrix[j][k] = logNum - logDenominator;
}
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double qual = Genotype.NO_NEG_LOG_10PERROR;
myAlleles.add(Allele.NO_CALL);
myAlleles.add(Allele.NO_CALL);
//System.out.println(myAlleles.toString());
calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
}
for (int k=0; k <= numChr; k++)
log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
return numChr;
return calls;
}
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
@ -657,5 +457,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
}
}
}

View File

@ -32,10 +32,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
@ -70,9 +71,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// gdebug removeme
// todo -cleanup
private HaplotypeIndelErrorModel model;
private boolean useOldWrongHorribleHackedUpLikelihoodModel = false;
//
private GenomeLoc lastSiteVisited;
private ArrayList<Allele> alleleList;
@ -83,26 +81,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
if (UAC.GSA_PRODUCTION_ONLY == false) {
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
useOldWrongHorribleHackedUpLikelihoodModel = false;
}
else {
useOldWrongHorribleHackedUpLikelihoodModel = true;
double INSERTION_START_PROBABILITY = 1e-3;
double INSERTION_END_PROBABILITY = 0.5;
double ALPHA_DELETION_PROBABILITY = 1e-3;
model = new HaplotypeIndelErrorModel(3, INSERTION_START_PROBABILITY,
INSERTION_END_PROBABILITY,ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO);
}
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,UAC.OUTPUT_DEBUG_INDEL_INFO);
alleleList = new ArrayList<Allele>();
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
@ -321,7 +300,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
haplotypeMap.clear();
if (getAlleleListFromVCF) {
for( final VariantContext vc_input : tracker.getValues(UAC.alleles) ) {
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
if( vc_input != null &&
allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) {
@ -382,20 +361,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
}
}
int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
if (useOldWrongHorribleHackedUpLikelihoodModel) {
numPrefBases = 20;
hsize=80;
}
final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
final int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
final int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
if (DEBUG)
System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
(int)ref.getWindow().size(), loc.getStart(), numPrefBases);
//System.out.println(eventLength);
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
ref, hsize, numPrefBases);
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
ref, hsize, numPrefBases);
// For each sample, get genotype likelihoods based on pileup
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
@ -412,17 +388,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
pileup = context.getBasePileup();
if (pileup != null ) {
double[] genotypeLikelihoods;
if (useOldWrongHorribleHackedUpLikelihoodModel)
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
else
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
// which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example
// for 3 alleles it's 00 01 11 02 12 22
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
alleleList,
genotypeLikelihoods,
getFilteredDepth(pileup)));
@ -444,4 +412,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return indelLikelihoodMap.get();
}
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
// so that per-sample DP will include deletions covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) )
count++;
}
return count;
}
}

View File

@ -26,16 +26,14 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
@ -58,25 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
}
public static VariantContext getSNPVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
if ( tracker == null || ref == null || logger == null )
throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger);
VariantContext vc = null;
// search for usable record
for( final VariantContext vc_input : tracker.getValues(allelesBinding) ) {
if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) {
if ( vc == null ) {
vc = vc_input;
} else {
logger.warn("Multiple valid VCF records detected at site " + ref.getLocus() + ", only considering alleles from first record");
}
}
}
return vc;
}
public Allele getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
@ -96,7 +75,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
if ( alternateAlleleToUse != null ) {
bestAlternateAllele = alternateAlleleToUse.getBases()[0];
} else if ( useAlleleFromVCF ) {
VariantContext vc = getSNPVCFromAllelesRod(tracker, ref, true, logger, UAC.alleles);
VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
// ignore places where we don't have a variant
if ( vc == null )
@ -143,8 +122,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
aList.add(refAllele);
aList.add(altAllele);
double[] dlike = new double[]{likelihoods[refGenotype.ordinal()],likelihoods[hetGenotype.ordinal()],likelihoods[homGenotype.ordinal()]} ;
// normalize in log space so that max element is zero.
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
aList, dlike, getFilteredDepth(pileup)));
aList, MathUtils.normalizeFromLog10(dlike, false, true), getFilteredDepth(pileup)));
}
return refAllele;

View File

@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;

View File

@ -143,35 +143,21 @@ public class UnifiedArgumentCollection {
@Hidden
@Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
public int INDEL_HAPLOTYPE_SIZE = 80;
@Hidden
@Argument(fullName = "doContextDependentGapPenalties", shortName = "doCDP", doc = "Vary gap penalties by context", required = false)
public boolean DO_CONTEXT_DEPENDENT_PENALTIES = true;
//gdebug+
// experimental arguments, NOT TO BE USED BY ANYONE WHOSE INITIALS AREN'T GDA!!!
@Hidden
@Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false)
public boolean GET_GAP_PENALTIES_FROM_DATA = false;
@Hidden
@Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE")
public File INDEL_RECAL_FILE = new File("indel.recal_data.csv");
// @Hidden
// @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false)
// public boolean GET_GAP_PENALTIES_FROM_DATA = false;
//
// @Hidden
// @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE")
// public File INDEL_RECAL_FILE = new File("indel.recal_data.csv");
@Hidden
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
@Hidden
@Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false)
public boolean dovit = false;
@Hidden
@Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false)
public boolean GSA_PRODUCTION_ONLY = false;
@Hidden
@Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false)
public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL;
@Hidden
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
public boolean IGNORE_SNP_ALLELES = false;
@ -191,7 +177,6 @@ public class UnifiedArgumentCollection {
uac.GLmodel = GLmodel;
uac.AFmodel = AFmodel;
uac.EXACT_CALCULATION_TYPE = EXACT_CALCULATION_TYPE;
uac.heterozygosity = heterozygosity;
uac.PCR_error = PCR_error;
uac.GenotypingMode = GenotypingMode;
@ -209,15 +194,10 @@ public class UnifiedArgumentCollection {
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
uac.DO_CONTEXT_DEPENDENT_PENALTIES = DO_CONTEXT_DEPENDENT_PENALTIES;
uac.alleles = alleles;
uac.GET_GAP_PENALTIES_FROM_DATA = GET_GAP_PENALTIES_FROM_DATA;
uac.INDEL_RECAL_FILE = INDEL_RECAL_FILE;
// todo- arguments to remove
uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT;
uac.dovit = dovit;
uac.GSA_PRODUCTION_ONLY = GSA_PRODUCTION_ONLY;
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
return uac;

View File

@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -127,7 +126,8 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return null; }
public RodBinding<VariantContext> getVariantRodBinding() { return null; }
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
@ -210,7 +210,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, this);
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, this, getToolkit());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);
// initialize the header

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
@ -36,13 +37,11 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
@ -236,10 +235,11 @@ public class UnifiedGenotyperEngine {
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
VariantContext vc;
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger, UAC.alleles);
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
if ( vcInput == null )
return null;
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles());
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase());
} else {
// deal with bad/non-standard reference bases
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
@ -544,6 +544,21 @@ public class UnifiedGenotyperEngine {
AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
// Cache of MathUtils.binomialProbability(0, depth, 0.5) for depths 0..9999,
// avoiding repeated recomputation in the per-sample reference-confidence loop.
private final static double[] binomialProbabilityDepthCache = new double[10000];
static {
// depth 0: P(0 successes in 0 trials) is identically 1.0.  Previously this
// entry was left at its default 0.0, which was inconsistent with the
// uncached fallback path in getRefBinomialProb().
binomialProbabilityDepthCache[0] = 1.0;
for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) {
binomialProbabilityDepthCache[i] = MathUtils.binomialProbability(0, i, 0.5);
}
}
/**
 * Probability of observing zero non-reference events in {@code depth} trials with p = 0.5.
 * Served from the precomputed cache when the depth falls inside it, otherwise
 * computed on the fly via MathUtils.
 */
private final double getRefBinomialProb(final int depth) {
return depth < binomialProbabilityDepthCache.length
? binomialProbabilityDepthCache[depth]
: MathUtils.binomialProbability(0, depth, 0.5);
}
private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
if ( contexts == null )
return null;
@ -567,7 +582,7 @@ public class UnifiedGenotyperEngine {
depth = context.getExtendedEventPileup().size();
}
P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
P_of_ref *= 1.0 - (theta / 2.0) * getRefBinomialProb(depth);
}
return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false);
@ -635,7 +650,7 @@ public class UnifiedGenotyperEngine {
// no extended event pileup
// if we're genotyping given alleles and we have a requested SNP at this position, do SNP
if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, refContext, false, logger, UAC.alleles);
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
if (vcInput == null)
return null;
@ -741,4 +756,23 @@ public class UnifiedGenotyperEngine {
return afcm;
}
/**
 * Searches the alleles ROD binding at the given location for a usable record —
 * unfiltered, and a SNP if {@code requireSNP} is set — and returns the first one
 * found.  Any additional matching records at the site are reported through the
 * logger and ignored.  Returns null when no usable record exists.
 */
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
if ( tracker == null || ref == null || logger == null )
throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger);

VariantContext result = null;
for ( final VariantContext candidate : tracker.getValues(allelesBinding, loc) ) {
// skip unusable records: missing, filtered, or non-SNP when a SNP is required
if ( candidate == null || candidate.isFiltered() || (requireSNP && !candidate.isSNP()) )
continue;
if ( result != null ) {
logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record");
} else {
result = candidate;
}
}
return result;
}
}

View File

@ -26,9 +26,9 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel {
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
double baseProb = QualityUtils.qualToProb(k);
double baseProb = QualityUtils.qualToProb((byte)k);
baseMatchArray[k] = probToQual(baseProb);

View File

@ -28,9 +28,10 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -50,36 +51,8 @@ import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Reca
public class PairHMMIndelErrorModel {
public static final int BASE_QUAL_THRESHOLD = 20;
private static final int MATCH_OFFSET = 0;
private static final int X_OFFSET = 1;
private static final int Y_OFFSET = 2;
private static final int DIAG = 0;
private static final int UP = 1;
private static final int LEFT = 2;
private static final int DIAG_GOTO_M = 0;
private static final int DIAG_GOTO_X = 1;
private static final int DIAG_GOTO_Y = 2;
private static final int UP_GOTO_M = 4;
private static final int UP_GOTO_X = 5;
private static final int UP_GOTO_Y = 6;
private static final int LEFT_GOTO_M = 8;
private static final int LEFT_GOTO_X = 9;
private static final int LEFT_GOTO_Y = 10;
private static final int[] ACTIONS_M = {DIAG_GOTO_M, DIAG_GOTO_X, DIAG_GOTO_Y};
private static final int[] ACTIONS_X = {UP_GOTO_M, UP_GOTO_X, UP_GOTO_Y};
private static final int[] ACTIONS_Y = {LEFT_GOTO_M, LEFT_GOTO_X, LEFT_GOTO_Y};
private final double logGapOpenProbability;
private final double logGapContinuationProbability;
@ -100,36 +73,13 @@ public class PairHMMIndelErrorModel {
private static final double MIN_GAP_CONT_PENALTY = 10.0;
private static final double GAP_PENALTY_HRUN_STEP = 1.0; // each increase in hrun decreases gap penalty by this.
private boolean doViterbi = false;
private final boolean useAffineGapModel = true;
private boolean doContextDependentPenalties = false;
private final double[] GAP_OPEN_PROB_TABLE;
private final double[] GAP_CONT_PROB_TABLE;
private boolean getGapPenaltiesFromFile = false;
private int SMOOTHING = 1;
private int MAX_QUALITY_SCORE = 50;
private int PRESERVE_QSCORES_LESS_THAN = 5;
/////////////////////////////
// Private Member Variables
/////////////////////////////
//copy+
/* private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
protected static final String EOF_MARKER = "EOF";
private long numReadsWithMalformedColorSpace = 0;
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
*/
//copy-
static {
LOG_ONE_HALF= -Math.log10(2.0);
END_GAP_COST = LOG_ONE_HALF;
@ -145,141 +95,9 @@ public class PairHMMIndelErrorModel {
}
}
/**
 * Constructor variant that can additionally source gap penalties from a
 * recalibration file.  NOTE: the file-parsing logic below is currently
 * disabled (commented out, "gdebug" copy from TableRecalibrationWalker);
 * only the getGapPenaltiesFromFile flag assignment is live.
 *
 * @param indelGOP   indel gap-open penalty, in Phred-like (QUAL) units
 * @param indelGCP   indel gap-continuation penalty, in Phred-like (QUAL) units
 * @param deb        enable debug output
 * @param doCDP      enable context-dependent gap penalties
 * @param dovit      enable the Viterbi (best-path) computation
 * @param gpf        if true, gap penalties are to be read from RECAL_FILE
 * @param RECAL_FILE recalibration table file — unused while the parsing code is commented out
 */
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit,boolean gpf, File RECAL_FILE) {
this(indelGOP, indelGCP, deb, doCDP, dovit);
this.getGapPenaltiesFromFile = gpf;
// read data from recal file
// gdebug - start copy from TableRecalibrationWalker
/* if (gpf) {
boolean sawEOF = false;
boolean REQUIRE_EOF = false;
int lineNumber = 0;
boolean foundAllCovariates = false;
// Get a list of all available covariates
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
try {
for ( String line : new XReadLines(RECAL_FILE) ) {
lineNumber++;
if ( EOF_MARKER.equals(line) ) {
sawEOF = true;
} else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
; // Skip over the comment lines, (which start with '#')
}
// Read in the covariates that were used from the input file
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
if( foundAllCovariates ) {
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
} else { // Found the covariate list in input file, loop through all of them and instantiate them
String[] vals = line.split(",");
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
foundClass = true;
try {
Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate );
} catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e);
}
}
}
if( !foundClass ) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
}
}
}
} else { // Found a line of data
if( !foundAllCovariates ) {
foundAllCovariates = true;
// At this point all the covariates should have been found and initialized
if( requestedCovariates.size() < 2 ) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
}
final boolean createCollapsedTables = true;
// Initialize any covariate member variables using the shared argument collection
for( Covariate cov : requestedCovariates ) {
cov.initialize( RAC );
}
// Initialize the data hashMaps
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
}
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
}
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
} catch ( NumberFormatException e ) {
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
}
if ( !sawEOF ) {
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
if ( REQUIRE_EOF )
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
}
if( dataManager == null ) {
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
}
// Create the tables of empirical quality scores that will be used in the sequential calculation
dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE );
}
// debug end copy
*/
}
/**
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
*/
/*
private void addCSVData(final File file, final String line) {
final String[] vals = line.split(",");
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
}
final Object[] key = new Object[requestedCovariates.size()];
Covariate cov;
int iii;
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
cov = requestedCovariates.get( iii );
key[iii] = cov.getValue( vals[iii] );
}
// Create a new datum using the number of observations, number of mismatches, and reported quality score
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
}
*/
/**
 * Convenience constructor that additionally enables/disables the Viterbi
 * (best-path) computation; all other setup is delegated to the four-argument
 * constructor.
 *
 * @param dovit if true, use the Viterbi best-path calculation (doViterbi)
 */
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit) {
this(indelGOP, indelGCP, deb, doCDP);
this.doViterbi = dovit;
}
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb) {
this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob
this.logGapContinuationProbability = -indelGCP/10.0; // QUAL to log prob
this.doContextDependentPenalties = doCDP;
this.DEBUG = deb;
@ -313,132 +131,6 @@ public class PairHMMIndelErrorModel {
}
/**
 * Scores a read against a candidate haplotype with a single-gap-state
 * (non-affine) Needleman-Wunsch-style dynamic program and returns the best
 * metric as the read likelihood.
 *
 * Rows (indI) index read bases and columns (indJ) index haplotype bases;
 * both matrices carry one extra leading row/column for the empty prefix.
 * Edge cells are initialized to 0, i.e. leading end gaps are free.  When
 * doViterbi is set, the maximal predecessor is taken and recorded for
 * traceback; otherwise cell scores are combined with MathUtils.softMax.
 *
 * @param haplotypeBases bases of the candidate haplotype
 * @param readBases      bases of the read
 * @param readQuals      per-base read qualities, clamped to [1, MAX_CACHED_QUAL]
 * @return the best path metric (log-scale, per the -QUAL/10 convention used
 *         for logGapOpenProbability)
 */
private double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) {
final int X_METRIC_LENGTH = readBases.length+1;
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
// initialize path metric and traceback memories for likelihood computation
double[][] pathMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
int[][] bestMetricArray = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
pathMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY;
// first column: read prefix aligned against an empty haplotype prefix — free end gaps
for (int i=1; i < X_METRIC_LENGTH; i++) {
pathMetricArray[i][0] = 0;
bestMetricArray[i][0] = UP;
}
// first row: haplotype prefix aligned against an empty read prefix — free end gaps
for (int j=1; j < Y_METRIC_LENGTH; j++) {
pathMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability;
bestMetricArray[0][j] = LEFT;
}
// main DP recursion: fill each cell from its DIAG / UP / LEFT predecessors
for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) {
byte x = readBases[indI-1];
byte y = haplotypeBases[indJ-1];
byte qual = readQuals[indI-1];
double bestMetric = 0.0;
int bestMetricIdx = 0;
// compute metric for match/mismatch
// workaround for reads whose bases quality = 0,
if (qual < 1)
qual = 1;
if (qual > MAX_CACHED_QUAL)
qual = MAX_CACHED_QUAL;
double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
double[] metrics = new double[3];
metrics[DIAG] = pathMetricArray[indI-1][indJ-1] + pBaseRead;
metrics[UP] = pathMetricArray[indI-1][indJ] + logGapOpenProbability;//(end?0.0:logGapOpenProbability);
metrics[LEFT] = pathMetricArray[indI][indJ-1] + logGapOpenProbability;//(end?0.0:logGapOpenProbability);
if (doViterbi) {
// Viterbi: take the single best predecessor and remember it for traceback
bestMetricIdx = MathUtils.maxElementIndex(metrics);
bestMetric = metrics[bestMetricIdx];
}
else
// full-likelihood mode: soft-max over all three predecessors
bestMetric = MathUtils.softMax(metrics);
pathMetricArray[indI][indJ] = bestMetric;
bestMetricArray[indI][indJ] = bestMetricIdx;
}
}
// free trailing end gaps: the best alignment may terminate anywhere in the
// last row or the last column, so scan both for the maximum
double bestMetric=0.0;
int bestMetricIdx=0,bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1;
for (int i=0; i < X_METRIC_LENGTH; i ++ ) {
int j= Y_METRIC_LENGTH-1;
if (pathMetricArray[i][j] > bestMetric) {
bestMetric = pathMetricArray[i][j];
bestI = i;
bestJ = j;
}
}
for (int j=0; j < Y_METRIC_LENGTH; j++ ) {
int i= X_METRIC_LENGTH-1;
if (pathMetricArray[i][j] >= bestMetric) {
bestMetric = pathMetricArray[i][j];
bestI = i;
bestJ = j;
}
}
// debug-only traceback: reconstruct and print the alignment (requires the
// Viterbi pointers recorded above, hence the doViterbi guard)
if (DEBUG && doViterbi) {
String haplotypeString = new String (haplotypeBases);
String readString = new String(readBases);
int i = bestI;
int j = bestJ;
System.out.println("Simple NW");
while (i >0 || j >0) {
bestMetricIdx = bestMetricArray[i][j];
System.out.print(bestMetricIdx);
if (bestMetricIdx == UP) {
// insert gap in Y
haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j);
i--;
} else if (bestMetricIdx == LEFT) {
readString = readString.substring(0,i)+"-"+readString.substring(i);
j--;
}
else {
i--; j--;
}
}
System.out.println("\nAlignment: ");
System.out.println("R:"+readString);
System.out.println("H:"+haplotypeString);
System.out.println();
}
if (DEBUG)
System.out.format("Likelihood: %5.4f\n", bestMetric);
return bestMetric;
}
static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) {
// compute forward hrun length, example:
// AGGTGACCCCCCTGAGAG
@ -479,14 +171,10 @@ public class PairHMMIndelErrorModel {
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
// initialize path metric and traceback memories for likelihood computation
double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
int[][] bestActionArrayM = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
int[][] bestActionArrayX = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
int[][] bestActionArrayY = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
double c,d;
matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY;
for (int i=1; i < X_METRIC_LENGTH; i++) {
@ -494,8 +182,6 @@ public class PairHMMIndelErrorModel {
matchMetricArray[i][0] = Double.NEGATIVE_INFINITY;
YMetricArray[i][0] = Double.NEGATIVE_INFINITY;
XMetricArray[i][0] = END_GAP_COST*(i);//logGapOpenProbability + (i-1)*logGapContinuationProbability;
bestActionArrayX[i][0] = bestActionArrayY[i][0] = bestActionArrayM[i][0] = UP_GOTO_X;
}
for (int j=1; j < Y_METRIC_LENGTH; j++) {
@ -503,188 +189,46 @@ public class PairHMMIndelErrorModel {
matchMetricArray[0][j] = Double.NEGATIVE_INFINITY;
XMetricArray[0][j] = Double.NEGATIVE_INFINITY;
YMetricArray[0][j] = END_GAP_COST*(j);//logGapOpenProbability + (j-1) * logGapContinuationProbability;
bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y;
}
for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
int im1 = indI-1;
final int im1 = indI-1;
for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) {
int jm1 = indJ-1;
byte x = readBases[im1];
byte y = haplotypeBases[jm1];
byte qual = readQuals[im1];
double bestMetric = 0.0;
int bestMetricIdx = 0;
// compute metric for match/mismatch
// workaround for reads whose bases quality = 0,
if (qual < 1)
qual = 1;
if (qual > MAX_CACHED_QUAL)
qual = MAX_CACHED_QUAL;
double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
double[] metrics = new double[3];
if (doViterbi) {
// update match array
metrics[MATCH_OFFSET] = matchMetricArray[im1][jm1] + pBaseRead;
metrics[X_OFFSET] = XMetricArray[im1][jm1] + pBaseRead;
metrics[Y_OFFSET] = YMetricArray[im1][jm1] + pBaseRead;
bestMetricIdx = MathUtils.maxElementIndex(metrics);
bestMetric = metrics[bestMetricIdx];
}
else
bestMetric = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead,
YMetricArray[im1][jm1] + pBaseRead);
final int jm1 = indJ-1;
final byte x = readBases[im1];
final byte y = haplotypeBases[jm1];
final byte qual = readQuals[im1] < 1 ? 1 : (readQuals[im1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[im1]);
final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
double bestMetric = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead,
XMetricArray[im1][jm1] + pBaseRead,
YMetricArray[im1][jm1] + pBaseRead);
matchMetricArray[indI][indJ] = bestMetric;
bestActionArrayM[indI][indJ] = ACTIONS_M[bestMetricIdx];
// update X array
// State X(i,j): X(1:i) aligned to a gap in Y(1:j).
// When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps
//c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]);
//d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]);
if (getGapPenaltiesFromFile) {
c = currentGOP[im1];
d = logGapContinuationProbability;
} else {
c = currentGOP[jm1];
d = currentGCP[jm1];
}
if (indJ == Y_METRIC_LENGTH-1)
c = d = END_GAP_COST;
if (doViterbi) {
metrics[MATCH_OFFSET] = matchMetricArray[im1][indJ] + c;
metrics[X_OFFSET] = XMetricArray[im1][indJ] + d;
metrics[Y_OFFSET] = Double.NEGATIVE_INFINITY; //YMetricArray[indI-1][indJ] + logGapOpenProbability;
bestMetricIdx = MathUtils.maxElementIndex(metrics);
bestMetric = metrics[bestMetricIdx];
}
else
bestMetric = MathUtils.softMax(matchMetricArray[im1][indJ] + c, XMetricArray[im1][indJ] + d);
final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
bestMetric = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1);
XMetricArray[indI][indJ] = bestMetric;
bestActionArrayX[indI][indJ] = ACTIONS_X[bestMetricIdx];
// update Y array
//c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]);
//d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]);
if (getGapPenaltiesFromFile) {
c = currentGOP[im1];
d = logGapContinuationProbability;
}
else {
c = currentGOP[jm1];
d = currentGCP[jm1];
}
if (indI == X_METRIC_LENGTH-1)
c = d = END_GAP_COST;
if (doViterbi) {
metrics[MATCH_OFFSET] = matchMetricArray[indI][jm1] + c;
metrics[X_OFFSET] = Double.NEGATIVE_INFINITY; //XMetricArray[indI][indJ-1] + logGapOpenProbability;
metrics[Y_OFFSET] = YMetricArray[indI][jm1] + d;
bestMetricIdx = MathUtils.maxElementIndex(metrics);
bestMetric = metrics[bestMetricIdx];
}
else
bestMetric = MathUtils.softMax(matchMetricArray[indI][jm1] + c, YMetricArray[indI][jm1] + d);
final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
bestMetric = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2);
YMetricArray[indI][indJ] = bestMetric;
bestActionArrayY[indI][indJ] = ACTIONS_Y[bestMetricIdx];
}
}
double bestMetric;
double metrics[] = new double[3];
int bestTable=0, bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1;
metrics[MATCH_OFFSET] = matchMetricArray[bestI][bestJ];
metrics[X_OFFSET] = XMetricArray[bestI][bestJ];
metrics[Y_OFFSET] = YMetricArray[bestI][bestJ];
if (doViterbi) {
bestTable = MathUtils.maxElementIndex(metrics);
bestMetric = metrics[bestTable];
}
else
bestMetric = MathUtils.softMax(metrics);
final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1;
final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ],
XMetricArray[bestI][bestJ],
YMetricArray[bestI][bestJ]);
// Do traceback (needed only for debugging!)
if (DEBUG && doViterbi) {
int bestAction;
int i = bestI;
int j = bestJ;
System.out.println("Affine gap NW");
String haplotypeString = new String (haplotypeBases);
String readString = new String(readBases);
while (i >0 || j >0) {
if (bestTable == X_OFFSET) {
// insert gap in Y
haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j);
bestAction = bestActionArrayX[i][j];
}
else if (bestTable == Y_OFFSET) {
readString = readString.substring(0,i)+"-"+readString.substring(i);
bestAction = bestActionArrayY[i][j];
}
else {
bestAction = bestActionArrayM[i][j];
}
System.out.print(bestAction);
// bestAction contains action to take at next step
// encoding of bestAction: upper 2 bits = direction, lower 2 bits = next table
// bestTable and nextDirection for next step
bestTable = bestAction & 0x3;
int nextDirection = bestAction >> 2;
if (nextDirection == UP) {
i--;
} else if (nextDirection == LEFT) {
j--;
} else { // if (nextDirection == DIAG)
i--; j--;
}
}
System.out.println("\nAlignment: ");
System.out.println("R:"+readString);
System.out.println("H:"+haplotypeString);
System.out.println();
}
if (DEBUG)
System.out.format("Likelihood: %5.4f\n", bestMetric);
@ -707,12 +251,12 @@ public class PairHMMIndelErrorModel {
}
}
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
ReferenceContext ref, int eventLength,
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
ReferenceContext ref, int eventLength,
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
int numHaplotypes = haplotypeMap.size();
double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
double readLikelihoods[][] = new double[pileup.getReads().size()][numHaplotypes];
final double readLikelihoods[][] = new double[pileup.size()][numHaplotypes];
final int readCounts[] = new int[pileup.size()];
int readIdx=0;
LinkedHashMap<Allele,double[]> gapOpenProbabilityMap = new LinkedHashMap<Allele,double[]>();
@ -723,34 +267,35 @@ public class PairHMMIndelErrorModel {
System.out.println(new String(ref.getBases()));
}
if (doContextDependentPenalties && !getGapPenaltiesFromFile) {
// will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes.
for (Allele a: haplotypeMap.keySet()) {
Haplotype haplotype = haplotypeMap.get(a);
byte[] haplotypeBases = haplotype.getBasesAsBytes();
double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length];
double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length];
// get homopolymer length profile for current haplotype
int[] hrunProfile = new int[haplotypeBases.length];
getContextHomopolymerLength(haplotypeBases,hrunProfile);
if (DEBUG) {
System.out.println("Haplotype bases:");
System.out.println(new String(haplotypeBases));
for (int i=0; i < hrunProfile.length; i++)
System.out.format("%d",hrunProfile[i]);
System.out.println();
}
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities);
gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities);
// will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes.
// todo -- refactor into separate function
for (Allele a: haplotypeMap.keySet()) {
Haplotype haplotype = haplotypeMap.get(a);
byte[] haplotypeBases = haplotype.getBasesAsBytes();
double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length];
double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length];
// get homopolymer length profile for current haplotype
int[] hrunProfile = new int[haplotypeBases.length];
getContextHomopolymerLength(haplotypeBases,hrunProfile);
if (DEBUG) {
System.out.println("Haplotype bases:");
System.out.println(new String(haplotypeBases));
for (int i=0; i < hrunProfile.length; i++)
System.out.format("%d",hrunProfile[i]);
System.out.println();
}
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities);
gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities);
}
for (PileupElement p: pileup) {
// > 1 when the read is a consensus read representing multiple independent observations
final boolean isReduced = ReadUtils.isReducedRead(p.getRead());
readCounts[readIdx] = isReduced ? p.getReducedCount() : 1;
// check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
if (indelLikelihoodMap.containsKey(p)) {
@ -762,61 +307,20 @@ public class PairHMMIndelErrorModel {
}
else {
//System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
SAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
if ( isReduced ) {
read = ReadUtils.reducedReadWithReducedQuals(read);
}
if(ReadUtils.is454Read(read)) {
continue;
}
double[] recalQuals = null;
/*
if (getGapPenaltiesFromFile) {
RecalDataManager.parseSAMRecord( read, RAC );
recalQuals = new double[read.getReadLength()];
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if(qualityScore == null)
{
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
}
recalQuals[offset] = -((double)qualityScore)/10.0;
}
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
if (DEBUG) {
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
read.getAlignmentStart(),
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
read.getCigarString());
byte[] bases = read.getReadBases();
for (int k = 0; k < recalQuals.length; k++) {
System.out.format("%c",bases[k]);
}
System.out.println();
for (int k = 0; k < recalQuals.length; k++) {
System.out.format("%.0f ",recalQuals[k]);
}
System.out.println();
}
} */
// get bases of candidate haplotypes that overlap with reads
final int trailingBases = 3;
@ -937,11 +441,6 @@ public class PairHMMIndelErrorModel {
unclippedReadBases.length-numEndClippedBases);
double[] recalCDP = null;
if (getGapPenaltiesFromFile) {
recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
}
if (DEBUG) {
System.out.println("Read bases:");
@ -971,27 +470,9 @@ public class PairHMMIndelErrorModel {
System.out.println(new String(haplotypeBases));
}
Double readLikelihood = 0.0;
if (useAffineGapModel) {
double[] currentContextGOP = null;
double[] currentContextGCP = null;
if (doContextDependentPenalties) {
if (getGapPenaltiesFromFile) {
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
} else {
currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
}
}
}
else
readLikelihood = computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
final double readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
readEl.put(a,readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
@ -1004,7 +485,7 @@ public class PairHMMIndelErrorModel {
if (DEBUG) {
System.out.println("\nLikelihood summary");
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
for (readIdx=0; readIdx < pileup.size(); readIdx++) {
System.out.format("Read Index: %d ",readIdx);
for (int i=0; i < readLikelihoods[readIdx].length; i++)
System.out.format("L%d: %f ",i,readLikelihoods[readIdx][i]);
@ -1012,123 +493,41 @@ public class PairHMMIndelErrorModel {
}
}
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix
for (int i=0; i < numHaplotypes; i++) {
for (int j=i; j < numHaplotypes; j++){
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
for (int readIdx = 0; readIdx < readLikelihoods.length; readIdx++) {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j]))
continue;
haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i],
readLikelihoods[readIdx][j]) + LOG_ONE_HALF);
final double li = readLikelihoods[readIdx][i];
final double lj = readLikelihoods[readIdx][j];
final int readCount = readCounts[readIdx];
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF);
}
}
}
return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);
}
public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
int hSize = haplotypeLikehoodMatrix.length;
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
final double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2];
int k=0;
double maxElement = Double.NEGATIVE_INFINITY;
for (int j=0; j < hSize; j++) {
for (int j=0; j < numHaplotypes; j++) {
for (int i=0; i <= j; i++){
genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j];
if (haplotypeLikehoodMatrix[i][j] > maxElement)
maxElement = haplotypeLikehoodMatrix[i][j];
}
}
// renormalize
for (int i=0; i < genotypeLikelihoods.length; i++)
genotypeLikelihoods[i] -= maxElement;
return genotypeLikelihoods;
// renormalize so that max element is zero.
return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
}
/**
* Implements a serial recalibration of the reads using the combinational table.
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
*
* Given the full recalibration table, we perform the following preprocessing steps:
*
* - calculate the global quality score shift across all data [DeltaQ]
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
* - The final shift equation is:
*
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
* @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte
*/
/*
private byte performSequentialQualityCalculation( final Object... key ) {
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
final Object[] readGroupCollapsedKey = new Object[1];
final Object[] qualityScoreCollapsedKey = new Object[2];
final Object[] covariateCollapsedKey = new Object[3];
// The global quality shift (over the read group only)
readGroupCollapsedKey[0] = key[0];
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
double globalDeltaQ = 0.0;
if( globalRecalDatum != null ) {
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
final double aggregateQReported = globalRecalDatum.getEstimatedQReported();
globalDeltaQ = globalDeltaQEmpirical - aggregateQReported;
}
// The shift in quality between reported and empirical
qualityScoreCollapsedKey[0] = key[0];
qualityScoreCollapsedKey[1] = key[1];
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
double deltaQReported = 0.0;
if( qReportedRecalDatum != null ) {
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
}
// The shift in quality due to each covariate by itself in turn
double deltaQCovariates = 0.0;
double deltaQCovariateEmpirical;
covariateCollapsedKey[0] = key[0];
covariateCollapsedKey[1] = key[1];
for( int iii = 2; iii < key.length; iii++ ) {
covariateCollapsedKey[2] = key[iii]; // The given covariate
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
if( covariateRecalDatum != null ) {
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
}
}
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
// Verbose printouts used to validate with old recalibrator
//if(key.contains(null)) {
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
//}
//else {
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
//}
//return newQualityByte;
}
*/
}

View File

@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.*;
/**
* Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well,
* but this latter functionality is now superceded by UnifiedGenotyper.
*
* <p>
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
* data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read
* and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample,
* or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified,
* the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic
* if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains
* only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords.
* data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
 * include additional statistics such as mismatches and base qualities around the calls, read strandedness (how many
* forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
* statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
* attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
* metrics for the post-processing tools to make the final decision). The calls are performed by default
* from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I
* command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments,
* respectively. Indels are called from the tumor sample and annotated as germline
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic
* if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling
* is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold
* in tumor sample, it will not be reported.
*
* <b>If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you,
* please email asivache at broadinstitute dot org and he will gladly explain everything in more detail.</b>
* To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input
* bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
* on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
*
* <h2>Input</h2>
* <p>
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
* </p>
*
* <h2>Output</h2>
* <p>
* Indel calls with associated metrics.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SomaticIndelDetector \
* -o indels.vcf \
* -verbose indels.txt
* -I:normal normal.bam \
* -I:tumor tumor.bam
* </pre>
*
*/
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
// @Output
// PrintStream out;
@Output(doc="File to which variants should be written",required=true)
@Output(doc="File to write variants (indels) in VCF format",required=true)
protected VCFWriter vcf_writer = null;
@Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
@Hidden
@Argument(fullName = "genotype_intervals", shortName = "genotype",
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false)
public String genotypeIntervalsFile = null;
@Hidden
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
"if the list turns out to be unsorted, it will throw an exception. "+
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
"to sort and keep it in memory (increases memory usage!).")
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
"if the list turns out to be unsorted, it will throw an exception. "+
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
"to sort and keep it in memory (increases memory usage!).")
protected boolean GENOTYPE_NOT_SORTED = false;
@Hidden
@Argument(fullName="unpaired", shortName="unpaired",
doc="Perform unpaired calls (no somatic status detection)", required=false)
@Argument(fullName="unpaired", shortName="unpaired",
doc="Perform unpaired calls (no somatic status detection)", required=false)
boolean call_unpaired = false;
boolean call_somatic ;
boolean call_somatic ;
@Argument(fullName="verboseOutput", shortName="verbose",
doc="Verbose output file in text format", required=false)
java.io.File verboseOutput = null;
@Argument(fullName="verboseOutput", shortName="verbose",
doc="Verbose output file in text format", required=false)
java.io.File verboseOutput = null;
@Argument(fullName="bedOutput", shortName="bed",
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
java.io.File bedOutput = null;
@Argument(fullName="minCoverage", shortName="minCoverage",
doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false)
int minCoverage = 6;
@Argument(fullName="minCoverage", shortName="minCoverage",
doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
"with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
int minCoverage = 6;
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
int minNormalCoverage = 4;
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+
"or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
int minNormalCoverage = 4;
@Argument(fullName="minFraction", shortName="minFraction",
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
double minFraction = 0.3;
@Argument(fullName="minFraction", shortName="minFraction",
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
double minFraction = 0.3;
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false)
double minConsensusFraction = 0.7;
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+
"all indel observations at the site exceeds this threshold", required=false)
double minConsensusFraction = 0.7;
@Argument(fullName="minIndelCount", shortName="minCnt",
doc="Minimum count of reads supporting consensus indel required for making the call. "+
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
"(minIndelCount not met) will not pass.", required=false)
int minIndelCount = 0;
@Argument(fullName="minIndelCount", shortName="minCnt",
doc="Minimum count of reads supporting consensus indel required for making the call. "+
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
"(minIndelCount not met) will not pass.", required=false)
int minIndelCount = 0;
@Argument(fullName="refseq", shortName="refseq",
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
String RefseqFileName = null;
@Argument(fullName="refseq", shortName="refseq",
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+
"GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
String RefseqFileName = null;
@Argument(fullName="blacklistedLanes", shortName="BL",
doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
"by this application, so they will not contribute indels to consider and will not be counted.", required=false)
PlatformUnitFilterHelper dummy;
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false;
//@Argument(fullName="blacklistedLanes", shortName="BL",
// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
//PlatformUnitFilterHelper dummy;
@Hidden
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",
required=false) Boolean DEBUG = false;
@Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
"May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200;
"May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+
"window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+
"than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+
"if window is too small",required=false) int WINDOW_SIZE = 200;
@Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
" the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
private WindowContext tumor_context;
private WindowContext normal_context;
private int currentContigIndex = -1;

View File

@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray {
public PhasingRead(int length, int mappingQual) {
super(length);
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual));
this.baseProbs = new PreciseNonNegativeDouble[length];
Arrays.fill(this.baseProbs, null);

View File

@ -44,12 +44,12 @@ public class RefSeqDataParser {
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
Map<String, String> entriesToNames = new HashMap<String, String>();
Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
if (numRecords != null) {
int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1);
if (numRecords != -1) {
boolean done = false;
if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1":
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
done = true;
@ -59,14 +59,14 @@ public class RefSeqDataParser {
if (!done) {
for (int i = 1; i <= numRecords; i++) {
String key = nameKeyToUseMultiplePrefix + i;
String name = vc.getAttributeAsStringNoException(key);
String name = vc.getAttributeAsString(key, null);
if (name != null)
entriesToNames.put(key, name);
}
}
}
else { // no entry with the # of records:
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
}

View File

@ -42,6 +42,7 @@ import java.util.*;
*
* <p>Body test</p>
*/
@Hidden
public class DocumentationTest extends RodWalker<Integer, Integer> {
// the docs for the arguments are in the collection
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();

View File

@ -76,6 +76,42 @@ import java.util.Map;
* <h2>Output</h2>
* <p>
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
*
 * The first 20 lines of such a file are shown below.
* * The file begins with a series of comment lines describing:
* ** The number of counted loci
* ** The number of counted bases
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
*
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
*
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*
* <pre>
* # Counted Sites 19451059
* # Counted Bases 56582018
* # Skipped Sites 82666
* # Fraction Skipped 1 / 235 bp
* ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
* SRR006446,11,65,CA,9,1,10
* SRR006446,11,48,TA,10,0,40
* SRR006446,11,67,AA,27,0,40
* SRR006446,11,61,GA,11,1,10
* SRR006446,12,34,CA,47,1,17
* SRR006446,12,30,GA,52,1,17
* SRR006446,12,36,AA,352,1,25
* SRR006446,12,17,TA,182,11,12
* SRR006446,11,48,TG,2,0,40
* SRR006446,11,67,AG,1,0,40
* SRR006446,12,34,CG,9,0,40
* SRR006446,12,30,GG,43,0,40
* ERR001876,4,31,AG,1,0,40
* ERR001876,4,31,AT,2,2,1
* ERR001876,4,31,CA,1,0,40
* </pre>
* </p>
*
* <h2>Examples</h2>

View File

@ -61,7 +61,7 @@ import java.util.List;
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
*</pre>
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
*
*<pre>
* Valid // amplicon is valid
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
@ -72,10 +72,10 @@ import java.util.List;
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
 * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or following, or a deletion that spans the site itself)
* </p>
* </pre></p>
*
* <h2>Examples</h2>
* <pre></pre>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T ValidationAmplicons

View File

@ -55,7 +55,23 @@ import java.util.*;
*
* <h2>Output</h2>
* <p>
* Evaluation tables.
* Evaluation tables detailing the results of the eval modules which were applied.
* For example:
* <pre>
* output.eval.gatkreport:
* ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
* CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ...
* CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ...
* CountVariants dbsnp CpG eval none known 65900028 47068 0 47068 0.00071423 ...
* CountVariants dbsnp CpG eval none novel 65900028 88702 0 88702 0.00134601 ...
* CountVariants dbsnp all eval none all 65900028 330818 0 330818 0.00502000 ...
* CountVariants dbsnp all eval none known 65900028 120685 0 120685 0.00183133 ...
* CountVariants dbsnp all eval none novel 65900028 210133 0 210133 0.00318866 ...
* CountVariants dbsnp non_CpG eval none all 65900028 195048 0 195048 0.00295976 ...
* CountVariants dbsnp non_CpG eval none known 65900028 73617 0 73617 0.00111710 ...
* CountVariants dbsnp non_CpG eval none novel 65900028 121431 0 121431 0.00184265 ...
* ...
* </pre>
* </p>
*
* <h2>Examples</h2>
@ -149,12 +165,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false)
private String TRANCHE_FILENAME = null;
@Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false)
private File ancestralAlignmentsFile = null;
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
private boolean requireStrictAlleleMatch = false;
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
@ -226,16 +242,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
}
sampleNamesForStratification.add(ALL_SAMPLE_NAME);
// Add select expressions for anything in the tranches file
if ( TRANCHE_FILENAME != null ) {
// we are going to build a few select names automatically from the tranches file
for ( Tranche t : Tranche.readTranches(new File(TRANCHE_FILENAME)) ) {
logger.info("Adding select for all variant above the pCut of : " + t);
SELECT_EXPS.add(String.format(VariantRecalibrator.VQS_LOD_KEY + " >= %.2f", t.minVQSLod));
SELECT_NAMES.add(String.format("TS-%.2f", t.ts));
}
}
// Initialize select expressions
for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) {
SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp);
@ -245,18 +251,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// Initialize the set of stratifications and evaluations to use
stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
Set<Class<? extends VariantEvaluator>> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
boolean usingJEXL = false;
for ( VariantStratifier vs : getStratificationObjects() ) {
if ( vs.getClass().getSimpleName().equals("Filter") )
byFilterIsEnabled = true;
else if ( vs.getClass().getSimpleName().equals("Sample") )
perSampleIsEnabled = true;
usingJEXL = usingJEXL || vs.getClass().equals(JexlExpression.class);
}
if ( TRANCHE_FILENAME != null && ! usingJEXL )
throw new UserException.BadArgumentValue("tf", "Requires the JexlExpression ST to enabled");
// Initialize the evaluation contexts
evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null);
@ -378,16 +379,16 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
if ( matchingComps.size() == 0 )
return null;
// find the comp which matches the alternate allele from eval
// find the comp which matches both the reference allele and alternate allele from eval
Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0);
for ( VariantContext comp : matchingComps ) {
Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0);
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp)) )
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) )
return comp;
}
// if none match, just return the first one
return matchingComps.get(0);
// if none match, just return the first one unless we require a strict match
return (requireStrictAlleleMatch ? null : matchingComps.get(0));
}
public Integer treeReduce(Integer lhs, Integer rhs) { return null; }

View File

@ -22,9 +22,6 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
@DataPoint(description = "number of eval SNP sites")
long nEvalVariants = 0;
@DataPoint(description = "number of comp SNP sites")
long nCompVariants = 0;
@DataPoint(description = "number of eval sites outside of comp sites")
long novelSites = 0;
@ -75,10 +72,9 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
}
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
boolean evalIsGood = eval != null && eval.isVariant();
boolean compIsGood = comp != null && comp.isNotFiltered() && (eval == null || comp.getType() == eval.getType());
boolean evalIsGood = eval != null && eval.isPolymorphic();
boolean compIsGood = comp != null && comp.isNotFiltered();
if (compIsGood) nCompVariants++; // count the number of comp events
if (evalIsGood) nEvalVariants++; // count the number of eval events
if (compIsGood && evalIsGood) {

View File

@ -100,21 +100,22 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
// added in a proxy check for monomorphic status here.
// Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call.
if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() + vc1.getNoCallCount() == vc1.getNSamples()) ) {
if ( vc1.isMonomorphic() ) {
nRefLoci++;
} else {
switch (vc1.getType()) {
case NO_VARIATION:
// shouldn't get here
break;
case SNP:
nVariantLoci++;
nSNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
break;
case MNP:
nVariantLoci++;
nMNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
break;
case INDEL:
nVariantLoci++;
@ -136,7 +137,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
String refStr = vc1.getReference().getBaseString().toUpperCase();
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null;
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null;
// if (aaStr.equals(".")) {
// aaStr = refStr;
// }

View File

@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
}
public static Double getPQ(Genotype gt) {
return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY);
Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
return d == -1 ? null : d;
}
public static boolean topMatchesTop(AllelePair b1, AllelePair b2) {

View File

@ -90,18 +90,19 @@ public class IndelLengthHistogram extends VariantEvaluator {
public int getComparisonOrder() { return 1; } // need only the evals
public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( ! vc1.isBiallelic() && vc1.isIndel() ) {
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
return vc1.toString(); // biallelic sites are output
}
if ( vc1.isIndel() ) {
if ( vc1.isIndel() && vc1.isPolymorphic() ) {
if ( ! vc1.isBiallelic() ) {
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
return vc1.toString(); // biallelic sites are output
}
// only count simple insertions/deletions, not complex indels
if ( vc1.isSimpleInsertion() ) {
indelHistogram.update(vc1.getAlternateAllele(0).length());
} else if ( vc1.isSimpleDeletion() ) {
indelHistogram.update(-vc1.getReference().length());
} else {
throw new ReviewedStingException("Indel type that is not insertion or deletion.");
}
}

View File

@ -270,7 +270,7 @@ public class IndelStatistics extends VariantEvaluator {
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (eval != null ) {
if (eval != null && eval.isPolymorphic()) {
if ( indelStats == null ) {
indelStats = new IndelStats(eval);
}

View File

@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
if ( eval.hasGenotypes() )
ac = eval.getChromosomeCount(eval.getAlternateAllele(0));
else if ( eval.hasAttribute("AC") ) {
ac = Integer.valueOf(eval.getAttributeAsString("AC"));
ac = eval.getAttributeAsInt("AC", -1);
}
if ( ac != -1 ) {
@ -166,7 +166,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
}
}
if ( eval.isSNP() && eval.isBiallelic() && metrics != null ) {
if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) {
metrics.incrValue(eval);
}
}

View File

@ -37,77 +37,74 @@ public class ThetaVariantEvaluator extends VariantEvaluator {
}
public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) {
if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) {
return null; //no interesting sites
}
if (vc.hasGenotypes()) {
//this maps allele to a count
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
//this maps allele to a count
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
int numHetsHere = 0;
float numGenosHere = 0;
int numIndsHere = 0;
int numHetsHere = 0;
float numGenosHere = 0;
int numIndsHere = 0;
for (Genotype genotype : vc.getGenotypes().values()) {
numIndsHere++;
if (!genotype.isNoCall()) {
//increment stats for heterozygosity
if (genotype.isHet()) {
numHetsHere++;
}
for (Genotype genotype : vc.getGenotypes().values()) {
numIndsHere++;
if (!genotype.isNoCall()) {
//increment stats for heterozygosity
if (genotype.isHet()) {
numHetsHere++;
}
numGenosHere++;
//increment stats for pairwise mismatches
numGenosHere++;
//increment stats for pairwise mismatches
for (Allele allele : genotype.getAlleles()) {
if (allele.isNonNull() && allele.isCalled()) {
String alleleString = allele.toString();
alleleCounts.putIfAbsent(alleleString, 0);
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
}
for (Allele allele : genotype.getAlleles()) {
if (allele.isNonNull() && allele.isCalled()) {
String alleleString = allele.toString();
alleleCounts.putIfAbsent(alleleString, 0);
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
}
}
}
if (numGenosHere > 0) {
//only if have one called genotype at least
this.numSites++;
}
if (numGenosHere > 0) {
//only if have one called genotype at least
this.numSites++;
this.totalHet += numHetsHere / numGenosHere;
this.totalHet += numHetsHere / numGenosHere;
//compute based on num sites
float harmonicFactor = 0;
for (int i = 1; i <= numIndsHere; i++) {
harmonicFactor += 1.0 / i;
}
this.thetaRegionNumSites += 1.0 / harmonicFactor;
//compute based on num sites
float harmonicFactor = 0;
for (int i = 1; i <= numIndsHere; i++) {
harmonicFactor += 1.0 / i;
}
this.thetaRegionNumSites += 1.0 / harmonicFactor;
//now compute pairwise mismatches
float numPairwise = 0;
float numDiffs = 0;
for (String allele1 : alleleCounts.keySet()) {
int allele1Count = alleleCounts.get(allele1);
//now compute pairwise mismatches
float numPairwise = 0;
float numDiffs = 0;
for (String allele1 : alleleCounts.keySet()) {
int allele1Count = alleleCounts.get(allele1);
for (String allele2 : alleleCounts.keySet()) {
if (allele1.compareTo(allele2) < 0) {
continue;
}
if (allele1 .compareTo(allele2) == 0) {
numPairwise += allele1Count * (allele1Count - 1) * .5;
for (String allele2 : alleleCounts.keySet()) {
if (allele1.compareTo(allele2) < 0) {
continue;
}
if (allele1 .compareTo(allele2) == 0) {
numPairwise += allele1Count * (allele1Count - 1) * .5;
}
else {
int allele2Count = alleleCounts.get(allele2);
numPairwise += allele1Count * allele2Count;
numDiffs += allele1Count * allele2Count;
}
}
else {
int allele2Count = alleleCounts.get(allele2);
numPairwise += allele1Count * allele2Count;
numDiffs += allele1Count * allele2Count;
}
}
}
if (numPairwise > 0) {
this.totalAvgDiffs += numDiffs / numPairwise;
}
if (numPairwise > 0) {
this.totalAvgDiffs += numDiffs / numPairwise;
}
}

View File

@ -40,7 +40,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
}
public void updateTiTv(VariantContext vc, boolean updateStandard) {
if (vc != null && vc.isSNP() && vc.isBiallelic()) {
if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) {
if (VariantContextUtils.isTransition(vc)) {
if (updateStandard) nTiInComp++;
else nTi++;
@ -49,18 +49,14 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
else nTv++;
}
String refStr = vc.getReference().getBaseString().toUpperCase();
String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase();
if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) {
BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]);
//System.out.println(refStr + " " + vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase() + " " + aaSubType);
if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSITION) {
nTiDerived++;
} else if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSVERSION) {
nTvDerived++;
if (vc.hasAttribute("ANCESTRALALLELE")) {
final String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", "null").toUpperCase();
if ( ! aaStr.equals(".") ) {
switch ( BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0] ) ) {
case TRANSITION: nTiDerived++; break;
case TRANSVERSION: nTvDerived++; break;
default: break;
}
}
}
}

View File

@ -117,7 +117,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
public SiteStatus calcSiteStatus(VariantContext vc) {
if ( vc == null ) return SiteStatus.NO_CALL;
if ( vc.isFiltered() ) return SiteStatus.FILTERED;
if ( ! vc.isVariant() ) return SiteStatus.MONO;
if ( vc.isMonomorphic() ) return SiteStatus.MONO;
if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes
if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
int ac = 0;
@ -130,10 +131,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
//// System.out.printf(" ac = %d%n", ac);
}
else
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY);
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO;
} else if ( vc.hasGenotypes() ) {
return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO;
} else {
return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do
//return SiteStatus.NO_CALL; // we can't figure out what to do

View File

@ -232,7 +232,7 @@ public class VariantQualityScore extends VariantEvaluator {
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
final String interesting = null;
if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
if( titvStats == null ) { titvStats = new TiTvStats(); }
titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval));

View File

@ -44,7 +44,7 @@ public class AlleleCount extends VariantStratifier {
if (eval != null) {
int AC = -1;
if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) {
AC = eval.getAttributeAsInt("AC");
AC = eval.getAttributeAsInt("AC", 0);
} else if ( eval.isVariant() ) {
for (Allele allele : eval.getAlternateAlleles())
AC = Math.max(AC, eval.getChromosomeCount(allele));

View File

@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier {
if (eval != null) {
try {
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3))));
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3))));
} catch (Exception e) {
return relevantStates;
}

View File

@ -90,8 +90,8 @@ public class Degeneracy extends VariantStratifier {
Integer frame = null;
if (eval.hasAttribute("refseq.functionalClass")) {
aa = eval.getAttributeAsString("refseq.variantAA");
frame = eval.getAttributeAsInt("refseq.frame");
aa = eval.getAttributeAsString("refseq.variantAA", null);
frame = eval.getAttributeAsInt("refseq.frame", 0);
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
int annotationId = 1;
String key;
@ -99,7 +99,7 @@ public class Degeneracy extends VariantStratifier {
do {
key = String.format("refseq.functionalClass_%d", annotationId);
String newtype = eval.getAttributeAsString(key);
String newtype = eval.getAttributeAsString(key, null);
if ( newtype != null &&
( type == null ||
@ -109,13 +109,13 @@ public class Degeneracy extends VariantStratifier {
type = newtype;
String aakey = String.format("refseq.variantAA_%d", annotationId);
aa = eval.getAttributeAsString(aakey);
aa = eval.getAttributeAsString(aakey, null);
if (aa != null) {
String framekey = String.format("refseq.frame_%d", annotationId);
if (eval.hasAttribute(framekey)) {
frame = eval.getAttributeAsInt(framekey);
frame = eval.getAttributeAsInt(framekey, 0);
}
}
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
@ -11,25 +12,34 @@ import java.util.List;
* Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation.
*/
public class FunctionalClass extends VariantStratifier {
@Override
public void initialize() {
states.add("all");
states.add("silent");
states.add("missense");
states.add("nonsense");
public enum FunctionalType {
silent,
missense,
nonsense
}
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
@Override
public void initialize() {
states.add("all");
for ( FunctionalType type : FunctionalType.values() )
states.add(type.name());
}
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
ArrayList<String> relevantStates = new ArrayList<String>();
relevantStates.add("all");
if (eval != null && eval.isVariant()) {
String type = null;
FunctionalType type = null;
if (eval.hasAttribute("refseq.functionalClass")) {
type = eval.getAttributeAsString("refseq.functionalClass");
try {
type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass", null));
} catch ( Exception e ) {} // don't error out if the type isn't supported
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
int annotationId = 1;
String key;
@ -37,24 +47,36 @@ public class FunctionalClass extends VariantStratifier {
do {
key = String.format("refseq.functionalClass_%d", annotationId);
String newtype = eval.getAttributeAsString(key);
if ( newtype != null && !newtype.equalsIgnoreCase("null") &&
( type == null ||
( type.equals("silent") && !newtype.equals("silent") ) ||
( type.equals("missense") && newtype.equals("nonsense") ) )
) {
type = newtype;
String newtypeStr = eval.getAttributeAsString(key, null);
if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) {
try {
FunctionalType newType = FunctionalType.valueOf(newtypeStr);
if ( type == null ||
( type == FunctionalType.silent && newType != FunctionalType.silent ) ||
( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) {
type = newType;
}
} catch ( Exception e ) {} // don't error out if the type isn't supported
}
annotationId++;
} while (eval.hasAttribute(key));
} else if ( eval.hasAttribute(SnpEff.InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()) ) {
try {
SnpEff.EffectFunctionalClass snpEffFunctionalClass = SnpEff.EffectFunctionalClass.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()).toString());
if ( snpEffFunctionalClass == SnpEff.EffectFunctionalClass.NONSENSE )
type = FunctionalType.nonsense;
else if ( snpEffFunctionalClass == SnpEff.EffectFunctionalClass.MISSENSE )
type = FunctionalType.missense;
else if ( snpEffFunctionalClass == SnpEff.EffectFunctionalClass.SILENT )
type = FunctionalType.silent;
}
catch ( Exception e ) {} // don't error out if the type isn't supported
}
if (type != null) {
if (type.equals("silent")) { relevantStates.add("silent"); }
else if (type.equals("missense")) { relevantStates.add("missense"); }
else if (type.equals("nonsense")) { relevantStates.add("nonsense"); }
if ( type != null ) {
relevantStates.add(type.name());
}
}

View File

@ -277,7 +277,7 @@ public class VariantEvalUtils {
* @return a new VariantContext with just the requested samples
*/
public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection<String> sampleNames) {
VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values());
VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles());
HashMap<String, Object> newAts = new HashMap<String, Object>(vcsub.getAttributes());
@ -354,7 +354,7 @@ public class VariantEvalUtils {
private void addMapping(HashMap<String, Set<VariantContext>> mappings, String sample, VariantContext vc) {
if ( !mappings.containsKey(sample) )
mappings.put(sample, new HashSet<VariantContext>());
mappings.put(sample, new LinkedHashSet<VariantContext>());
mappings.get(sample).add(vc);
}

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/12/11
*/
public class TrainingSet {

    // Public mutable fields are read directly by the variant recalibration walkers;
    // keeping them public (not accessor-wrapped) preserves the existing caller interface.
    public RodBinding<VariantContext> rodBinding;
    public boolean isKnown = false;
    public boolean isTraining = false;
    public boolean isAntiTraining = false;
    public boolean isTruth = false;
    public boolean isConsensus = false;
    public double prior = 0.0;

    protected final static Logger logger = Logger.getLogger(TrainingSet.class);

    /**
     * Wraps a ROD binding for a VCF track and parses its command-line tags to decide
     * how the track participates in variant recalibration (known / training / truth /
     * bad-sites / consensus) and with what prior.
     *
     * Recognized tags: "known", "training", "bad", "truth", "consensus" (each set when
     * the tag value is exactly the string "true") and "prior" (parsed as a double;
     * a malformed value propagates NumberFormatException to the caller, as before).
     * The detected properties are logged so the user can confirm the configuration.
     *
     * @param rodBinding the bound track whose tags are inspected; assumed non-null
     *                   (NOTE(review): no null check here — confirm callers guarantee it)
     */
    public TrainingSet( final RodBinding<VariantContext> rodBinding ) {
        this.rodBinding = rodBinding;
        final Tags tags = rodBinding.getTags();
        final String name = rodBinding.getName();

        // Parse the tags to decide which tracks have which properties
        if( tags != null ) {
            isKnown = hasTrueTag(tags, "known");
            isTraining = hasTrueTag(tags, "training");
            isAntiTraining = hasTrueTag(tags, "bad");
            isTruth = hasTrueTag(tags, "truth");
            isConsensus = hasTrueTag(tags, "consensus");
            prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior );
        }

        // Report back to the user which tracks were found and the properties that were detected
        if( !isConsensus && !isAntiTraining ) {
            logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) );
        } else if( isConsensus ) {
            logger.info( String.format( "Found consensus track: %s", name) );
        } else {
            logger.info( String.format( "Found bad sites training track: %s", name) );
        }
    }

    /**
     * True iff {@code key} is present in {@code tags} and its value is exactly "true".
     * Case-sensitive comparison, matching the original inline checks.
     */
    private static boolean hasTrueTag( final Tags tags, final String key ) {
        return tags.containsKey(key) && tags.getValue(key).equals("true");
    }
}

Some files were not shown because too many files have changed in this diff Show More