Bringing latest updates of ReduceReads to the master repository

This commit is contained in:
Mauricio Carneiro 2011-09-20 16:35:09 -04:00
commit 758ecf2d43
141 changed files with 3245 additions and 2616 deletions

299
build.xml
View File

@ -163,6 +163,14 @@
<!-- Remove old versions of ivy jars AFTER the ivy:retrieve has been class loaded. -->
<delete file="${ivy.jar.dir}/ivy-2.0.0.jar"/>
<delete file="${ivy.jar.dir}/ivy-2.2.0-rc1.jar"/>
<!--
An old versions of the ivy-1.4.1.xml does not contain /ivy-module/configuration/conf/@name="compile".
Easier to upgrade to 1.4.4 than try to deal with xmlproperty and conditional deletion in ant.
Just in case we remove explicit 1.4.4 and go back to 1.4.1, try to clean out the file for now.
-->
<delete file="${ivy.home}/cache/javax.mail/mail/ivy-1.4.1.xml"/>
<delete file="${ivy.home}/cache/javax.mail/mail/ivydata-1.4.1.properties"/>
<delete file="${ivy.home}/cache/javax.mail/mail/jars/mail-1.4.1.jar"/>
</target>
<target name="init.buildall">
@ -709,53 +717,6 @@
</antcall>
</target>
<target name="test.init.compile">
<mkdir dir="${java.test.classes}"/>
<mkdir dir="${scala.test.classes}"/>
<antcall target="resolve">
<param name="ivy.conf" value="test"/>
</antcall>
</target>
<target name="test.java.compile" depends="init.buildall,dist,test.init.compile">
<echo message="Sting: Compiling test cases!"/>
<javac fork="true" memoryMaximumSize="512m" destdir="${java.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
<src path="${java.public.test.sources}"/>
<src path="${java.private.test.sources}"/>
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.classes}"/>
<pathelement location="${java.contracts}"/>
<pathelement location="${lib.dir}/testng-5.14.1.jar"/>
</classpath>
<compilerarg value="-proc:none"/>
<!--
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
-->
</javac>
</target>
<target name="test.scala.compile" depends="test.java.compile,scala.compile" if="scala.include">
<echo message="Scala: Compiling test cases!"/>
<antcall target="resolve">
<param name="ivy.conf" value="test"/>
</antcall>
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.test.classes}" deprecation="yes" unchecked="yes">
<src path="${scala.public.test.sources}" />
<src path="${scala.private.test.sources}" />
<include name="**/*.scala"/>
<classpath>
<path refid="scala.dependencies"/>
<pathelement location="${scala.test.classes}"/>
<pathelement location="${java.test.classes}"/>
<pathelement location="${lib.dir}/testng-5.14.1.jar"/>
</classpath>
</scalac>
</target>
<target name="test.compile" depends="init.usecontracts,test.java.compile,test.scala.compile" />
<!-- new scala target -->
<target name="scala" description="build the scala directory">
@ -769,20 +730,113 @@
<!-- ***************************************************************************** -->
<!-- where to put reports and tests-->
<property name="report" value="${build.dir}/report"/>
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
<property name="test.output" value="${dist.dir}/test"/>
<property name="java.public.test.sources" value="public/java/test"/>
<property name="java.private.test.sources" value="private/java/test"/>
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
<property name="java.public.test.classes" value="${java.test.classes}/public"/>
<property name="java.private.test.classes" value="${java.test.classes}/private"/>
<property name="java.public.test.sources" value="${public.dir}/java/test"/>
<property name="java.private.test.sources" value="${private.dir}/java/test"/>
<property name="scala.test.classes" value="${build.dir}/scala/testclasses"/>
<property name="scala.public.test.sources" value="public/scala/test"/>
<property name="scala.private.test.sources" value="private/scala/test"/>
<property name="scala.public.test.classes" value="${scala.test.classes}/public"/>
<property name="scala.private.test.classes" value="${scala.test.classes}/private"/>
<property name="scala.public.test.sources" value="${public.dir}/scala/test"/>
<property name="scala.private.test.sources" value="${private.dir}/scala/test"/>
<property name="testng.jar" value="${lib.dir}/testng-5.14.1.jar"/>
<!-- provide a ceiling on the memory that unit/integration tests can consume. -->
<property name="test.maxmemory" value="4g"/>
<target name="test.init.compile">
<mkdir dir="${java.test.classes}"/>
<mkdir dir="${scala.test.classes}"/>
<antcall target="resolve">
<param name="ivy.conf" value="test"/>
</antcall>
</target>
<target name="test.java.public.compile" depends="dist,test.init.compile">
<mkdir dir="${java.public.test.classes}"/>
<echo message="Sting: Compiling public test cases!"/>
<javac fork="true" memoryMaximumSize="512m" destdir="${java.public.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
<src path="${java.public.test.sources}"/>
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.classes}"/>
<pathelement location="${java.contracts}"/>
<pathelement location="${testng.jar}"/>
</classpath>
<compilerarg value="-proc:none"/>
<!--
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
-->
</javac>
</target>
<target name="test.java.private.compile" depends="dist,test.init.compile,test.java.public.compile" if="include.private">
<mkdir dir="${java.private.test.classes}"/>
<echo message="Sting: Compiling private test cases!"/>
<javac fork="true" memoryMaximumSize="512m" destdir="${java.private.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
<src path="${java.private.test.sources}"/>
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.public.test.classes}"/>
<pathelement location="${java.classes}"/>
<pathelement location="${java.contracts}"/>
<pathelement location="${testng.jar}"/>
</classpath>
<compilerarg value="-proc:none"/>
<!--
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
-->
</javac>
</target>
<target name="test.java.compile" depends="test.java.public.compile, test.java.private.compile"/>
<target name="test.scala.public.compile" depends="test.java.compile,scala.compile" if="scala.include">
<mkdir dir="${scala.public.test.classes}"/>
<echo message="Scala: Compiling public test cases!"/>
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.public.test.classes}" deprecation="yes" unchecked="yes">
<src path="${scala.public.test.sources}" />
<classpath>
<path refid="scala.dependencies"/>
<pathelement location="${java.public.test.classes}"/>
<pathelement location="${testng.jar}"/>
</classpath>
</scalac>
</target>
<target name="test.scala.private.compile" depends="test.java.compile,scala.compile,test.scala.public.compile" if="include.scala.private">
<mkdir dir="${scala.private.test.classes}"/>
<echo message="Scala: Compiling private test cases!"/>
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.private.test.classes}" deprecation="yes" unchecked="yes">
<src path="${scala.private.test.sources}" />
<classpath>
<path refid="scala.dependencies"/>
<pathelement location="${scala.public.test.classes}"/>
<pathelement location="${java.public.test.classes}"/>
<pathelement location="${java.private.test.classes}"/>
<pathelement location="${testng.jar}"/>
</classpath>
</scalac>
</target>
<target name="test.scala.compile" depends="test.scala.public.compile,test.scala.private.compile"/>
<target name="test.compile" depends="init.usecontracts,test.java.compile,test.scala.compile" />
<!-- TEST -->
<macrodef name="run-test">
<attribute name="testtype"/>
<attribute name="outputdir"/>
<attribute name="runfailed"/>
<sequential>
<condition property="run.failed.tests">
<equals arg1="@{runfailed}" arg2="true"/>
</condition>
<!-- Get the pipeline run type. Default to dry. -->
<condition property="pipeline.run" value="dry" else="${pipeline.run}">
<equals arg1="${pipeline.run}" arg2="$${pipeline.run}" />
@ -792,10 +846,10 @@
<isset property="include.contracts" />
</condition>
<mkdir dir="${report}/@{testtype}"/>
<mkdir dir="@{outputdir}"/>
<echo message="Sting: Running @{testtype} test cases!"/>
<taskdef resource="testngtasks" classpath="${lib.dir}/testng-5.14.1.jar"/>
<testng outputDir="${report}/@{testtype}"
<taskdef resource="testngtasks" classpath="${testng.jar}"/>
<testng outputDir="@{outputdir}"
haltOnFailure="false" failureProperty="test.failure"
verbose="2"
workingDir="${basedir}"
@ -813,117 +867,108 @@
<pathelement location="${java.classes}" />
<pathelement location="${scala.classes}" />
<pathelement location="${java.contracts}" />
<pathelement location="${java.test.classes}" />
<pathelement location="${scala.test.classes}" />
<pathelement location="${java.public.test.classes}" />
<pathelement location="${java.private.test.classes}" />
<pathelement location="${scala.public.test.classes}" />
<pathelement location="${scala.private.test.classes}" />
</classpath>
<classfileset dir="${java.test.classes}" includes="**/@{testtype}.class"/>
<classfileset dir="${scala.test.classes}" includes="**/@{testtype}*.class" />
<classfileset dir="${java.public.test.classes}" includes="**/@{testtype}.class"/>
<classfileset dir="${java.private.test.classes}" erroronmissingdir="false">
<include name="**/@{testtype}.class" if="include.private"/>
</classfileset>
<classfileset dir="${scala.public.test.classes}" erroronmissingdir="false">
<include name="**/@{testtype}*.class" if="scala.include"/>
</classfileset>
<classfileset dir="${scala.private.test.classes}" erroronmissingdir="false">
<include name="**/@{testtype}*.class" if="include.scala.private"/>
</classfileset>
<xmlfileset dir="${basedir}">
<include name="@{testtype}" if="run.failed.tests"/>
</xmlfileset>
</testng>
<!-- generate a report for Bamboo or Hudson to read in -->
<junitreport todir="${report}/@{testtype}">
<fileset dir="${report}/@{testtype}">
<junitreport todir="@{outputdir}">
<fileset dir="@{outputdir}">
<include name="*/*.xml"/>
</fileset>
<report format="noframes" todir="${report}/@{testtype}"/>
<report format="noframes" todir="@{outputdir}"/>
</junitreport>
<fail message="test failed" if="test.failure" />
</sequential>
</macrodef>
<!-- FAILED-TEST -->
<macrodef name="run-failed-test">
<attribute name="xmlfailedtestfile" />
<sequential>
<!-- Get the pipeline run type. Default to dry. -->
<condition property="pipeline.run" value="dry" else="${pipeline.run}">
<equals arg1="${pipeline.run}" arg2="$${pipeline.run}" />
</condition>
<condition property="cofoja.jvm.args" value="-javaagent:${cofoja.jar} -Dcom.google.java.contract.log.contract=false" else="">
<isset property="include.contracts" />
</condition>
<mkdir dir="${report}/failed_rerun" />
<echo message="Sting: Running @{xmlfailedtestfile} test cases!"/>
<taskdef resource="testngtasks" classpath="${lib.dir}/testng-5.14.1.jar"/>
<testng outputDir="${report}/failed_rerun"
haltOnFailure="false" failureProperty="test.failure"
verbose="2"
workingDir="${basedir}"
useDefaultListeners="false"
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.StingTextReporter">
<jvmarg value="-Xmx${test.maxmemory}" />
<jvmarg value="-Djava.awt.headless=true" />
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
<jvmarg line="${cofoja.jvm.args}"/>
<!-- <jvmarg value="-Xdebug"/> -->
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/> -->
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.classes}" />
<pathelement location="${scala.classes}" />
<pathelement location="${java.contracts}" />
<pathelement location="${java.test.classes}" />
<pathelement location="${scala.test.classes}" />
</classpath>
<xmlfileset dir="${basedir}" includes="@{xmlfailedtestfile}" />
</testng>
<fail message="test failed" if="test.failure" />
</sequential>
</macrodef>
<!-- our three different test conditions: Test, IntegrationTest, PerformanceTest -->
<target name="test" depends="test.compile" description="Run unit tests">
<target name="alltests">
<antcall target="test" inheritAll="false"/>
<antcall target="integrationtest" inheritAll="false"/>
<antcall target="pipelinetest" inheritAll="false"/>
</target>
<target name="alltests.public">
<antcall target="test.public" inheritAll="false"/>
<antcall target="integrationtest.public" inheritAll="false"/>
<antcall target="pipelinetest.public" inheritAll="false"/>
</target>
<!-- Our four different test conditions: Test, IntegrationTest, PerformanceTest, PipelineTest -->
<target name="test" depends="init.buildall,test.compile" description="Run unit tests">
<condition property="ttype" value="*UnitTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${ttype}"/>
<run-test testtype="${ttype}" outputdir="${report}/${ttype}" runfailed="false"/>
</target>
<target name="integrationtest" depends="test.compile" description="Run integration tests">
<target name="test.public" depends="init.buildpublic,test"/>
<target name="integrationtest" depends="init.buildall,test.compile" description="Run integration tests">
<condition property="itype" value="*IntegrationTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${itype}"/>
<run-test testtype="${itype}" outputdir="${report}/${itype}" runfailed="false"/>
</target>
<target name="performancetest" depends="test.compile" description="Run performance tests">
<target name="integrationtest.public" depends="init.buildpublic,integrationtest"/>
<target name="performancetest" depends="init.buildall,test.compile" description="Run performance tests">
<condition property="ptype" value="*PerformanceTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${ptype}"/>
<run-test testtype="${ptype}" outputdir="${report}/${ptype}" runfailed="false"/>
</target>
<target name="pipelinetest" depends="test.compile" description="Run pipeline tests">
<target name="performancetest.public" depends="init.buildpublic,performancetest" />
<target name="pipelinetest" depends="init.buildall,test.compile" description="Run pipeline tests">
<condition property="pipetype" value="*PipelineTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${pipetype}"/>
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
</target>
<target name="pipelinetestrun" depends="test.compile" description="Run pipeline tests">
<target name="pipelinetest.public" depends="init.buildpublic,pipelinetest" />
<target name="pipelinetestrun" depends="init.buildall,test.compile" description="Run pipeline tests">
<property name="pipeline.run" value="run"/>
<condition property="pipetype" value="*PipelineTest" else="${single}">
<not><isset property="single"/></not>
</condition>
<run-test testtype="${pipetype}"/>
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
</target>
<target name="pipelinetestrun.public" depends="init.buildpublic,pipelinetestrun" />
<target name="failed-test" depends="init.buildall,test.compile">
<run-test testtype="${report}/*UnitTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
</target>
<target name="failed-test" depends="test.compile">
<run-failed-test xmlfailedtestfile="${report}/*UnitTest/testng-failed.xml" />
<target name="failed-integration" depends="init.buildall,test.compile">
<run-test testtype="${report}/*IntegrationTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
</target>
<target name="failed-integration" depends="test.compile">
<run-failed-test xmlfailedtestfile="${report}/*IntegrationTest/testng-failed.xml" />
<target name="failed-performance" depends="init.buildall,test.compile">
<run-test testtype="${report}/*PerformanceTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
</target>
<target name="failed-performance" depends="test.compile">
<run-failed-test xmlfailedtestfile="${report}/*PerformanceTest/testng-failed.xml" />
</target>
<target name="failed-pipeline" depends="test.compile">
<run-failed-test xmlfailedtestfile="${report}/*PipelineTest/testng-failed.xml" />
<target name="failed-pipeline" depends="init.buildall,test.compile">
<run-test testtype="${report}/*PipelineTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
</target>
<!-- ******************************************************************************** -->

View File

@ -15,10 +15,8 @@
<!-- Tribble -->
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
<dependency org="log4j" name="log4j" rev="1.2.15">
<!-- Don't include javax.mail here in default, only used in scala->default by commons-email -->
<exclude org="javax.mail" />
</dependency>
<dependency org="log4j" name="log4j" rev="1.2.15"/>
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
<dependency org="colt" name="colt" rev="1.2.0"/>
<dependency org="jboss" name="javassist" rev="3.7.ga"/>
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>

View File

@ -12,14 +12,14 @@ if ( onCMDLine ) {
inputFileName = args[1]
outputPDF = args[2]
} else {
#inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
outputPDF = NA
}
RUNTIME_UNITS = "(sec)"
ORIGINAL_UNITS_TO_SECONDS = 1/1000
RUNTIME_UNITS = "(hours)"
ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60
#
# Helper function to aggregate all of the jobs in the report across all tables
@ -33,7 +33,7 @@ allJobsFromReport <- function(report) {
#
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
#
plotJobsGantt <- function(gatkReport, sortOverall) {
plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
allJobs = allJobsFromReport(gatkReport)
if ( sortOverall ) {
title = "All jobs, by analysis, by start time"
@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) {
}
allJobs$index = 1:nrow(allJobs)
minTime = min(allJobs$startTime)
allJobs$relStartTime = allJobs$startTime - minTime
allJobs$relDoneTime = allJobs$doneTime - minTime
allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
maxRelTime = max(allJobs$relDoneTime)
p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + theme_bw()
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
if ( includeText )
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + xlim(0, maxRelTime * 1.1)
p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
p <- p + ylab("Job")
p <- p + ylab("Job number")
p <- p + opts(title=title)
print(p)
}
@ -140,6 +142,8 @@ print(paste("Project :", inputFileName))
convertUnits <- function(gatkReportData) {
convertGroup <- function(g) {
g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS
g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS
g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS
g
}
lapply(gatkReportData, convertGroup)
@ -155,8 +159,8 @@ if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}
plotJobsGantt(gatkReportData, T)
plotJobsGantt(gatkReportData, F)
plotJobsGantt(gatkReportData, T, F)
plotJobsGantt(gatkReportData, F, F)
plotProgressByTime(gatkReportData)
for ( group in gatkReportData ) {
plotGroup(group)

View File

@ -379,7 +379,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
if ( tribbleType == null )
if ( ! file.canRead() | !! file.isFile() ) {
if ( ! file.canRead() | ! file.isFile() ) {
throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
} else {
throw new UserException.CommandLineException(

View File

@ -929,6 +929,14 @@ public class GenomeAnalysisEngine {
return readsDataSource.getHeader(reader);
}
/**
* Gets the master sequence dictionary for this GATK engine instance
* @return a never-null dictionary listing all of the contigs known to this engine instance
*/
public SAMSequenceDictionary getMasterSequenceDictionary() {
return getReferenceDataSource().getReference().getSequenceDictionary();
}
/**
* Returns data source object encapsulating all essential info and handlers used to traverse
* reads; header merger, individual file readers etc can be accessed through the returned data source object.

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@ -84,7 +85,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
if(currentLocus == GenomeLoc.UNMAPPED) {
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan());
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
currentLocus = null;
continue;
}

View File

@ -215,6 +215,45 @@ public class GATKBAMIndex {
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
}
/**
* Use to get close to the unmapped reads at the end of a BAM file.
* @return The file offset of the first record in the last linear bin, or -1
* if there are no elements in linear bins (i.e. no mapped reads).
*/
public long getStartOfLastLinearBin() {
openIndexFile();
seek(4);
final int sequenceCount = readInteger();
// Because no reads may align to the last sequence in the sequence dictionary,
// grab the last element of the linear index for each sequence, and return
// the last one from the last sequence that has one.
long lastLinearIndexPointer = -1;
for (int i = 0; i < sequenceCount; i++) {
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
// System.out.println("# nBins: " + nBins);
for (int j1 = 0; j1 < nBins; j1++) {
// Skip bin #
skipBytes(4);
final int nChunks = readInteger();
// Skip chunks
skipBytes(16 * nChunks);
}
final int nLinearBins = readInteger();
if (nLinearBins > 0) {
// Skip to last element of list of linear bins
skipBytes(8 * (nLinearBins - 1));
lastLinearIndexPointer = readLongs(1)[0];
}
}
closeIndexFile();
return lastLinearIndexPointer;
}
/**
* Gets the possible number of bins for a given reference sequence.
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.

View File

@ -134,24 +134,11 @@ public class ReadShardStrategy implements ShardStrategy {
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
while(selectedReaders.size() == 0 && currentFilePointer != null) {
shardPosition = currentFilePointer.fileSpans;
for(SAMReaderID id: shardPosition.keySet()) {
// If the region contains location information (in other words, it is not at
// the start of the unmapped region), add the region.
if(currentFilePointer.isRegionUnmapped) {
// If the region is unmapped and no location data exists, add a null as an indicator to
// start at the next unmapped region.
if(!isIntoUnmappedRegion) {
selectedReaders.put(id,null);
isIntoUnmappedRegion = true;
}
else
selectedReaders.put(id,position.get(id));
}
else {
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
if(!fileSpan.isEmpty())
selectedReaders.put(id,fileSpan);
}
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
if(!fileSpan.isEmpty())
selectedReaders.put(id,fileSpan);
}
if(selectedReaders.size() > 0) {

View File

@ -97,7 +97,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
if (!( walker instanceof TreeReducible ))
throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");
traversalEngine.startTimers();
ReduceTree reduceTree = new ReduceTree(this);
initializeWalker(walker);

View File

@ -44,7 +44,6 @@ public class LinearMicroScheduler extends MicroScheduler {
* @param shardStrategy A strategy for sharding the data.
*/
public Object execute(Walker walker, ShardStrategy shardStrategy) {
traversalEngine.startTimers();
walker.initialize();
Accumulator accumulator = Accumulator.create(engine,walker);
@ -54,6 +53,7 @@ public class LinearMicroScheduler extends MicroScheduler {
if ( done || shard == null ) // we ran out of shards that aren't owned
break;
traversalEngine.startTimersIfNecessary();
if(shard.getShardType() == Shard.ShardType.LOCUS) {
LocusWalker lWalker = (LocusWalker)walker;
WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata());

View File

@ -57,6 +57,7 @@ public class ShardTraverser implements Callable {
public Object call() {
try {
traversalEngine.startTimersIfNecessary();
long startTime = System.currentTimeMillis();
Object accumulator = walker.reduceInit();

View File

@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
* @version 0.1
*/
public class PlatformFilter extends ReadFilter {
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false)
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false)
protected String[] PLFilterNames;
public boolean filterOut(SAMRecord rec) {

View File

@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
else if ( stub.getOutputStream() != null ) {
this.file = null;
this.stream = stub.getOutputStream();
writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes());
writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes());
}
else
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
}
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.io.stubs;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
return isCompressed;
}
/**
* Gets the master sequence dictionary from the engine associated with this stub
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
* @return
*/
public SAMSequenceDictionary getMasterSequenceDictionary() {
return engine.getMasterSequenceDictionary();
}
/**
* Should we tell the VCF writer not to write genotypes?
* @return true if the writer should not write genotypes.

View File

@ -293,15 +293,16 @@ public class GATKRunReport {
* That is, postReport() is guarenteed not to fail for any reason.
*/
private File postReportToLocalDisk(File rootDir) {
String filename = getID() + ".report.xml.gz";
File file = new File(rootDir, filename);
try {
String filename = getID() + ".report.xml.gz";
File file = new File(rootDir, filename);
postReportToFile(file);
logger.debug("Wrote report to " + file);
return file;
} catch ( Exception e ) {
// we catch everything, and no matter what eat the error
exceptDuringRunReport("Couldn't read report file", e);
file.delete();
return null;
}
}
@ -312,6 +313,7 @@ public class GATKRunReport {
File localFile = postReportToLocalDisk(new File("./"));
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
if ( localFile != null ) { // we succeeded in creating the local file
localFile.deleteOnExit();
try {
// stop us from printing the annoying, and meaningless, mime types warning
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
@ -336,14 +338,13 @@ public class GATKRunReport {
//logger.info("Uploading " + localFile + " to AWS bucket");
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
logger.debug("Uploaded to AWS: " + s3Object);
logger.info("Uploaded run statistics report to AWS S3");
} catch ( S3ServiceException e ) {
exceptDuringRunReport("S3 exception occurred", e);
} catch ( NoSuchAlgorithmException e ) {
exceptDuringRunReport("Couldn't calculate MD5", e);
} catch ( IOException e ) {
exceptDuringRunReport("Couldn't read report file", e);
} finally {
localFile.delete();
}
}
}

View File

@ -101,7 +101,7 @@ public class RMDIndexer extends CommandLineProgram {
Index index = IndexFactory.createIndex(inputFileSource, codec, approach);
// add writing of the sequence dictionary, if supplied
builder.setIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary(), indexFile, false);
builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary());
// create the output stream, and write the index
LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.refdata.tracks;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.index.Index;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
* Utilities for working with Sequence Dictionaries embedded in tribble indices
*
* @author Your Name
* @since Date created
*/
public class IndexDictionaryUtils {
private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class);
// a constant we use for marking sequence dictionary entries in the Tribble index property list
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
/**
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
* @param index the index file to use
* @return a SAMSequenceDictionary if available, null if unavailable
*/
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
SAMSequenceDictionary dict = new SAMSequenceDictionary();
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
Integer.valueOf(entry.getValue())));
}
return dict;
}
/**
* create the sequence dictionary with the contig list; a backup approach
* @param index the index file to use
* @param dict the sequence dictionary to add contigs to
* @return the filled-in sequence dictionary
*/
static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
LinkedHashSet<String> seqNames = index.getSequenceNames();
if (seqNames == null) {
return dict;
}
for (String name : seqNames) {
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
dict.addSequence(seq);
}
return dict;
}
public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) {
for ( SAMSequenceRecord seq : dict.getSequences() ) {
final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName();
final String length = String.valueOf(seq.getSequenceLength());
index.addProperty(contig,length);
}
}
public static void validateTrackSequenceDictionary(final String trackName,
final SAMSequenceDictionary trackDict,
final SAMSequenceDictionary referenceDict,
final ValidationExclusion.TYPE validationExclusionType ) {
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
if (trackDict == null || trackDict.size() == 0)
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
else {
Set<String> trackSequences = new TreeSet<String>();
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
trackSequences.add(dictionaryEntry.getSequenceName());
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
}
}
}

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.refdata.tracks;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureSource;
@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
*
* @author aaron
*
* @author aaron
* `
* Class RMDTrackBuilder
*
@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false;
// a constant we use for marking sequence dictionary entries in the Tribble index property list
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
// private sequence dictionary we use to set our tracks with
private SAMSequenceDictionary dict = null;
@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); }
catch (ReviewedStingException e) { }
sequenceDictionary = getSequenceDictionaryFromProperties(index);
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
if (sequenceDictionary.size() == 0 && dict != null) {
File indexFile = Tribble.indexFile(inputFile);
setIndexSequenceDictionary(inputFile,index,dict,indexFile,true);
sequenceDictionary = getSequenceDictionaryFromProperties(index);
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
try { // re-write the index
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
} catch (IOException e) {
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
}
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
}
if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE )
@ -363,88 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
// this can take a while, let them know what we're doing
logger.info("Creating Tribble index in memory for file " + inputFile);
Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
setIndexSequenceDictionary(inputFile, idx, dict, null, false);
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict);
return idx;
}
// ---------------------------------------------------------------------------------------------------------
// static functions to work with the sequence dictionaries of indexes
// ---------------------------------------------------------------------------------------------------------
/**
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
* @param index the index file to use
* @return a SAMSequenceDictionary if available, null if unavailable
*/
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
SAMSequenceDictionary dict = new SAMSequenceDictionary();
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
Integer.valueOf(entry.getValue())));
}
return dict;
}
/**
* create the sequence dictionary with the contig list; a backup approach
* @param index the index file to use
* @param dict the sequence dictionary to add contigs to
* @return the filled-in sequence dictionary
*/
private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
LinkedHashSet<String> seqNames = index.getSequenceNames();
if (seqNames == null) {
return dict;
}
for (String name : seqNames) {
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
dict.addSequence(seq);
}
return dict;
}
/**
* set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible.
* (that each contig in the index is in the sequence dictionary).
* @param inputFile for proper error message formatting.
* @param dict the sequence dictionary
* @param index the index file
* @param indexFile the index file
* @param rewriteIndex should we rewrite the index when we're done?
*
*/
public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) {
if (dict == null) return;
SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict);
public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) {
if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null");
// check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set
for (SAMSequenceRecord seq : currentDict.getSequences()) {
if (dict.getSequence(seq.getSequenceName()) == null)
continue;
index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength()));
}
// re-write the index
if (rewriteIndex) try {
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
} catch (IOException e) {
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
}
final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict);
// actually update the dictionary in the index
IndexDictionaryUtils.setIndexSequenceDictionary(index, dict);
}
public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) {
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
if (trackDict == null || trackDict.size() == 0)
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
else {
Set<String> trackSequences = new TreeSet<String>();
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
trackSequences.add(dictionaryEntry.getSequenceName());
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
}
public void validateTrackSequenceDictionary(final String trackName,
final SAMSequenceDictionary trackDict,
final SAMSequenceDictionary referenceDict ) {
IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType);
}
}

View File

@ -115,12 +115,13 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
LinkedList<ProcessingHistory> history = new LinkedList<ProcessingHistory>();
/** We use the SimpleTimer to time our run */
private SimpleTimer timer = new SimpleTimer("Traversal");
private SimpleTimer timer = null;
// How long can we go without printing some progress info?
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
private int printProgressCheckCounter = 0;
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;
@ -209,11 +210,16 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
}
}
/**
* Should be called to indicate that we're going to process records and the timer should start ticking
* Should be called to indicate that we're going to process records and the timer should start ticking. This
* function should be called right before any traversal work is done, to avoid counting setup costs in the
* processing costs and inflating the estimated runtime.
*/
public void startTimers() {
timer.start();
lastProgressPrintTime = timer.currentTime();
public void startTimersIfNecessary() {
if ( timer == null ) {
timer = new SimpleTimer("Traversal");
timer.start();
lastProgressPrintTime = timer.currentTime();
}
}
/**
@ -224,7 +230,8 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* @return true if the maximum interval (in millisecs) has passed since the last printing
*/
private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) {
return (curTime - lastPrintTime) > printFreq;
long elapsed = curTime - lastPrintTime;
return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS;
}
/**
@ -351,7 +358,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
public void printOnTraversalDone() {
printProgress(null, null, true);
final double elapsed = timer.getElapsedTime();
final double elapsed = timer == null ? 0 : timer.getElapsedTime();
ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics();

View File

@ -26,21 +26,23 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import java.io.PrintStream;
import java.util.Iterator;
/**
* Prints out all of the RODs in the input data set. Data is rendered using the toString() method
* of the given ROD.
*/
public class PrintRODsWalker extends RodWalker<Integer, Integer> {
@Input(fullName="input", shortName = "input", doc="The input ROD which should be printed out.", required=true)
public RodBinding<Feature> input;
@Output
PrintStream out;
@ -62,7 +64,7 @@ public class PrintRODsWalker extends RodWalker<Integer, Integer> {
if ( tracker == null )
return 0;
for ( Feature feature : tracker.getValues(Feature.class) ) {
for ( Feature feature : tracker.getValues(Feature.class, context.getLocation()) ) {
out.println(feature.toString());
}

View File

@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
* -I input1.bam \
* -I input2.bam \
* --read_filter MappingQualityZero
*
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T PrintReads \
* -o output.bam \
* -I input.bam \
* -n 2000
* </pre>
*
*/

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -77,6 +78,15 @@ public abstract class Walker<MapType, ReduceType> {
return toolkit;
}
/**
* Gets the master sequence dictionary for this walker
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
* @return
*/
protected SAMSequenceDictionary getMasterSequenceDictionary() {
return getToolkit().getMasterSequenceDictionary();
}
/**
* (conceptual static) method that states whether you want to see reads piling up at a locus
* that contain a deletion at the locus.

View File

@ -26,7 +26,7 @@ public class SBByDepth extends AnnotationByDepth {
if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY))
return null;
double sBias = Double.valueOf(vc.getAttributeAsString(VCFConstants.STRAND_BIAS_KEY));
double sBias = vc.getAttributeAsDouble(VCFConstants.STRAND_BIAS_KEY, -1);
final Map<String, Genotype> genotypes = vc.getGenotypes();
if ( genotypes == null || genotypes.size() == 0 )

View File

@ -24,7 +24,9 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -32,10 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -46,134 +45,429 @@ import java.util.*;
* (http://snpeff.sourceforge.net/).
*
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
* output file (which must be provided on the command line via --snpEffFile:SnpEff <filename>),
* output file (which must be provided on the command line via --snpEffFile filename.vcf),
* and adds annotations on that effect.
*
* The possible biological effects and their associated impacts are defined in the class:
* org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants
*
* @author David Roazen
*/
public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation {
// SnpEff annotation key names:
public static final String GENE_ID_KEY = "GENE_ID";
public static final String GENE_NAME_KEY = "GENE_NAME";
public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID";
public static final String EXON_ID_KEY = "EXON_ID";
public static final String EXON_RANK_KEY = "EXON_RANK";
public static final String WITHIN_NON_CODING_GENE_KEY = "WITHIN_NON_CODING_GENE";
public static final String EFFECT_KEY = "EFFECT";
public static final String EFFECT_IMPACT_KEY = "EFFECT_IMPACT";
public static final String EFFECT_EXTRA_INFORMATION_KEY = "EFFECT_EXTRA_INFORMATION";
public static final String OLD_NEW_AA_KEY = "OLD_NEW_AA";
public static final String OLD_NEW_CODON_KEY = "OLD_NEW_CODON";
public static final String CODON_NUM_KEY = "CODON_NUM";
public static final String CDS_SIZE_KEY = "CDS_SIZE";
private static Logger logger = Logger.getLogger(SnpEff.class);
// We refuse to parse SnpEff output files generated by unsupported versions, or
// lacking a SnpEff version number in the VCF header:
public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
// SnpEff aggregates all effects (and effect metadata) together into a single INFO
// field annotation with the key EFF:
public static final String SNPEFF_INFO_FIELD_KEY = "EFF";
public static final String SNPEFF_EFFECT_METADATA_DELIMITER = "[()]";
public static final String SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER = "\\|";
// Key names for the INFO field annotations we will add to each record, along
// with parsing-related information:
public enum InfoFieldKey {
EFFECT_KEY ("SNPEFF_EFFECT", -1),
IMPACT_KEY ("SNPEFF_IMPACT", 0),
CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
EXON_ID_KEY ("SNPEFF_EXON_ID", 7);
// Actual text of the key
private final String keyName;
// Index within the effect metadata subfields from the SnpEff EFF annotation
// where each key's associated value can be found during parsing.
private final int fieldIndex;
InfoFieldKey ( String keyName, int fieldIndex ) {
this.keyName = keyName;
this.fieldIndex = fieldIndex;
}
public String getKeyName() {
return keyName;
}
public int getFieldIndex() {
return fieldIndex;
}
}
// Possible SnpEff biological effects. All effect names found in the SnpEff input file
// are validated against this list.
public enum EffectType {
NONE,
CHROMOSOME,
INTERGENIC,
UPSTREAM,
UTR_5_PRIME,
UTR_5_DELETED,
START_GAINED,
SPLICE_SITE_ACCEPTOR,
SPLICE_SITE_DONOR,
START_LOST,
SYNONYMOUS_START,
NON_SYNONYMOUS_START,
CDS,
GENE,
TRANSCRIPT,
EXON,
EXON_DELETED,
NON_SYNONYMOUS_CODING,
SYNONYMOUS_CODING,
FRAME_SHIFT,
CODON_CHANGE,
CODON_INSERTION,
CODON_CHANGE_PLUS_CODON_INSERTION,
CODON_DELETION,
CODON_CHANGE_PLUS_CODON_DELETION,
STOP_GAINED,
SYNONYMOUS_STOP,
NON_SYNONYMOUS_STOP,
STOP_LOST,
INTRON,
UTR_3_PRIME,
UTR_3_DELETED,
DOWNSTREAM,
INTRON_CONSERVED,
INTERGENIC_CONSERVED,
REGULATION,
CUSTOM,
WITHIN_NON_CODING_GENE
}
// SnpEff labels each effect as either LOW, MODERATE, or HIGH impact.
public enum EffectImpact {
LOW (1),
MODERATE (2),
HIGH (3);
private final int severityRating;
EffectImpact ( int severityRating ) {
this.severityRating = severityRating;
}
public boolean isHigherImpactThan ( EffectImpact other ) {
return this.severityRating > other.severityRating;
}
}
// SnpEff labels most effects as either CODING or NON_CODING, but sometimes omits this information.
public enum EffectCoding {
CODING,
NON_CODING,
UNKNOWN
}
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) {
validateRodBinding(walker.getSnpEffRodBinding());
checkSnpEffVersion(walker, toolkit);
}
public Map<String, Object> annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) {
RodBinding<SnpEffFeature> snpEffRodBinding = walker.getSnpEffRodBinding();
validateRodBinding(snpEffRodBinding);
RodBinding<VariantContext> snpEffRodBinding = walker.getSnpEffRodBinding();
List<SnpEffFeature> features = tracker.getValues(snpEffRodBinding, ref.getLocus());
// Get only SnpEff records that start at this locus, not merely span it:
List<VariantContext> snpEffRecords = tracker.getValues(snpEffRodBinding, ref.getLocus());
// Add only annotations for one of the most biologically-significant effects as defined in
// the SnpEffConstants class:
SnpEffFeature mostSignificantEffect = getMostSignificantEffect(features);
if ( mostSignificantEffect == null ) {
// Within this set, look for a SnpEff record whose ref/alt alleles match the record to annotate.
// If there is more than one such record, we only need to pick the first one, since the biological
// effects will be the same across all such records:
VariantContext matchingRecord = getMatchingSnpEffRecord(snpEffRecords, vc);
if ( matchingRecord == null ) {
return null;
}
return generateAnnotations(mostSignificantEffect);
// Parse the SnpEff INFO field annotation from the matching record into individual effect objects:
List<SnpEffEffect> effects = parseSnpEffRecord(matchingRecord);
if ( effects.size() == 0 ) {
return null;
}
// Add only annotations for one of the most biologically-significant effects from this set:
SnpEffEffect mostSignificantEffect = getMostSignificantEffect(effects);
return mostSignificantEffect.getAnnotations();
}
private void validateRodBinding ( RodBinding<SnpEffFeature> snpEffRodBinding ) {
private void validateRodBinding ( RodBinding<VariantContext> snpEffRodBinding ) {
if ( snpEffRodBinding == null || ! snpEffRodBinding.isBound() ) {
throw new UserException("The SnpEff annotator requires that a SnpEff output file be provided " +
"as a rodbinding on the command line, but no SnpEff rodbinding was found.");
throw new UserException("The SnpEff annotator requires that a SnpEff VCF output file be provided " +
"as a rodbinding on the command line via the --snpEffFile option, but " +
"no SnpEff rodbinding was found.");
}
}
private SnpEffFeature getMostSignificantEffect ( List<SnpEffFeature> snpEffFeatures ) {
SnpEffFeature mostSignificantEffect = null;
private void checkSnpEffVersion ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) {
RodBinding<VariantContext> snpEffRodBinding = walker.getSnpEffRodBinding();
for ( SnpEffFeature snpEffFeature : snpEffFeatures ) {
VCFHeader snpEffVCFHeader = VCFUtils.getVCFHeadersFromRods(toolkit, Arrays.asList(snpEffRodBinding.getName())).get(snpEffRodBinding.getName());
VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY);
if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) {
throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " +
"input file, and so could not verify that the file was generated by a supported version of SnpEff (" +
Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")");
}
String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0];
if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) {
throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " +
"is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS));
}
}
private boolean isSupportedSnpEffVersion ( String versionString ) {
for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) {
if ( supportedVersion.equals(versionString) ) {
return true;
}
}
return false;
}
private VariantContext getMatchingSnpEffRecord ( List<VariantContext> snpEffRecords, VariantContext vc ) {
for ( VariantContext snpEffRecord : snpEffRecords ) {
if ( snpEffRecord.hasSameAlternateAllelesAs(vc) && snpEffRecord.getReference().equals(vc.getReference()) ) {
return snpEffRecord;
}
}
return null;
}
private List<SnpEffEffect> parseSnpEffRecord ( VariantContext snpEffRecord ) {
List<SnpEffEffect> parsedEffects = new ArrayList<SnpEffEffect>();
Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY);
List<String> individualEffects;
// The VCF codec stores multi-valued fields as a List<String>, and single-valued fields as a String.
// We can have either in the case of SnpEff, since there may be one or more than one effect in this record.
if ( effectFieldValue instanceof List ) {
individualEffects = (List<String>)effectFieldValue;
}
else {
individualEffects = Arrays.asList((String)effectFieldValue);
}
for ( String effectString : individualEffects ) {
String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER);
if ( effectNameAndMetadata.length != 2 ) {
logger.warn(String.format("Malformed SnpEff effect field at %s:%d, skipping: %s",
snpEffRecord.getChr(), snpEffRecord.getStart(), effectString));
continue;
}
String effectName = effectNameAndMetadata[0];
String[] effectMetadata = effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1);
SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata);
if ( parsedEffect.isWellFormed() ) {
parsedEffects.add(parsedEffect);
}
else {
logger.warn(String.format("Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"",
snpEffRecord.getChr(), snpEffRecord.getStart(), parsedEffect.getParseError(), effectString));
}
}
return parsedEffects;
}
private SnpEffEffect getMostSignificantEffect ( List<SnpEffEffect> effects ) {
SnpEffEffect mostSignificantEffect = null;
for ( SnpEffEffect effect : effects ) {
if ( mostSignificantEffect == null ||
snpEffFeature.isHigherImpactThan(mostSignificantEffect) ) {
effect.isHigherImpactThan(mostSignificantEffect) ) {
mostSignificantEffect = snpEffFeature;
mostSignificantEffect = effect;
}
}
return mostSignificantEffect;
}
private Map<String, Object> generateAnnotations ( SnpEffFeature mostSignificantEffect ) {
Map<String, Object> annotations = new LinkedHashMap<String, Object>(Utils.optimumHashSize(getKeyNames().size()));
if ( mostSignificantEffect.hasGeneID() )
annotations.put(GENE_ID_KEY, mostSignificantEffect.getGeneID());
if ( mostSignificantEffect.hasGeneName() )
annotations.put(GENE_NAME_KEY, mostSignificantEffect.getGeneName());
if ( mostSignificantEffect.hasTranscriptID() )
annotations.put(TRANSCRIPT_ID_KEY, mostSignificantEffect.getTranscriptID());
if ( mostSignificantEffect.hasExonID() )
annotations.put(EXON_ID_KEY, mostSignificantEffect.getExonID());
if ( mostSignificantEffect.hasExonRank() )
annotations.put(EXON_RANK_KEY, Integer.toString(mostSignificantEffect.getExonRank()));
if ( mostSignificantEffect.isNonCodingGene() )
annotations.put(WITHIN_NON_CODING_GENE_KEY, null);
annotations.put(EFFECT_KEY, mostSignificantEffect.getEffect().toString());
annotations.put(EFFECT_IMPACT_KEY, mostSignificantEffect.getEffectImpact().toString());
if ( mostSignificantEffect.hasEffectExtraInformation() )
annotations.put(EFFECT_EXTRA_INFORMATION_KEY, mostSignificantEffect.getEffectExtraInformation());
if ( mostSignificantEffect.hasOldAndNewAA() )
annotations.put(OLD_NEW_AA_KEY, mostSignificantEffect.getOldAndNewAA());
if ( mostSignificantEffect.hasOldAndNewCodon() )
annotations.put(OLD_NEW_CODON_KEY, mostSignificantEffect.getOldAndNewCodon());
if ( mostSignificantEffect.hasCodonNum() )
annotations.put(CODON_NUM_KEY, Integer.toString(mostSignificantEffect.getCodonNum()));
if ( mostSignificantEffect.hasCdsSize() )
annotations.put(CDS_SIZE_KEY, Integer.toString(mostSignificantEffect.getCdsSize()));
return annotations;
}
public List<String> getKeyNames() {
return Arrays.asList( GENE_ID_KEY,
GENE_NAME_KEY,
TRANSCRIPT_ID_KEY,
EXON_ID_KEY,
EXON_RANK_KEY,
WITHIN_NON_CODING_GENE_KEY,
EFFECT_KEY,
EFFECT_IMPACT_KEY,
EFFECT_EXTRA_INFORMATION_KEY,
OLD_NEW_AA_KEY,
OLD_NEW_CODON_KEY,
CODON_NUM_KEY,
CDS_SIZE_KEY
return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
InfoFieldKey.IMPACT_KEY.getKeyName(),
InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
InfoFieldKey.GENE_NAME_KEY.getKeyName(),
InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
InfoFieldKey.EXON_ID_KEY.getKeyName()
);
}
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(
new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"),
new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant")
new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
);
}
/**
* Helper class to parse, validate, and store a single SnpEff effect and its metadata.
*/
protected static class SnpEffEffect {
private EffectType effect;
private EffectImpact impact;
private String codonChange;
private String aminoAcidChange;
private String geneName;
private String geneBiotype;
private EffectCoding coding;
private String transcriptID;
private String exonID;
private String parseError = null;
private boolean isWellFormed = true;
private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;
// Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
// errors come after warnings, not vice versa:
private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;
private static final int SNPEFF_CODING_FIELD_INDEX = 5;
public SnpEffEffect ( String effectName, String[] effectMetadata ) {
parseEffectName(effectName);
parseEffectMetadata(effectMetadata);
}
private void parseEffectName ( String effectName ) {
try {
effect = EffectType.valueOf(effectName);
}
catch ( IllegalArgumentException e ) {
parseError(String.format("%s is not a recognized effect type", effectName));
}
}
private void parseEffectMetadata ( String[] effectMetadata ) {
if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
}
else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
}
else {
parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
EXPECTED_NUMBER_OF_METADATA_FIELDS, effectMetadata.length));
}
return;
}
try {
impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
}
catch ( IllegalArgumentException e ) {
parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
}
codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
try {
coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
}
catch ( IllegalArgumentException e ) {
parseError(String.format("Unrecognized value for effect coding: %s", effectMetadata[SNPEFF_CODING_FIELD_INDEX]));
}
}
else {
coding = EffectCoding.UNKNOWN;
}
transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()];
exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()];
}
private void parseError ( String message ) {
isWellFormed = false;
// Cache only the first error encountered:
if ( parseError == null ) {
parseError = message;
}
}
public boolean isWellFormed() {
return isWellFormed;
}
public String getParseError() {
return parseError == null ? "" : parseError;
}
public boolean isCoding() {
return coding == EffectCoding.CODING;
}
public boolean isHigherImpactThan ( SnpEffEffect other ) {
// If one effect is within a coding gene and the other is not, the effect that is
// within the coding gene has higher impact:
if ( isCoding() && ! other.isCoding() ) {
return true;
}
else if ( ! isCoding() && other.isCoding() ) {
return false;
}
// Otherwise, both effects are either in or not in a coding gene, so we compare the impacts
// of the effects themselves:
return impact.isHigherImpactThan(other.impact);
}
public Map<String, Object> getAnnotations() {
Map<String, Object> annotations = new LinkedHashMap<String, Object>(Utils.optimumHashSize(InfoFieldKey.values().length));
addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
return annotations;
}
private void addAnnotation ( Map<String, Object> annotations, String keyName, String keyValue ) {
// Only add annotations for keys associated with non-empty values:
if ( keyValue != null && keyValue.trim().length() > 0 ) {
annotations.put(keyName, keyValue);
}
}
}
}

View File

@ -40,7 +40,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@ -86,14 +85,15 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
public RodBinding<VariantContext> getVariantRodBinding() { return variantCollection.variants; }
/**
* The INFO field will be annotated with information on the most biologically-significant effect
* listed in the SnpEff output file for each variant.
*/
@Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false)
public RodBinding<SnpEffFeature> snpEffFile;
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return snpEffFile; }
public RodBinding<VariantContext> snpEffFile;
public RodBinding<VariantContext> getSnpEffRodBinding() { return snpEffFile; }
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
@ -203,11 +203,13 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
}
if ( USE_ALL_ANNOTATIONS )
engine = new VariantAnnotatorEngine(this);
engine = new VariantAnnotatorEngine(this, getToolkit());
else
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit());
engine.initializeExpressions(expressionsToUse);
engine.invokeAnnotationInitializationMethods();
// setup the header fields
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();

View File

@ -26,13 +26,11 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
@ -49,6 +47,7 @@ public class VariantAnnotatorEngine {
private HashMap<RodBinding<VariantContext>, String> dbAnnotations = new HashMap<RodBinding<VariantContext>, String>();
private AnnotatorCompatibleWalker walker;
private GenomeAnalysisEngine toolkit;
private static class VAExpression {
@ -74,16 +73,18 @@ public class VariantAnnotatorEngine {
}
// use this constructor if you want all possible annotations
public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker) {
public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
this.walker = walker;
this.toolkit = toolkit;
requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations();
requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations();
initializeDBs();
}
// use this constructor if you want to select specific annotations (and/or interfaces)
public VariantAnnotatorEngine(List<String> annotationGroupsToUse, List<String> annotationsToUse, AnnotatorCompatibleWalker walker) {
public VariantAnnotatorEngine(List<String> annotationGroupsToUse, List<String> annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
this.walker = walker;
this.toolkit = toolkit;
initializeAnnotations(annotationGroupsToUse, annotationsToUse);
initializeDBs();
}
@ -113,6 +114,16 @@ public class VariantAnnotatorEngine {
dbAnnotations.put(rod, rod.getName());
}
public void invokeAnnotationInitializationMethods() {
for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
annotation.initialize(walker, toolkit);
}
for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.initialize(walker, toolkit);
}
}
public Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
Set<VCFHeaderLine> descriptions = new HashSet<VCFHeaderLine>();

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
@ -9,8 +8,9 @@ import java.util.List;
public interface AnnotatorCompatibleWalker {
// getter methods for various used bindings
public abstract RodBinding<SnpEffFeature> getSnpEffRodBinding();
public abstract RodBinding<VariantContext> getVariantRodBinding();
public abstract RodBinding<VariantContext> getSnpEffRodBinding();
public abstract RodBinding<VariantContext> getDbsnpRodBinding();
public abstract List<RodBinding<VariantContext>> getCompRodBindings();
public abstract List<RodBinding<VariantContext>> getResourceRodBindings();
}
}

View File

@ -24,18 +24,16 @@
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
import java.util.Map;
@DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations")
public abstract class VariantAnnotatorAnnotation {
// return the INFO keys
public abstract List<String> getKeyNames();
// initialization method (optional for subclasses, and therefore non-abstract)
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { }
}

View File

@ -79,24 +79,45 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
* If this argument is present, the original allele frequencies and counts from this vcf are added as annotations ACH,AFH and ANH. at each record present in this vcf
*/
@Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false)
public RodBinding<VariantContext> comp;
/**
* This required argument is used to annotate each site in the vcf INFO field with R2 annotation. Will be NaN if Beagle determined there are no variant samples.
*/
@Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true)
public RodBinding<BeagleFeature> beagleR2;
/**
* These values will populate the GL field for each sample and contain the posterior probability of each genotype given the data after phasing and imputation.
*/
@Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true)
public RodBinding<BeagleFeature> beagleProbs;
/**
* By default, all genotypes will be marked in the VCF as "phased", using the "|" separator after Beagle.
*/
@Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true)
public RodBinding<BeagleFeature> beaglePhased;
@Output(doc="VCF File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
/**
* If this argument is absent, and if Beagle determines that there is no sample in a site that has a variant genotype, the site will be marked as filtered (Default behavior).
* If the argument is present, the site won't be marked as filtered under this condition even if there are no variant genotypes.
*/
@Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false)
public boolean DONT_FILTER_MONOMORPHIC_SITES = false;
/**
* Value between 0 and 1. If the probability of getting a genotype correctly (based on the posterior genotype probabilities and the actual genotype) is below this threshold,
* a genotype will be substitute by a no-call.
*/
@Argument(fullName="no" +
"call_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false)
private double noCallThreshold = 0.0;
@ -154,21 +175,16 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
}
BeagleFeature beagleR2Feature = tracker.getFirstValue(beagleR2);
// ignore places where we don't have a variant
if ( beagleR2Feature == null )
return 0;
BeagleFeature beagleProbsFeature = tracker.getFirstValue(beagleProbs);
// ignore places where we don't have a variant
if ( beagleProbsFeature == null )
return 0;
BeagleFeature beaglePhasedFeature = tracker.getFirstValue(beaglePhased);
// ignore places where we don't have a variant
if ( beaglePhasedFeature == null )
return 0;
if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null)
{
vcfWriter.add(vc_input);
return 1;
}
// get reference base for current position
byte refByte = ref.getBase();

View File

@ -112,6 +112,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
VCFWriter bootstrapVCFOutput = null;
/**
* If sample gender is known, this flag should be set to true to ensure that Beagle treats male Chr X properly.
*/
@Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false)
public boolean CHECK_IS_MALE_ON_CHR_X = false;

View File

@ -63,20 +63,32 @@ import java.util.*;
* <h2>Input</h2>
* <p>
* One or more bam files (with proper headers) to be analyzed for coverage statistics
* (Optional) A REFSEQ Rod to aggregate coverage to the gene level
* </p>
*
* <p>
*(Optional) A REFSEQ Rod to aggregate coverage to the gene level
* <p>
* (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation)
*</p></p>
* <h2>Output</h2>
* <p>
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
* </p><p>
* - no suffix: per locus coverage
* </p><p>
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
* </p><p>
* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
* </p><p>
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
* </p><p>
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
* </p><p>
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
* </p><p>
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
* </p><p>
* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
* </p><p>
* - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases
* </p>
*
@ -84,7 +96,7 @@ import java.util.*;
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantEval \
* -T DepthOfCoverage \
* -o file_name_base \
* -I input_bams.list
* [-geneList refSeq.sorted.txt] \

View File

@ -43,8 +43,10 @@ import java.util.List;
* Generates an alternative reference sequence over the specified interval.
*
* <p>
* Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
* Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
* Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
* Note that if there are multiple variants at a site, it takes the first one seen.
* Reference bases for each interval will be output as a separate fasta sequence (named numerically in order).
*
* <h2>Input</h2>
* <p>

View File

@ -42,6 +42,9 @@ import java.io.PrintStream;
*
* <p>
* The output format can be partially controlled using the provided command-line arguments.
* Specify intervals with the usual -L argument to output only the reference bases within your intervals.
* Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
* separate fasta sequence (named numerically in order).
*
* <h2>Input</h2>
* <p>

View File

@ -276,8 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
if ( elt.isReducedRead() ) {
// reduced read representation
byte qual = elt.getReducedQual();
add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
return elt.getReducedCount(); // we added nObs bases here
if ( BaseUtils.isRegularBase( elt.getBase() )) {
add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
return elt.getReducedCount(); // we added nObs bases here
} else // odd bases or deletions => don't use them
return 0;
} else {
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;

View File

@ -63,7 +63,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private boolean SIMPLE_GREEDY_GENOTYPER = false;
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
final private ExactCalculation calcToUse;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
@ -178,22 +178,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
private static final double[][] getGLs(Map<String, Genotype> GLs) {
double[][] genotypeLikelihoods = new double[GLs.size()+1][];
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
int j = 0;
//int j = 0;
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.values() ) {
j++;
if ( sample.hasLikelihoods() ) {
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector();
double[] gls = sample.getLikelihoods().getAsVector();
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
genotypeLikelihoods.add(gls);
}
}
return genotypeLikelihoods;
}
// -------------------------------------------------------------------------------------
//
// Linearized, ~O(N), implementation.
@ -318,9 +321,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public int linearExact(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
final int numSamples = GLs.size();
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
final double[][] genotypeLikelihoods = getGLs(GLs);
final ExactACCache logY = new ExactACCache(numSamples+1);
logY.getkMinus0()[0] = 0.0; // the zero case
@ -334,14 +337,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( k == 0 ) { // special case for k = 0
for ( int j=1; j <= numSamples; j++ ) {
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA];
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
}
} else { // k > 0
final double[] kMinus1 = logY.getkMinus1();
final double[] kMinus2 = logY.getkMinus2();
for ( int j=1; j <= numSamples; j++ ) {
final double[] gl = genotypeLikelihoods[j];
final double[] gl = genotypeLikelihoods.get(j);
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
double aa = Double.NEGATIVE_INFINITY;
@ -434,10 +437,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
boolean multiAllelicRecord = false;
if (vc.getAlternateAlleles().size() > 1)
multiAllelicRecord = true;
Map<String, Genotype> GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
@ -454,7 +453,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
pathMetricArray[0][0] = 0.0;
// todo = can't deal with optimal dynamic programming solution with multiallelic records
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) {
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
sampleIndices.addAll(GLs.keySet());
sampleIdx = GLs.size();
}
@ -465,6 +464,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
continue;
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
//System.out.print(sample.getKey()+":");
//for (int k=0; k < likelihoods.length; k++)
// System.out.format("%4.2f ",likelihoods[k]);
//System.out.println();
// all likelihoods are essentially the same: skip this sample and will later on force no call.
//sampleIdx++;
continue;
}
sampleIndices.add(sample.getKey());
for (int k=0; k <= AFofMaxLikelihood; k++) {
@ -504,22 +514,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord)
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
else {
int newIdx = tracebackArray[k][startIdx];
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
// and will add no-call genotype to GL's in a second pass
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double qual = Double.NEGATIVE_INFINITY;
double[] likelihoods = g.getLikelihoods().getAsVector();
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
}
else {
int newIdx = tracebackArray[k][startIdx];;
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
/* System.out.format("Sample: %s GL:",sample);
for (int i=0; i < likelihoods.length; i++)
System.out.format("%1.4f ",likelihoods[i]);
System.out.format("%1.4f, ",likelihoods[i]);
*/
for (int i=0; i < likelihoods.length; i++) {
@ -570,6 +583,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
if ( !sample.getValue().hasLikelihoods() )
continue;
Genotype g = GLs.get(sample.getKey());
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
continue; // regular likelihoods
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double qual = Genotype.NO_NEG_LOG_10PERROR;
myAlleles.add(Allele.NO_CALL);
myAlleles.add(Allele.NO_CALL);
//System.out.println(myAlleles.toString());
calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
}
return calls;
}

View File

@ -32,7 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
@ -321,7 +323,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
haplotypeMap.clear();
if (getAlleleListFromVCF) {
for( final VariantContext vc_input : tracker.getValues(UAC.alleles) ) {
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
if( vc_input != null &&
allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) {
@ -413,16 +415,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (pileup != null ) {
double[] genotypeLikelihoods;
if (useOldWrongHorribleHackedUpLikelihoodModel)
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
else
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
// which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example
// for 3 alleles it's 00 01 11 02 12 22
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
alleleList,
genotypeLikelihoods,
getFilteredDepth(pileup)));
@ -444,4 +444,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return indelLikelihoodMap.get();
}
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
// so that per-sample DP will include deletions covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) )
count++;
}
return count;
}
}

View File

@ -26,14 +26,12 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -58,25 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
}
public static VariantContext getSNPVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
if ( tracker == null || ref == null || logger == null )
throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger);
VariantContext vc = null;
// search for usable record
for( final VariantContext vc_input : tracker.getValues(allelesBinding) ) {
if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) {
if ( vc == null ) {
vc = vc_input;
} else {
logger.warn("Multiple valid VCF records detected at site " + ref.getLocus() + ", only considering alleles from first record");
}
}
}
return vc;
}
public Allele getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
@ -96,7 +75,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
if ( alternateAlleleToUse != null ) {
bestAlternateAllele = alternateAlleleToUse.getBases()[0];
} else if ( useAlleleFromVCF ) {
VariantContext vc = getSNPVCFromAllelesRod(tracker, ref, true, logger, UAC.alleles);
VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
// ignore places where we don't have a variant
if ( vc == null )

View File

@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;

View File

@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -127,7 +126,8 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return null; }
public RodBinding<VariantContext> getVariantRodBinding() { return null; }
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
@ -210,7 +210,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, this);
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, this, getToolkit());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);
// initialize the header

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
@ -36,13 +37,11 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
@ -236,10 +235,11 @@ public class UnifiedGenotyperEngine {
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
VariantContext vc;
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger, UAC.alleles);
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
if ( vcInput == null )
return null;
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles());
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase());
} else {
// deal with bad/non-standard reference bases
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
@ -635,7 +635,7 @@ public class UnifiedGenotyperEngine {
// no extended event pileup
// if we're genotyping given alleles and we have a requested SNP at this position, do SNP
if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, refContext, false, logger, UAC.alleles);
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
if (vcInput == null)
return null;
@ -741,4 +741,23 @@ public class UnifiedGenotyperEngine {
return afcm;
}
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
if ( tracker == null || ref == null || logger == null )
throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger);
VariantContext vc = null;
// search for usable record
for( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) {
if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) {
if ( vc == null ) {
vc = vc_input;
} else {
logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record");
}
}
}
return vc;
}
}

View File

@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel {
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
double baseProb = QualityUtils.qualToProb(k);
double baseProb = QualityUtils.qualToProb((byte)k);
baseMatchArray[k] = probToQual(baseProb);

View File

@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.*;
/**
* Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well,
* but this latter functionality is now superceded by UnifiedGenotyper.
*
* <p>
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
* data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read
* and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample,
* or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified,
* the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic
* if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains
* only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords.
* data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
* include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many
* forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
* statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
* attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
* metrics for the post-processing tools to make the final decision). The calls are performed by default
* from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I
* command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments,
* respectively. Indels are called from the tumor sample and annotated as germline
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic
* if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling
* is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold
* in tumor sample, it will not be reported.
*
* <b>If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you,
* please email asivache at broadinstitute dot org and he will gladly explain everything in more detail.</b>
* To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input
* bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
* on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
*
* <h2>Input</h2>
* <p>
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
* </p>
*
* <h2>Output</h2>
* <p>
* Indel calls with associated metrics.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SomaticIndelDetector \
* -o indels.vcf \
* -verbose indels.txt
* -I:normal normal.bam \
* -I:tumor tumor.bam
* </pre>
*
*/
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
// @Output
// PrintStream out;
@Output(doc="File to which variants should be written",required=true)
@Output(doc="File to write variants (indels) in VCF format",required=true)
protected VCFWriter vcf_writer = null;
@Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
@Hidden
@Argument(fullName = "genotype_intervals", shortName = "genotype",
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false)
public String genotypeIntervalsFile = null;
@Hidden
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
"if the list turns out to be unsorted, it will throw an exception. "+
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
"to sort and keep it in memory (increases memory usage!).")
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
"if the list turns out to be unsorted, it will throw an exception. "+
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
"to sort and keep it in memory (increases memory usage!).")
protected boolean GENOTYPE_NOT_SORTED = false;
@Hidden
@Argument(fullName="unpaired", shortName="unpaired",
doc="Perform unpaired calls (no somatic status detection)", required=false)
@Argument(fullName="unpaired", shortName="unpaired",
doc="Perform unpaired calls (no somatic status detection)", required=false)
boolean call_unpaired = false;
boolean call_somatic ;
boolean call_somatic ;
@Argument(fullName="verboseOutput", shortName="verbose",
doc="Verbose output file in text format", required=false)
java.io.File verboseOutput = null;
@Argument(fullName="verboseOutput", shortName="verbose",
doc="Verbose output file in text format", required=false)
java.io.File verboseOutput = null;
@Argument(fullName="bedOutput", shortName="bed",
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
java.io.File bedOutput = null;
@Argument(fullName="minCoverage", shortName="minCoverage",
doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false)
int minCoverage = 6;
@Argument(fullName="minCoverage", shortName="minCoverage",
doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
"with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
int minCoverage = 6;
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
int minNormalCoverage = 4;
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+
"or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
int minNormalCoverage = 4;
@Argument(fullName="minFraction", shortName="minFraction",
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
double minFraction = 0.3;
@Argument(fullName="minFraction", shortName="minFraction",
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
double minFraction = 0.3;
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false)
double minConsensusFraction = 0.7;
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+
"all indel observations at the site exceeds this threshold", required=false)
double minConsensusFraction = 0.7;
@Argument(fullName="minIndelCount", shortName="minCnt",
doc="Minimum count of reads supporting consensus indel required for making the call. "+
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
"(minIndelCount not met) will not pass.", required=false)
int minIndelCount = 0;
@Argument(fullName="minIndelCount", shortName="minCnt",
doc="Minimum count of reads supporting consensus indel required for making the call. "+
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
"(minIndelCount not met) will not pass.", required=false)
int minIndelCount = 0;
@Argument(fullName="refseq", shortName="refseq",
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
String RefseqFileName = null;
@Argument(fullName="refseq", shortName="refseq",
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+
"GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
String RefseqFileName = null;
@Argument(fullName="blacklistedLanes", shortName="BL",
doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
"by this application, so they will not contribute indels to consider and will not be counted.", required=false)
PlatformUnitFilterHelper dummy;
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false;
//@Argument(fullName="blacklistedLanes", shortName="BL",
// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
//PlatformUnitFilterHelper dummy;
@Hidden
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",
required=false) Boolean DEBUG = false;
@Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
"May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200;
"May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+
"window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+
"than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+
"if window is too small",required=false) int WINDOW_SIZE = 200;
@Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
" the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
private WindowContext tumor_context;
private WindowContext normal_context;
private int currentContigIndex = -1;

View File

@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray {
public PhasingRead(int length, int mappingQual) {
super(length);
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual));
this.baseProbs = new PreciseNonNegativeDouble[length];
Arrays.fill(this.baseProbs, null);

View File

@ -44,12 +44,12 @@ public class RefSeqDataParser {
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
Map<String, String> entriesToNames = new HashMap<String, String>();
Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
if (numRecords != null) {
int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1);
if (numRecords != -1) {
boolean done = false;
if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1":
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
done = true;
@ -59,14 +59,14 @@ public class RefSeqDataParser {
if (!done) {
for (int i = 1; i <= numRecords; i++) {
String key = nameKeyToUseMultiplePrefix + i;
String name = vc.getAttributeAsStringNoException(key);
String name = vc.getAttributeAsString(key, null);
if (name != null)
entriesToNames.put(key, name);
}
}
}
else { // no entry with the # of records:
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
}

View File

@ -76,6 +76,42 @@ import java.util.Map;
* <h2>Output</h2>
* <p>
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
*
* The first 20 lines of such a file is shown below.
* * The file begins with a series of comment lines describing:
* ** The number of counted loci
* ** The number of counted bases
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
*
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
*
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*
* <pre>
* # Counted Sites 19451059
* # Counted Bases 56582018
* # Skipped Sites 82666
* # Fraction Skipped 1 / 235 bp
* ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
* SRR006446,11,65,CA,9,1,10
* SRR006446,11,48,TA,10,0,40
* SRR006446,11,67,AA,27,0,40
* SRR006446,11,61,GA,11,1,10
* SRR006446,12,34,CA,47,1,17
* SRR006446,12,30,GA,52,1,17
* SRR006446,12,36,AA,352,1,25
* SRR006446,12,17,TA,182,11,12
* SRR006446,11,48,TG,2,0,40
* SRR006446,11,67,AG,1,0,40
* SRR006446,12,34,CG,9,0,40
* SRR006446,12,30,GG,43,0,40
* ERR001876,4,31,AG,1,0,40
* ERR001876,4,31,AT,2,2,1
* ERR001876,4,31,CA,1,0,40
* </pre>
* </p>
*
* <h2>Examples</h2>

View File

@ -61,7 +61,7 @@ import java.util.List;
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
*</pre>
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
*
*<pre>
* Valid // amplicon is valid
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
@ -72,10 +72,10 @@ import java.util.List;
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
* INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
* </p>
* </pre></p>
*
* <h2>Examples</h2>
* <pre></pre>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T ValidationAmplicons

View File

@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.*;
import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche;
@ -24,6 +25,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@ -53,7 +55,23 @@ import java.util.*;
*
* <h2>Output</h2>
* <p>
* Evaluation tables.
* Evaluation tables detailing the results of the eval modules which were applied.
* For example:
* <pre>
* output.eval.gatkreport:
* ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
* CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ...
* CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ...
* CountVariants dbsnp CpG eval none known 65900028 47068 0 47068 0.00071423 ...
* CountVariants dbsnp CpG eval none novel 65900028 88702 0 88702 0.00134601 ...
* CountVariants dbsnp all eval none all 65900028 330818 0 330818 0.00502000 ...
* CountVariants dbsnp all eval none known 65900028 120685 0 120685 0.00183133 ...
* CountVariants dbsnp all eval none novel 65900028 210133 0 210133 0.00318866 ...
* CountVariants dbsnp non_CpG eval none all 65900028 195048 0 195048 0.00295976 ...
* CountVariants dbsnp non_CpG eval none known 65900028 73617 0 73617 0.00111710 ...
* CountVariants dbsnp non_CpG eval none novel 65900028 121431 0 121431 0.00184265 ...
* ...
* </pre>
* </p>
*
* <h2>Examples</h2>
@ -147,12 +165,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false)
private String TRANCHE_FILENAME = null;
@Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false)
private File ancestralAlignmentsFile = null;
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
private boolean requireStrictAlleleMatch = false;
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
@ -230,16 +248,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
jexlExpressions.add(sjexl);
}
// Add select expressions for anything in the tranches file
if ( TRANCHE_FILENAME != null ) {
// we are going to build a few select names automatically from the tranches file
for ( Tranche t : Tranche.readTranches(new File(TRANCHE_FILENAME)) ) {
logger.info("Adding select for all variant above the pCut of : " + t);
SELECT_EXPS.add(String.format(VariantRecalibrator.VQS_LOD_KEY + " >= %.2f", t.minVQSLod));
SELECT_NAMES.add(String.format("TS-%.2f", t.ts));
}
}
// Initialize the set of stratifications and evaluations to use
stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
Set<Class<? extends VariantEvaluator>> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
@ -371,16 +379,16 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
if ( matchingComps.size() == 0 )
return null;
// find the comp which matches the alternate allele from eval
// find the comp which matches both the reference allele and alternate allele from eval
Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0);
for ( VariantContext comp : matchingComps ) {
Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0);
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp)) )
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) )
return comp;
}
// if none match, just return the first one
return matchingComps.get(0);
// if none match, just return the first one unless we require a strict match
return (requireStrictAlleleMatch ? null : matchingComps.get(0));
}
public Integer treeReduce(Integer lhs, Integer rhs) { return null; }

View File

@ -22,9 +22,6 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
@DataPoint(description = "number of eval SNP sites")
long nEvalVariants = 0;
@DataPoint(description = "number of comp SNP sites")
long nCompVariants = 0;
@DataPoint(description = "number of eval sites outside of comp sites")
long novelSites = 0;
@ -75,10 +72,9 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
}
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
boolean evalIsGood = eval != null && eval.isVariant();
boolean compIsGood = comp != null && comp.isNotFiltered() && (eval == null || comp.getType() == eval.getType());
boolean evalIsGood = eval != null && eval.isPolymorphic();
boolean compIsGood = comp != null && comp.isNotFiltered();
if (compIsGood) nCompVariants++; // count the number of comp events
if (evalIsGood) nEvalVariants++; // count the number of eval events
if (compIsGood && evalIsGood) {

View File

@ -100,21 +100,22 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
// added in a proxy check for monomorphic status here.
// Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call.
if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() + vc1.getNoCallCount() == vc1.getNSamples()) ) {
if ( vc1.isMonomorphic() ) {
nRefLoci++;
} else {
switch (vc1.getType()) {
case NO_VARIATION:
// shouldn't get here
break;
case SNP:
nVariantLoci++;
nSNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
break;
case MNP:
nVariantLoci++;
nMNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
break;
case INDEL:
nVariantLoci++;
@ -136,7 +137,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
String refStr = vc1.getReference().getBaseString().toUpperCase();
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null;
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null;
// if (aaStr.equals(".")) {
// aaStr = refStr;
// }

View File

@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
}
public static Double getPQ(Genotype gt) {
return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY);
Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
return d == -1 ? null : d;
}
public static boolean topMatchesTop(AllelePair b1, AllelePair b2) {

View File

@ -90,18 +90,19 @@ public class IndelLengthHistogram extends VariantEvaluator {
public int getComparisonOrder() { return 1; } // need only the evals
public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( ! vc1.isBiallelic() && vc1.isIndel() ) {
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
return vc1.toString(); // biallelic sites are output
}
if ( vc1.isIndel() ) {
if ( vc1.isIndel() && vc1.isPolymorphic() ) {
if ( ! vc1.isBiallelic() ) {
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
return vc1.toString(); // biallelic sites are output
}
// only count simple insertions/deletions, not complex indels
if ( vc1.isSimpleInsertion() ) {
indelHistogram.update(vc1.getAlternateAllele(0).length());
} else if ( vc1.isSimpleDeletion() ) {
indelHistogram.update(-vc1.getReference().length());
} else {
throw new ReviewedStingException("Indel type that is not insertion or deletion.");
}
}

View File

@ -270,7 +270,7 @@ public class IndelStatistics extends VariantEvaluator {
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (eval != null ) {
if (eval != null && eval.isPolymorphic()) {
if ( indelStats == null ) {
indelStats = new IndelStats(eval);
}

View File

@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
if ( eval.hasGenotypes() )
ac = eval.getChromosomeCount(eval.getAlternateAllele(0));
else if ( eval.hasAttribute("AC") ) {
ac = Integer.valueOf(eval.getAttributeAsString("AC"));
ac = eval.getAttributeAsInt("AC", -1);
}
if ( ac != -1 ) {
@ -166,7 +166,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
}
}
if ( eval.isSNP() && eval.isBiallelic() && metrics != null ) {
if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) {
metrics.incrValue(eval);
}
}

View File

@ -37,77 +37,74 @@ public class ThetaVariantEvaluator extends VariantEvaluator {
}
public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) {
if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) {
return null; //no interesting sites
}
if (vc.hasGenotypes()) {
//this maps allele to a count
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
//this maps allele to a count
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
int numHetsHere = 0;
float numGenosHere = 0;
int numIndsHere = 0;
int numHetsHere = 0;
float numGenosHere = 0;
int numIndsHere = 0;
for (Genotype genotype : vc.getGenotypes().values()) {
numIndsHere++;
if (!genotype.isNoCall()) {
//increment stats for heterozygosity
if (genotype.isHet()) {
numHetsHere++;
}
for (Genotype genotype : vc.getGenotypes().values()) {
numIndsHere++;
if (!genotype.isNoCall()) {
//increment stats for heterozygosity
if (genotype.isHet()) {
numHetsHere++;
}
numGenosHere++;
//increment stats for pairwise mismatches
numGenosHere++;
//increment stats for pairwise mismatches
for (Allele allele : genotype.getAlleles()) {
if (allele.isNonNull() && allele.isCalled()) {
String alleleString = allele.toString();
alleleCounts.putIfAbsent(alleleString, 0);
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
}
for (Allele allele : genotype.getAlleles()) {
if (allele.isNonNull() && allele.isCalled()) {
String alleleString = allele.toString();
alleleCounts.putIfAbsent(alleleString, 0);
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
}
}
}
if (numGenosHere > 0) {
//only if have one called genotype at least
this.numSites++;
}
if (numGenosHere > 0) {
//only if have one called genotype at least
this.numSites++;
this.totalHet += numHetsHere / numGenosHere;
this.totalHet += numHetsHere / numGenosHere;
//compute based on num sites
float harmonicFactor = 0;
for (int i = 1; i <= numIndsHere; i++) {
harmonicFactor += 1.0 / i;
}
this.thetaRegionNumSites += 1.0 / harmonicFactor;
//compute based on num sites
float harmonicFactor = 0;
for (int i = 1; i <= numIndsHere; i++) {
harmonicFactor += 1.0 / i;
}
this.thetaRegionNumSites += 1.0 / harmonicFactor;
//now compute pairwise mismatches
float numPairwise = 0;
float numDiffs = 0;
for (String allele1 : alleleCounts.keySet()) {
int allele1Count = alleleCounts.get(allele1);
//now compute pairwise mismatches
float numPairwise = 0;
float numDiffs = 0;
for (String allele1 : alleleCounts.keySet()) {
int allele1Count = alleleCounts.get(allele1);
for (String allele2 : alleleCounts.keySet()) {
if (allele1.compareTo(allele2) < 0) {
continue;
}
if (allele1 .compareTo(allele2) == 0) {
numPairwise += allele1Count * (allele1Count - 1) * .5;
for (String allele2 : alleleCounts.keySet()) {
if (allele1.compareTo(allele2) < 0) {
continue;
}
if (allele1 .compareTo(allele2) == 0) {
numPairwise += allele1Count * (allele1Count - 1) * .5;
}
else {
int allele2Count = alleleCounts.get(allele2);
numPairwise += allele1Count * allele2Count;
numDiffs += allele1Count * allele2Count;
}
}
else {
int allele2Count = alleleCounts.get(allele2);
numPairwise += allele1Count * allele2Count;
numDiffs += allele1Count * allele2Count;
}
}
}
if (numPairwise > 0) {
this.totalAvgDiffs += numDiffs / numPairwise;
}
if (numPairwise > 0) {
this.totalAvgDiffs += numDiffs / numPairwise;
}
}

View File

@ -40,7 +40,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
}
public void updateTiTv(VariantContext vc, boolean updateStandard) {
if (vc != null && vc.isSNP() && vc.isBiallelic()) {
if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) {
if (VariantContextUtils.isTransition(vc)) {
if (updateStandard) nTiInComp++;
else nTi++;
@ -49,18 +49,14 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
else nTv++;
}
String refStr = vc.getReference().getBaseString().toUpperCase();
String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase();
if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) {
BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]);
//System.out.println(refStr + " " + vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase() + " " + aaSubType);
if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSITION) {
nTiDerived++;
} else if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSVERSION) {
nTvDerived++;
if (vc.hasAttribute("ANCESTRALALLELE")) {
final String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", "null").toUpperCase();
if ( ! aaStr.equals(".") ) {
switch ( BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0] ) ) {
case TRANSITION: nTiDerived++; break;
case TRANSVERSION: nTvDerived++; break;
default: break;
}
}
}
}

View File

@ -117,7 +117,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
public SiteStatus calcSiteStatus(VariantContext vc) {
if ( vc == null ) return SiteStatus.NO_CALL;
if ( vc.isFiltered() ) return SiteStatus.FILTERED;
if ( ! vc.isVariant() ) return SiteStatus.MONO;
if ( vc.isMonomorphic() ) return SiteStatus.MONO;
if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes
if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
int ac = 0;
@ -130,10 +131,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
//// System.out.printf(" ac = %d%n", ac);
}
else
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY);
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO;
} else if ( vc.hasGenotypes() ) {
return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO;
} else {
return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do
//return SiteStatus.NO_CALL; // we can't figure out what to do

View File

@ -232,7 +232,7 @@ public class VariantQualityScore extends VariantEvaluator {
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
final String interesting = null;
if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
if( titvStats == null ) { titvStats = new TiTvStats(); }
titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval));

View File

@ -44,7 +44,7 @@ public class AlleleCount extends VariantStratifier {
if (eval != null) {
int AC = -1;
if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) {
AC = eval.getAttributeAsInt("AC");
AC = eval.getAttributeAsInt("AC", 0);
} else if ( eval.isVariant() ) {
for (Allele allele : eval.getAlternateAlleles())
AC = Math.max(AC, eval.getChromosomeCount(allele));

View File

@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier {
if (eval != null) {
try {
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3))));
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3))));
} catch (Exception e) {
return relevantStates;
}

View File

@ -90,8 +90,8 @@ public class Degeneracy extends VariantStratifier {
Integer frame = null;
if (eval.hasAttribute("refseq.functionalClass")) {
aa = eval.getAttributeAsString("refseq.variantAA");
frame = eval.getAttributeAsInt("refseq.frame");
aa = eval.getAttributeAsString("refseq.variantAA", null);
frame = eval.getAttributeAsInt("refseq.frame", 0);
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
int annotationId = 1;
String key;
@ -99,7 +99,7 @@ public class Degeneracy extends VariantStratifier {
do {
key = String.format("refseq.functionalClass_%d", annotationId);
String newtype = eval.getAttributeAsString(key);
String newtype = eval.getAttributeAsString(key, null);
if ( newtype != null &&
( type == null ||
@ -109,13 +109,13 @@ public class Degeneracy extends VariantStratifier {
type = newtype;
String aakey = String.format("refseq.variantAA_%d", annotationId);
aa = eval.getAttributeAsString(aakey);
aa = eval.getAttributeAsString(aakey, null);
if (aa != null) {
String framekey = String.format("refseq.frame_%d", annotationId);
if (eval.hasAttribute(framekey)) {
frame = eval.getAttributeAsInt(framekey);
frame = eval.getAttributeAsInt(framekey, 0);
}
}
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
@ -11,25 +12,34 @@ import java.util.List;
* Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation.
*/
public class FunctionalClass extends VariantStratifier {
@Override
public void initialize() {
states.add("all");
states.add("silent");
states.add("missense");
states.add("nonsense");
public enum FunctionalType {
silent,
missense,
nonsense
}
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
@Override
public void initialize() {
states.add("all");
for ( FunctionalType type : FunctionalType.values() )
states.add(type.name());
}
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
ArrayList<String> relevantStates = new ArrayList<String>();
relevantStates.add("all");
if (eval != null && eval.isVariant()) {
String type = null;
FunctionalType type = null;
if (eval.hasAttribute("refseq.functionalClass")) {
type = eval.getAttributeAsString("refseq.functionalClass");
try {
type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass", null));
} catch ( Exception e ) {} // don't error out if the type isn't supported
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
int annotationId = 1;
String key;
@ -37,24 +47,33 @@ public class FunctionalClass extends VariantStratifier {
do {
key = String.format("refseq.functionalClass_%d", annotationId);
String newtype = eval.getAttributeAsString(key);
if ( newtype != null && !newtype.equalsIgnoreCase("null") &&
( type == null ||
( type.equals("silent") && !newtype.equals("silent") ) ||
( type.equals("missense") && newtype.equals("nonsense") ) )
) {
type = newtype;
String newtypeStr = eval.getAttributeAsString(key, null);
if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) {
try {
FunctionalType newType = FunctionalType.valueOf(newtypeStr);
if ( type == null ||
( type == FunctionalType.silent && newType != FunctionalType.silent ) ||
( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) {
type = newType;
}
} catch ( Exception e ) {} // don't error out if the type isn't supported
}
annotationId++;
} while (eval.hasAttribute(key));
} else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName() ) ) {
SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName()).toString());
if ( snpEffType == SnpEff.EffectType.STOP_GAINED )
type = FunctionalType.nonsense;
else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING )
type = FunctionalType.missense;
else if ( snpEffType == SnpEff.EffectType.SYNONYMOUS_CODING )
type = FunctionalType.silent;
}
if (type != null) {
if (type.equals("silent")) { relevantStates.add("silent"); }
else if (type.equals("missense")) { relevantStates.add("missense"); }
else if (type.equals("nonsense")) { relevantStates.add("nonsense"); }
if ( type != null ) {
relevantStates.add(type.name());
}
}

View File

@ -277,7 +277,7 @@ public class VariantEvalUtils {
* @return a new VariantContext with just the requested samples
*/
public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection<String> sampleNames) {
VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values());
VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles());
HashMap<String, Object> newAts = new HashMap<String, Object>(vcsub.getAttributes());

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/12/11
*/
public class TrainingSet {
public RodBinding<VariantContext> rodBinding;
public boolean isKnown = false;
public boolean isTraining = false;
public boolean isAntiTraining = false;
public boolean isTruth = false;
public boolean isConsensus = false;
public double prior = 0.0;
protected final static Logger logger = Logger.getLogger(TrainingSet.class);
public TrainingSet( final RodBinding<VariantContext> rodBinding) {
this.rodBinding = rodBinding;
final Tags tags = rodBinding.getTags();
final String name = rodBinding.getName();
// Parse the tags to decide which tracks have which properties
if( tags != null ) {
isKnown = tags.containsKey("known") && tags.getValue("known").equals("true");
isTraining = tags.containsKey("training") && tags.getValue("training").equals("true");
isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true");
isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true");
isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true");
prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior );
}
// Report back to the user which tracks were found and the properties that were detected
if( !isConsensus && !isAntiTraining ) {
logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) );
} else if( isConsensus ) {
logger.info( String.format( "Found consensus track: %s", name) );
} else {
logger.info( String.format( "Found bad sites training track: %s", name) );
}
}
}

View File

@ -115,7 +115,7 @@ public class VQSRCalibrationCurve {
if ( vc.isFiltered() )
return 0.0;
else if ( vc.hasAttribute(VQSRQualKey) ) {
double qual = vc.getAttributeAsDouble(VQSRQualKey);
double qual = vc.getAttributeAsDouble(VQSRQualKey, 0.0);
return probTrueVariant(qual);
} else {
throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc);
@ -143,7 +143,7 @@ public class VQSRCalibrationCurve {
for ( int i = 0; i < log10Likelihoods.length; i++) {
double p = Math.pow(10, log10Likelihoods[i]);
double q = alpha * p + (1-alpha) * noInfoPr;
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q);
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey, 0.0), p, alpha, q);
updated[i] = Math.log10(q);
}

View File

@ -51,10 +51,10 @@ public class VariantDataManager {
private ExpandingArrayList<VariantDatum> data;
private final double[] meanVector;
private final double[] varianceVector; // this is really the standard deviation
public final ArrayList<String> annotationKeys;
public final List<String> annotationKeys;
private final VariantRecalibratorArgumentCollection VRAC;
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
protected final List<TrainingSet> trainingSets;
public VariantDataManager( final List<String> annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) {
this.data = null;
@ -62,6 +62,7 @@ public class VariantDataManager {
this.VRAC = VRAC;
meanVector = new double[this.annotationKeys.size()];
varianceVector = new double[this.annotationKeys.size()];
trainingSets = new ArrayList<TrainingSet>();
}
public void setData( final ExpandingArrayList<VariantDatum> data ) {
@ -104,6 +105,31 @@ public class VariantDataManager {
}
}
public void addTrainingSet( final TrainingSet trainingSet ) {
trainingSets.add( trainingSet );
}
public boolean checkHasTrainingSet() {
for( final TrainingSet trainingSet : trainingSets ) {
if( trainingSet.isTraining ) { return true; }
}
return false;
}
public boolean checkHasTruthSet() {
for( final TrainingSet trainingSet : trainingSets ) {
if( trainingSet.isTruth ) { return true; }
}
return false;
}
public boolean checkHasKnownSet() {
for( final TrainingSet trainingSet : trainingSets ) {
if( trainingSet.isKnown ) { return true; }
}
return false;
}
public ExpandingArrayList<VariantDatum> getTrainingData() {
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<VariantDatum>();
for( final VariantDatum datum : data ) {
@ -232,57 +258,35 @@ public class VariantDataManager {
return value;
}
public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap<String, Double> rodToPriorMap,
final List<RodBinding<VariantContext>> training, final List<RodBinding<VariantContext>> truth, final List<RodBinding<VariantContext>> known, final List<RodBinding<VariantContext>> badSites, final List<RodBinding<VariantContext>> resource) {
public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) {
datum.isKnown = false;
datum.atTruthSite = false;
datum.atTrainingSite = false;
datum.atAntiTrainingSite = false;
datum.prior = 2.0;
//BUGBUG: need to clean this up
for( final RodBinding<VariantContext> rod : training ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
for( final TrainingSet trainingSet : trainingSets ) {
for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.atTrainingSite = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
datum.isKnown = datum.isKnown || trainingSet.isKnown;
datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth;
datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining;
datum.prior = Math.max( datum.prior, trainingSet.prior );
datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 );
}
}
}
for( final RodBinding<VariantContext> rod : truth ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.atTruthSite = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : known ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.isKnown = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : resource ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : badSites ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( trainVC != null ) {
datum.atAntiTrainingSite = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining;
}
}
}
}
private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
}
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
for( final VariantDatum datum : data ) {
RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
@ -290,10 +294,4 @@ public class VariantDataManager {
(datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
}
}
private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
}
}

View File

@ -77,16 +77,15 @@ import java.util.*;
* <p>
* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
*
* <h2>Examples</h2>
* <h2>Example</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T VariantRecalibrator \
* -R reference/human_g1k_v37.fasta \
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
* -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
* -known:prior=8.0 dbsnp_132.b37.vcf \
* -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
* -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
* -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
* -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
* -recalFile path/to/output.recal \
* -tranchesFile path/to/output.tranches \
@ -112,34 +111,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
public List<RodBinding<VariantContext>> input;
/**
* Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
*/
@Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
public List<RodBinding<VariantContext>> training;
/**
* When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
* Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
*/
@Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
public List<RodBinding<VariantContext>> truth;
/**
* The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
* The output metrics are stratified by known status in order to aid in comparisons with other call sets.
*/
@Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
public List<RodBinding<VariantContext>> known = Collections.emptyList();
/**
* In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
* with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
*/
@Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
public List<RodBinding<VariantContext>> badSites = Collections.emptyList();
/**
* Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
* Any set of VCF files to use as lists of training, truth, or known sites.
* Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
* Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
* Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
* Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
*/
@Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
public List<RodBinding<VariantContext>> resource = Collections.emptyList();
@ -205,7 +181,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private PrintStream tranchesStream;
private final Set<String> ignoreInputFilterSet = new TreeSet<String>();
private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
private final HashMap<String, Double> rodToPriorMap = new HashMap<String, Double>();
//---------------------------------------------------------------------------------------------------------------
//
@ -227,18 +202,15 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
}
final ArrayList<RodBinding<VariantContext>> allInputBindings = new ArrayList<RodBinding<VariantContext>>();
allInputBindings.addAll(truth);
allInputBindings.addAll(training);
allInputBindings.addAll(known);
allInputBindings.addAll(badSites);
allInputBindings.addAll(resource);
for( final RodBinding<VariantContext> rod : allInputBindings ) {
try {
rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
} catch( NumberFormatException e ) {
throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
}
for( RodBinding<VariantContext> rod : resource ) {
dataManager.addTrainingSet( new TrainingSet( rod ) );
}
if( !dataManager.checkHasTrainingSet() ) {
throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
}
if( !dataManager.checkHasTruthSet() ) {
throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
}
}
@ -270,7 +242,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
datum.isTransition = datum.isSNP && VariantContextUtils.isTransition(vc);
// Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC, rodToPriorMap, training, truth, known, badSites, resource ); // BUGBUG: need to clean this up to be a class, not a list of all the rod bindings
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC );
double priorFactor = QualityUtils.qualToProb( datum.prior );
//if( PERFORM_PROJECT_CONSENSUS ) { // BUGBUG: need to resurrect this functionality?
// final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );

View File

@ -234,16 +234,47 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
return 0;
List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
List<VariantContext> preMergedVCs = new ArrayList<VariantContext>();
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
// iterate over the types so that it's deterministic
for ( VariantContext.Type type : VariantContext.Type.values() ) {
if ( VCsByType.containsKey(type) )
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
preMergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
// se have records merged but separated by type. If a particular record is for example a snp but all alleles are a subset of an existing mixed record,
// we will still merge those records.
if (preMergedVCs.size() > 1) {
for (VariantContext vc1 : preMergedVCs) {
VariantContext newvc = vc1;
boolean merged = false;
for (int k=0; k < mergedVCs.size(); k++) {
VariantContext vc2 = mergedVCs.get(k);
if (VariantContextUtils.allelesAreSubset(vc1,vc2) || VariantContextUtils.allelesAreSubset(vc2,vc1)) {
// all alleles of vc1 are contained in vc2 but they are of different type (say, vc1 is snp, vc2 is complex): try to merget v1 into v2
List<VariantContext> vcpair = new ArrayList<VariantContext>();
vcpair.add(vc1);
vcpair.add(vc2);
newvc = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcpair,
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC);
mergedVCs.set(k,newvc);
merged = true;
break;
}
}
if (!merged)
mergedVCs.add(vc1);
}
}
else {
mergedVCs = preMergedVCs;
}
for ( VariantContext mergedVC : mergedVCs ) {
// only operate at the start of events
if ( mergedVC == null )

View File

@ -99,7 +99,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
writer = new StandardVCFWriter(file, false);
writer = new StandardVCFWriter(file, getMasterSequenceDictionary(), false);
writer.writeHeader(vcfHeader);
}

View File

@ -75,7 +75,7 @@ public class RandomlySplitVariants extends RodWalker<Integer, Integer> {
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
vcfWriter1.writeHeader(new VCFHeader(hInfo, samples));
vcfWriter2 = new StandardVCFWriter(file2, true);
vcfWriter2 = new StandardVCFWriter(file2, getMasterSequenceDictionary(), true);
vcfWriter2.writeHeader(new VCFHeader(hInfo, samples));
}

View File

@ -145,10 +145,9 @@ import java.util.*;
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -SM family.yaml \
* -family NA12891+NA12892=NA12878 \
* -mvq 50
* -mvq 50 \
* -o violations.vcf
*
* Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
@ -265,17 +264,17 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private File AF_FILE = new File("");
@Hidden
@Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
@Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false)
private File FAMILY_STRUCTURE_FILE = null;
/**
* String formatted as dad+mom=child where these parameters determine which sample names are examined.
*/
@Argument(fullName="family_structure", shortName="family", doc="Deprecated; use the -SM argument instead", required=false)
@Argument(fullName="family_structure", shortName="family", doc="string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String FAMILY_STRUCTURE = "";
/**
* Sample metadata information will be taken from a YAML file (see the -SM argument).
* This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure.
*/
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false)
private Boolean MENDELIAN_VIOLATIONS = false;
@ -306,7 +305,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Hidden
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
@Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false)
private String outMVFile = null;
/* Private class used to store the intermediate variants in the integer random selection process */
@ -575,7 +574,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
// ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
// We then pick a variant with probablity AF*desiredFraction
if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
double af;
double afBoost = 1.0;

View File

@ -56,11 +56,6 @@ import java.util.Set;
* A variant set to filter.
* </p>
*
* <h2>Output</h2>
* <p>
* A filtered VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \

View File

@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
* Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
* Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes)
*
* <p>
* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes).
@ -57,7 +57,16 @@ import java.util.*;
*
* <h2>Output</h2>
* <p>
* An annotated VCF.
* An annotated VCF. Additionally, a table like the following will be output:
* <pre>
* Total number of samples assayed: 185
* Total number of records processed: 152
* Number of Hardy-Weinberg violations: 34 (22%)
* Number of no-call violations: 12 (7%)
* Number of homozygous variant violations: 0 (0%)
* Number of records passing all filters: 106 (69%)
* Number of passing records that are polymorphic: 98 (92%)
* </pre>
* </p>
*
* <h2>Examples</h2>

View File

@ -192,7 +192,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
if ( getters.containsKey(field) ) {
val = getters.get(field).get(vc);
} else if ( vc.hasAttribute(field) ) {
val = vc.getAttributeAsString(field);
val = vc.getAttributeAsString(field, null);
} else if ( isWildCard(field) ) {
Set<String> wildVals = new HashSet<String>();
for ( Map.Entry<String,Object> elt : vc.getAttributes().entrySet()) {
@ -309,6 +309,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } });
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });

View File

@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
@Override
public int hashCode() {
return (int)( start << 16 + stop << 4 + contigIndex );
return start << 16 | stop << 4 | contigIndex;
}

View File

@ -1056,42 +1056,30 @@ public class MathUtils {
}
static public double softMax(final double x, final double y) {
if (Double.isInfinite(x))
return y;
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
if (Double.isInfinite(y))
return x;
// slow exact version:
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
if (y >= x + MAX_JACOBIAN_TOLERANCE)
return y;
if (x >= y + MAX_JACOBIAN_TOLERANCE)
return x;
double diff = x-y;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
//double diff = Math.abs(x-y);
double diff = x-y;
double t1 =x;
if (diff<0) { //
t1 = y;
diff= -diff;
}
// t has max(x,y), diff has abs(x-y)
// we have pre-stored correction for 0,0.1,0.2,... 10.0
//int ind = (int)Math.round(diff*INV_JACOBIAN_LOG_TABLE_STEP);
int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
// gdebug+
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
//gdebug-
return t1+jacobianLogTable[ind];
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
}
if (diff > MAX_JACOBIAN_TOLERANCE)
return x;
else if (diff < -MAX_JACOBIAN_TOLERANCE)
return y;
else if (diff >= 0) {
int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
return x + jacobianLogTable[ind];
}
else {
int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
return y + jacobianLogTable[ind];
}
}
public static double phredScaleToProbability (byte q) {
return Math.pow(10,(-q)/10.0);

View File

@ -9,14 +9,17 @@ import net.sf.samtools.SAMUtils;
* @author Kiran Garimella
*/
public class QualityUtils {
public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE;
public final static double MIN_REASONABLE_ERROR = 0.0001;
public final static byte MAX_REASONABLE_Q_SCORE = 40;
public final static byte MIN_USABLE_Q_SCORE = 6;
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
private static double qualToErrorProbCache[] = new double[256];
static {
for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw((byte)i);
}
/**
* Private constructor. No instantiating this class!
*/
@ -33,10 +36,6 @@ public class QualityUtils {
return 1.0 - qualToErrorProb(qual);
}
static public double qualToProb(int qual) {
return qualToProb( (double)qual );
}
static public double qualToProb(double qual) {
return 1.0 - Math.pow(10.0, qual/(-10.0));
}
@ -48,10 +47,14 @@ public class QualityUtils {
* @param qual a quality score (0-40)
* @return a probability (0.0-1.0)
*/
static public double qualToErrorProb(byte qual) {
static public double qualToErrorProbRaw(byte qual) {
return Math.pow(10.0, ((double) qual)/-10.0);
}
static public double qualToErrorProb(byte qual) {
return qualToErrorProbCache[qual];
}
/**
* Convert a probability to a quality score. Note, this is capped at Q40.
*
@ -110,88 +113,4 @@ public class QualityUtils {
//return (byte) Math.min(qual, maxQual);
return (byte) Math.max(Math.min(qual, maxQual), 1);
}
/**
* Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field.
* Note: the highest probability this function can encode is 64%, so this function should only never be used on the best base hypothesis.
* Another note: the probability encoded here gets rounded to the nearest 1%.
*
* @param baseIndex the base index
* @param prob the base probability
* @return a byte containing the index and the probability
*/
static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) {
byte compressedQual = 0;
compressedQual = (byte) baseIndex;
byte cprob = (byte) (100.0*prob);
byte qualmask = (byte) 252;
compressedQual += ((cprob << 2) & qualmask);
return compressedQual;
}
/**
* From a compressed base, extract the base index (0:A, 1:C, 2:G, 3:T)
*
* @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
* @return base index
*/
static public int compressedQualityToBaseIndex(byte compressedQual) {
return (int) (compressedQual & 0x3);
}
/**
* From a compressed base, extract the base probability
*
* @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
* @return the probability
*/
static public double compressedQualityToProb(byte compressedQual) {
// Because java natives are signed, extra care must be taken to avoid
// shifting a 1 into the sign bit in the implicit promotion of 2 to an int.
int x2 = ((int) compressedQual) & 0xff;
x2 = (x2 >>> 2);
return ((double) x2)/100.0;
}
/**
* Return the complement of a compressed quality
*
* @param compressedQual the compressed quality score (as returned by baseAndProbToCompressedQuality)
* @return the complementary compressed quality
*/
static public byte complementCompressedQuality(byte compressedQual) {
int baseIndex = compressedQualityToBaseIndex(compressedQual);
double prob = compressedQualityToProb(compressedQual);
return baseAndProbToCompressedQuality(BaseUtils.complementIndex(baseIndex), prob);
}
/**
* Return the reverse complement of a byte array of compressed qualities
*
* @param compressedQuals a byte array of compressed quality scores
* @return the reverse complement of the byte array
*/
static public byte[] reverseComplementCompressedQualityArray(byte[] compressedQuals) {
byte[] rcCompressedQuals = new byte[compressedQuals.length];
for (int pos = 0; pos < compressedQuals.length; pos++) {
rcCompressedQuals[compressedQuals.length - pos - 1] = complementCompressedQuality(compressedQuals[pos]);
}
return rcCompressedQuals;
}
/**
* Return the reverse of a byte array of qualities (compressed or otherwise)
* @param quals the array of bytes to be reversed
* @return the reverse of the quality array
*/
static public byte[] reverseQualityArray( byte[] quals ) {
return Utils.reverse(quals); // no sense in duplicating functionality
}
}

View File

@ -190,11 +190,21 @@ public class SampleUtils {
}
public static List<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
/**
* Returns a new set of samples, containing a final list of samples expanded from sampleArgs
*
* Each element E of sampleArgs can either be a literal sample name or a file. For each E,
* we try to read a file named E from disk, and if possible all lines from that file are expanded
* into unique sample names.
*
* @param sampleArgs
* @return
*/
public static Set<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
if (sampleArgs != null) {
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
// sample list set, and treat the entries as if they had been specified on the command line.
List<String> samplesFromFiles = new ArrayList<String>();
Set<String> samplesFromFiles = new HashSet<String>();
for (String SAMPLE_EXPRESSION : sampleArgs) {
File sampleFile = new File(SAMPLE_EXPRESSION);
@ -203,7 +213,7 @@ public class SampleUtils {
List<String> lines = reader.readLines();
for (String line : lines) {
samplesFromFiles.add(line);
samplesFromFiles.add(line.trim());
}
} catch (FileNotFoundException e) {
samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
@ -212,7 +222,8 @@ public class SampleUtils {
return samplesFromFiles;
}
return new ArrayList<String>();
return new HashSet<String>();
}
public static Set<String> getSamplesFromCommandLineInput(Collection<String> vcfSamples, Collection<String> sampleExpressions) {

View File

@ -12,19 +12,35 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.ArrayList;
/**
* TODO FOR CHRIS HARTL
* Allows for reading in RefSeq information
*
* <p>
* Codec Description
* Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop,
* strandedness of transcription.
* </p>
*
* <p>
* See also: link to file specification
* Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here
* <a href="http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq">http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq</a>
* </p>
* <h2> Usage </h2>
* The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example
* <pre>
* -refSeqBinding:REFSEQ /path/to/refSeq.txt
* </pre>
*
* You will need to consult individual walkers for the binding name ("refSeqBinding", above)
*
* <h2>File format example</h2>
* If you want to define your own file for use, the format is (tab delimited):
* bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete)
* and exon frames, for example:
* <pre>
* 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
* </pre>
* for more information see <a href="http://skip.ucsc.edu/cgi-bin/hgTables?hgsid=5651&hgta_doSchemaDb=mm8&hgta_doSchemaTable=refGene">here</a>
* <p>
* A BAM file containing <b>exactly one sample</b>.
*
* </p>
*
* @author Mark DePristo

View File

@ -1,282 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.snpEff;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.gatk.refdata.SelfScopingFeatureCodec;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
/**
* Codec for decoding the output format of the SnpEff variant effect predictor tool
*
* <p>
* This format has 23 tab-delimited fields:
*
* <pre>
* Chromosome
* Position
* Reference
* Change
* Change Type: {SNP, MNP, INS, DEL}
* Zygosity: {Hom, Het}
* Quality
* Coverage
* Warnings
* Gene ID
* Gene Name
* Bio Type
* Transcript ID
* Exon ID
* Exon Rank
* Effect
* Old/New Amino Acid
* Old/New Codon
* Codon Num
* CDS Size
* Codons Around
* Amino Acids Around
* Custom Interval ID
* </pre>
* Note that we treat all except the Chromosome, Position, and Effect fields as optional.
* </p>
*
* <p>
* See also: @see <a href="http://snpeff.sourceforge.net/">SNPEff project page</a>
* </p>
*
* @author David Roazen
* @since 2011
*/
public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec {
public static final int EXPECTED_NUMBER_OF_FIELDS = 23;
public static final String FIELD_DELIMITER_PATTERN = "\\t";
public static final String EFFECT_FIELD_DELIMITER_PATTERN = "[,:]";
public static final String HEADER_LINE_START = "# ";
public static final String[] HEADER_FIELD_NAMES = { "Chromo",
"Position",
"Reference",
"Change",
"Change type",
"Homozygous",
"Quality",
"Coverage",
"Warnings",
"Gene_ID",
"Gene_name",
"Bio_type",
"Trancript_ID", // yes, this is how it's spelled in the SnpEff output
"Exon_ID",
"Exon_Rank",
"Effect",
"old_AA/new_AA",
"Old_codon/New_codon",
"Codon_Num(CDS)",
"CDS_size",
"Codons around",
"AAs around",
"Custom_interval_ID"
};
// The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line:
public static final int[] REQUIRED_FIELDS = { 0, 1, 15 };
public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE";
public Feature decodeLoc ( String line ) {
return decode(line);
}
public Feature decode ( String line ) {
String[] tokens = line.split(FIELD_DELIMITER_PATTERN, -1);
if ( tokens.length != EXPECTED_NUMBER_OF_FIELDS ) {
throw new TribbleException.InvalidDecodeLine("Line does not have the expected (" + EXPECTED_NUMBER_OF_FIELDS +
") number of fields: found " + tokens.length + " fields.", line);
}
try {
trimAllFields(tokens);
checkForRequiredFields(tokens, line);
String contig = tokens[0];
long position = Long.parseLong(tokens[1]);
String reference = tokens[2].isEmpty() ? null : tokens[2];
String change = tokens[3].isEmpty() ? null : tokens[3];
ChangeType changeType = tokens[4].isEmpty() ? null : ChangeType.valueOf(tokens[4]);
Zygosity zygosity = tokens[5].isEmpty() ? null : Zygosity.valueOf(tokens[5]);
Double quality = tokens[6].isEmpty() ? null : Double.parseDouble(tokens[6]);
Long coverage = tokens[7].isEmpty() ? null : Long.parseLong(tokens[7]);
String warnings = tokens[8].isEmpty() ? null : tokens[8];
String geneID = tokens[9].isEmpty() ? null : tokens[9];
String geneName = tokens[10].isEmpty() ? null : tokens[10];
String bioType = tokens[11].isEmpty() ? null : tokens[11];
String transcriptID = tokens[12].isEmpty() ? null : tokens[12];
String exonID = tokens[13].isEmpty() ? null : tokens[13];
Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]);
boolean isNonCodingGene = isNonCodingGene(tokens[15]);
// Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present,
// otherwise split it into two subfields. We need this limit to prevent the extra effect-related information
// in the final field (when present) from being inappropriately tokenized:
int effectFieldTokenLimit = isNonCodingGene ? 3 : 2;
String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit);
EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene);
String effectExtraInformation = parseEffectExtraInformation(effectFieldTokens, isNonCodingGene);
String oldAndNewAA = tokens[16].isEmpty() ? null : tokens[16];
String oldAndNewCodon = tokens[17].isEmpty() ? null : tokens[17];
Integer codonNum = tokens[18].isEmpty() ? null : Integer.parseInt(tokens[18]);
Integer cdsSize = tokens[19].isEmpty() ? null : Integer.parseInt(tokens[19]);
String codonsAround = tokens[20].isEmpty() ? null : tokens[20];
String aasAround = tokens[21].isEmpty() ? null : tokens[21];
String customIntervalID = tokens[22].isEmpty() ? null : tokens[22];
return new SnpEffFeature(contig, position, reference, change, changeType, zygosity, quality, coverage,
warnings, geneID, geneName, bioType, transcriptID, exonID, exonRank, isNonCodingGene,
effect, effectExtraInformation, oldAndNewAA, oldAndNewCodon, codonNum, cdsSize,
codonsAround, aasAround, customIntervalID);
}
catch ( NumberFormatException e ) {
throw new TribbleException.InvalidDecodeLine("Error parsing a numeric field : " + e.getMessage(), line);
}
catch ( IllegalArgumentException e ) {
throw new TribbleException.InvalidDecodeLine("Illegal value in field: " + e.getMessage(), line);
}
}
private void trimAllFields ( String[] tokens ) {
for ( int i = 0; i < tokens.length; i++ ) {
tokens[i] = tokens[i].trim();
}
}
private void checkForRequiredFields ( String[] tokens, String line ) {
for ( int requiredFieldIndex : REQUIRED_FIELDS ) {
if ( tokens[requiredFieldIndex].isEmpty() ) {
throw new TribbleException.InvalidDecodeLine("Line is missing required field \"" +
HEADER_FIELD_NAMES[requiredFieldIndex] + "\"",
line);
}
}
}
private boolean isNonCodingGene ( String effectField ) {
return effectField.startsWith(NON_CODING_GENE_FLAG);
}
private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) {
String effectName = "";
// If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield,
// otherwise it will be in the first subfield:
if ( effectFieldTokens.length > 1 && isNonCodingGene ) {
effectName = effectFieldTokens[1].trim();
}
else {
effectName = effectFieldTokens[0].trim();
}
return EffectType.valueOf(effectName);
}
private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) {
// The extra effect-related information, if present, will always be the last subfield:
if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) {
return effectFieldTokens[effectFieldTokens.length - 1].trim();
}
return null;
}
public Class<SnpEffFeature> getFeatureType() {
return SnpEffFeature.class;
}
public Object readHeader ( LineReader reader ) {
String headerLine = "";
try {
headerLine = reader.readLine();
}
catch ( IOException e ) {
throw new TribbleException("Unable to read header line from input file.");
}
validateHeaderLine(headerLine);
return headerLine;
}
private void validateHeaderLine ( String headerLine ) {
if ( headerLine == null || ! headerLine.startsWith(HEADER_LINE_START) ) {
throw new TribbleException.InvalidHeader("Header line does not start with " + HEADER_LINE_START);
}
String[] headerTokens = headerLine.substring(HEADER_LINE_START.length()).split(FIELD_DELIMITER_PATTERN);
if ( headerTokens.length != EXPECTED_NUMBER_OF_FIELDS ) {
throw new TribbleException.InvalidHeader("Header line does not contain headings for the expected number (" +
EXPECTED_NUMBER_OF_FIELDS + ") of columns.");
}
for ( int columnIndex = 0; columnIndex < headerTokens.length; columnIndex++ ) {
if ( ! HEADER_FIELD_NAMES[columnIndex].equals(headerTokens[columnIndex]) ) {
throw new TribbleException.InvalidHeader("Header field #" + columnIndex + ": Expected \"" +
HEADER_FIELD_NAMES[columnIndex] + "\" but found \"" +
headerTokens[columnIndex] + "\"");
}
}
}
public boolean canDecode ( final File potentialInput ) {
try {
LineReader reader = new AsciiLineReader(new FileInputStream(potentialInput));
readHeader(reader);
}
catch ( Exception e ) {
return false;
}
return true;
}
}

View File

@ -1,115 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.snpEff;
/**
* A set of constants associated with the SnpEff codec.
*
* @author David Roazen
*/
public class SnpEffConstants {
// Possible SnpEff biological effects and their associated impacts:
public enum EffectType {
START_GAINED (EffectImpact.HIGH),
START_LOST (EffectImpact.HIGH),
EXON_DELETED (EffectImpact.HIGH),
FRAME_SHIFT (EffectImpact.HIGH),
STOP_GAINED (EffectImpact.HIGH),
STOP_LOST (EffectImpact.HIGH),
SPLICE_SITE_ACCEPTOR (EffectImpact.HIGH),
SPLICE_SITE_DONOR (EffectImpact.HIGH),
NON_SYNONYMOUS_CODING (EffectImpact.MODERATE),
UTR_5_DELETED (EffectImpact.MODERATE),
UTR_3_DELETED (EffectImpact.MODERATE),
CODON_INSERTION (EffectImpact.MODERATE),
CODON_CHANGE_PLUS_CODON_INSERTION (EffectImpact.MODERATE),
CODON_DELETION (EffectImpact.MODERATE),
CODON_CHANGE_PLUS_CODON_DELETION (EffectImpact.MODERATE),
NONE (EffectImpact.LOW),
CHROMOSOME (EffectImpact.LOW),
INTERGENIC (EffectImpact.LOW),
UPSTREAM (EffectImpact.LOW),
UTR_5_PRIME (EffectImpact.LOW),
SYNONYMOUS_START (EffectImpact.LOW),
NON_SYNONYMOUS_START (EffectImpact.LOW),
CDS (EffectImpact.LOW),
GENE (EffectImpact.LOW),
TRANSCRIPT (EffectImpact.LOW),
EXON (EffectImpact.LOW),
SYNONYMOUS_CODING (EffectImpact.LOW),
CODON_CHANGE (EffectImpact.LOW),
SYNONYMOUS_STOP (EffectImpact.LOW),
NON_SYNONYMOUS_STOP (EffectImpact.LOW),
INTRON (EffectImpact.LOW),
UTR_3_PRIME (EffectImpact.LOW),
DOWNSTREAM (EffectImpact.LOW),
INTRON_CONSERVED (EffectImpact.LOW),
INTERGENIC_CONSERVED (EffectImpact.LOW),
CUSTOM (EffectImpact.LOW);
private final EffectImpact impact;
EffectType ( EffectImpact impact ) {
this.impact = impact;
}
public EffectImpact getImpact() {
return impact;
}
}
public enum EffectImpact {
LOW (1),
MODERATE (2),
HIGH (3);
private final int severityRating;
EffectImpact ( int severityRating ) {
this.severityRating = severityRating;
}
public boolean isHigherImpactThan ( EffectImpact other ) {
return this.severityRating > other.severityRating;
}
}
// The kinds of variants supported by the SnpEff output format:
public enum ChangeType {
SNP,
MNP,
INS,
DEL
}
// Possible zygosities of SnpEff variants:
public enum Zygosity {
Hom,
Het
}
}

View File

@ -1,423 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.snpEff;
import org.broad.tribble.Feature;
import java.util.NoSuchElementException;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
/**
* Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output.
*
* Many fields are optional, and missing values are represented by nulls. You should always call the
* hasX() method before calling the corresponding getX() method. Required fields can never be null
* and do not have a hasX() method.
*
* @author David Roazen
*/
public class SnpEffFeature implements Feature {
private String contig; // REQUIRED FIELD
private long position; // REQUIRED FIELD
private String reference;
private String change;
private ChangeType changeType;
private Zygosity zygosity;
private Double quality;
private Long coverage;
private String warnings;
private String geneID;
private String geneName;
private String bioType;
private String transcriptID;
private String exonID;
private Integer exonRank;
private boolean isNonCodingGene; // REQUIRED FIELD
private EffectType effect; // REQUIRED FIELD
private String effectExtraInformation;
private String oldAndNewAA;
private String oldAndNewCodon;
private Integer codonNum;
private Integer cdsSize;
private String codonsAround;
private String aasAround;
private String customIntervalID;
public SnpEffFeature ( String contig,
long position,
String reference,
String change,
ChangeType changeType,
Zygosity zygosity,
Double quality,
Long coverage,
String warnings,
String geneID,
String geneName,
String bioType,
String transcriptID,
String exonID,
Integer exonRank,
boolean isNonCodingGene,
EffectType effect,
String effectExtraInformation,
String oldAndNewAA,
String oldAndNewCodon,
Integer codonNum,
Integer cdsSize,
String codonsAround,
String aasAround,
String customIntervalID ) {
if ( contig == null || effect == null ) {
throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields");
}
this.contig = contig;
this.position = position;
this.reference = reference;
this.change = change;
this.changeType = changeType;
this.zygosity = zygosity;
this.quality = quality;
this.coverage = coverage;
this.warnings = warnings;
this.geneID = geneID;
this.geneName = geneName;
this.bioType = bioType;
this.transcriptID = transcriptID;
this.exonID = exonID;
this.exonRank = exonRank;
this.isNonCodingGene = isNonCodingGene;
this.effect = effect;
this.effectExtraInformation = effectExtraInformation;
this.oldAndNewAA = oldAndNewAA;
this.oldAndNewCodon = oldAndNewCodon;
this.codonNum = codonNum;
this.cdsSize = cdsSize;
this.codonsAround = codonsAround;
this.aasAround = aasAround;
this.customIntervalID = customIntervalID;
}
public boolean isHigherImpactThan ( SnpEffFeature other ) {
// If one effect is in a non-coding gene and the other is not, the effect NOT in the
// non-coding gene has higher impact:
if ( ! isNonCodingGene() && other.isNonCodingGene() ) {
return true;
}
else if ( isNonCodingGene() && ! other.isNonCodingGene() ) {
return false;
}
// Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts
// of the effects themselves as defined in the SnpEffConstants class:
return getEffectImpact().isHigherImpactThan(other.getEffectImpact());
}
public String getChr() {
return contig;
}
public int getStart() {
return (int)position;
}
public int getEnd() {
return (int)position;
}
public boolean hasReference() {
return reference != null;
}
public String getReference() {
if ( reference == null ) throw new NoSuchElementException("This feature has no reference field");
return reference;
}
public boolean hasChange() {
return change != null;
}
public String getChange() {
if ( change == null ) throw new NoSuchElementException("This feature has no change field");
return change;
}
public boolean hasChangeType() {
return changeType != null;
}
public ChangeType getChangeType() {
if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field");
return changeType;
}
public boolean hasZygosity() {
return zygosity != null;
}
public Zygosity getZygosity() {
if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field");
return zygosity;
}
public boolean hasQuality() {
return quality != null;
}
public Double getQuality() {
if ( quality == null ) throw new NoSuchElementException("This feature has no quality field");
return quality;
}
public boolean hasCoverage() {
return coverage != null;
}
public Long getCoverage() {
if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field");
return coverage;
}
public boolean hasWarnings() {
return warnings != null;
}
public String getWarnings() {
if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field");
return warnings;
}
public boolean hasGeneID() {
return geneID != null;
}
public String getGeneID() {
if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field");
return geneID;
}
public boolean hasGeneName() {
return geneName != null;
}
public String getGeneName() {
if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field");
return geneName;
}
public boolean hasBioType() {
return bioType != null;
}
public String getBioType() {
if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field");
return bioType;
}
public boolean hasTranscriptID() {
return transcriptID != null;
}
public String getTranscriptID() {
if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field");
return transcriptID;
}
public boolean hasExonID() {
return exonID != null;
}
public String getExonID() {
if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field");
return exonID;
}
public boolean hasExonRank() {
return exonRank != null;
}
public Integer getExonRank() {
if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field");
return exonRank;
}
public boolean isNonCodingGene() {
return isNonCodingGene;
}
public EffectType getEffect() {
return effect;
}
public EffectImpact getEffectImpact() {
return effect.getImpact();
}
public boolean hasEffectExtraInformation() {
return effectExtraInformation != null;
}
public String getEffectExtraInformation() {
if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field");
return effectExtraInformation;
}
public boolean hasOldAndNewAA() {
return oldAndNewAA != null;
}
public String getOldAndNewAA() {
if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field");
return oldAndNewAA;
}
public boolean hasOldAndNewCodon() {
return oldAndNewCodon != null;
}
public String getOldAndNewCodon() {
if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field");
return oldAndNewCodon;
}
public boolean hasCodonNum() {
return codonNum != null;
}
public Integer getCodonNum() {
if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field");
return codonNum;
}
public boolean hasCdsSize() {
return cdsSize != null;
}
public Integer getCdsSize() {
if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field");
return cdsSize;
}
public boolean hasCodonsAround() {
return codonsAround != null;
}
public String getCodonsAround() {
if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field");
return codonsAround;
}
public boolean hadAasAround() {
return aasAround != null;
}
public String getAasAround() {
if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field");
return aasAround;
}
public boolean hasCustomIntervalID() {
return customIntervalID != null;
}
public String getCustomIntervalID() {
if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field");
return customIntervalID;
}
public boolean equals ( Object o ) {
if ( o == null || ! (o instanceof SnpEffFeature) ) {
return false;
}
SnpEffFeature other = (SnpEffFeature)o;
return contig.equals(other.contig) &&
position == other.position &&
(reference == null ? other.reference == null : reference.equals(other.reference)) &&
(change == null ? other.change == null : change.equals(other.change)) &&
changeType == other.changeType &&
zygosity == other.zygosity &&
(quality == null ? other.quality == null : quality.equals(other.quality)) &&
(coverage == null ? other.coverage == null : coverage.equals(other.coverage)) &&
(warnings == null ? other.warnings == null : warnings.equals(other.warnings)) &&
(geneID == null ? other.geneID == null : geneID.equals(other.geneID)) &&
(geneName == null ? other.geneName == null : geneName.equals(other.geneName)) &&
(bioType == null ? other.bioType == null : bioType.equals(other.bioType)) &&
(transcriptID == null ? other.transcriptID == null : transcriptID.equals(other.transcriptID)) &&
(exonID == null ? other.exonID == null : exonID.equals(other.exonID)) &&
(exonRank == null ? other.exonRank == null : exonRank.equals(other.exonRank)) &&
isNonCodingGene == other.isNonCodingGene &&
effect == other.effect &&
(effectExtraInformation == null ? other.effectExtraInformation == null : effectExtraInformation.equals(other.effectExtraInformation)) &&
(oldAndNewAA == null ? other.oldAndNewAA == null : oldAndNewAA.equals(other.oldAndNewAA)) &&
(oldAndNewCodon == null ? other.oldAndNewCodon == null : oldAndNewCodon.equals(other.oldAndNewCodon)) &&
(codonNum == null ? other.codonNum == null : codonNum.equals(other.codonNum)) &&
(cdsSize == null ? other.cdsSize == null : cdsSize.equals(other.cdsSize)) &&
(codonsAround == null ? other.codonsAround == null : codonsAround.equals(other.codonsAround)) &&
(aasAround == null ? other.aasAround == null : aasAround.equals(other.aasAround)) &&
(customIntervalID == null ? other.customIntervalID == null : customIntervalID.equals(other.customIntervalID));
}
public String toString() {
return "[Contig: " + contig +
" Position: " + position +
" Reference: " + reference +
" Change: " + change +
" Change Type: " + changeType +
" Zygosity: " + zygosity +
" Quality: " + quality +
" Coverage: " + coverage +
" Warnings: " + warnings +
" Gene ID: " + geneID +
" Gene Name: " + geneName +
" Bio Type: " + bioType +
" Transcript ID: " + transcriptID +
" Exon ID: " + exonID +
" Exon Rank: " + exonRank +
" Non-Coding Gene: " + isNonCodingGene +
" Effect: " + effect +
" Effect Extra Information: " + effectExtraInformation +
" Old/New AA: " + oldAndNewAA +
" Old/New Codon: " + oldAndNewCodon +
" Codon Num: " + codonNum +
" CDS Size: " + cdsSize +
" Codons Around: " + codonsAround +
" AAs Around: " + aasAround +
" Custom Interval ID: " + customIntervalID +
"]";
}
}

View File

@ -227,7 +227,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
throw new UserException.MalformedVCF(message, lineNo);
}
private static void generateException(String message, int lineNo) {
protected static void generateException(String message, int lineNo) {
throw new UserException.MalformedVCF(message, lineNo);
}

View File

@ -0,0 +1,144 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.vcf;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.Tribble;
import org.broad.tribble.TribbleException;
import org.broad.tribble.index.DynamicIndexCreator;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broad.tribble.util.PositionalStream;
import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
/**
* this class writes VCF files
*/
public abstract class IndexingVCFWriter implements VCFWriter {
final private String name;
private final SAMSequenceDictionary refDict;
private OutputStream outputStream;
private PositionalStream positionalStream = null;
private DynamicIndexCreator indexer = null;
private LittleEndianOutputStream idxStream = null;
@Requires({"name != null",
"! ( location == null && output == null )",
"! ( enableOnTheFlyIndexing && location == null )"})
protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) {
outputStream = output;
this.name = name;
this.refDict = refDict;
if ( enableOnTheFlyIndexing ) {
try {
idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location)));
//System.out.println("Creating index on the fly for " + location);
indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
indexer.initialize(location, indexer.defaultBinSize());
positionalStream = new PositionalStream(output);
outputStream = positionalStream;
} catch ( IOException ex ) {
// No matter what we keep going, since we don't care if we can't create the index file
idxStream = null;
indexer = null;
positionalStream = null;
}
}
}
@Ensures("result != null")
public OutputStream getOutputStream() {
return outputStream;
}
@Ensures("result != null")
public String getStreamName() {
return name;
}
public abstract void writeHeader(VCFHeader header);
/**
* attempt to close the VCF file
*/
public void close() {
// try to close the index stream (keep it separate to help debugging efforts)
if ( indexer != null ) {
try {
Index index = indexer.finalizeIndex(positionalStream.getPosition());
IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict);
index.write(idxStream);
idxStream.close();
} catch (IOException e) {
throw new ReviewedStingException("Unable to close index for " + getStreamName(), e);
}
}
}
/**
* add a record to the file
*
* @param vc the Variant Context object
*/
public void add(VariantContext vc) {
// if we are doing on the fly indexing, add the record ***before*** we write any bytes
if ( indexer != null )
indexer.addFeature(vc, positionalStream.getPosition());
}
/**
* Returns a reasonable "name" for this writer, to display to the user if something goes wrong
*
* @param location
* @param stream
* @return
*/
protected static final String writerName(final File location, final OutputStream stream) {
return location == null ? stream.toString() : location.getAbsolutePath();
}
/**
* Returns a output stream writing to location, or throws a UserException if this fails
* @param location
* @return
*/
protected static OutputStream openOutputStream(final File location) {
try {
return new FileOutputStream(location);
} catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e);
}
}
}

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.Tribble;
import org.broad.tribble.TribbleException;
import org.broad.tribble.index.DynamicIndexCreator;
@ -44,46 +45,30 @@ import java.util.*;
/**
* this class writes VCF files
*/
public class StandardVCFWriter implements VCFWriter {
public class StandardVCFWriter extends IndexingVCFWriter {
// the print stream we're writing to
final protected BufferedWriter mWriter;
// should we write genotypes or just sites?
final protected boolean doNotWriteGenotypes;
// the VCF header we're storing
protected VCFHeader mHeader = null;
// the print stream we're writing to
protected BufferedWriter mWriter;
protected PositionalStream positionalStream = null;
// were filters applied?
protected boolean filtersWereAppliedToContext = false;
// should we write genotypes or just sites?
protected boolean doNotWriteGenotypes = false;
protected DynamicIndexCreator indexer = null;
protected File indexFile = null;
LittleEndianOutputStream idxStream = null;
File location = null;
/**
* create a VCF writer, given a file to write to
*
* @param location the file location to write to
*/
public StandardVCFWriter(File location) {
this(location, openOutputStream(location), true, false);
public StandardVCFWriter(final File location, final SAMSequenceDictionary refDict) {
this(location, openOutputStream(location), refDict, true, false);
}
public StandardVCFWriter(File location, boolean enableOnTheFlyIndexing) {
this(location, openOutputStream(location), enableOnTheFlyIndexing, false);
}
/**
* create a VCF writer, given a stream to write to
*
* @param output the file location to write to
*/
public StandardVCFWriter(OutputStream output) {
this(output, false);
public StandardVCFWriter(File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing) {
this(location, openOutputStream(location), refDict, enableOnTheFlyIndexing, false);
}
/**
@ -92,33 +77,23 @@ public class StandardVCFWriter implements VCFWriter {
* @param output the file location to write to
* @param doNotWriteGenotypes do not write genotypes
*/
public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) {
mWriter = new BufferedWriter(new OutputStreamWriter(output));
public StandardVCFWriter(final OutputStream output, final SAMSequenceDictionary refDict, final boolean doNotWriteGenotypes) {
this(null, output, refDict, false, doNotWriteGenotypes);
}
public StandardVCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
this.doNotWriteGenotypes = doNotWriteGenotypes;
}
public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
this.location = location;
if ( enableOnTheFlyIndexing ) {
indexFile = Tribble.indexFile(location);
try {
idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
//System.out.println("Creating index on the fly for " + location);
indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
indexer.initialize(location, indexer.defaultBinSize());
positionalStream = new PositionalStream(output);
output = positionalStream;
} catch ( IOException ex ) {
// No matter what we keep going, since we don't care if we can't create the index file
}
}
//mWriter = new BufferedWriter(new OutputStreamWriter(new PositionalStream(output)));
mWriter = new BufferedWriter(new OutputStreamWriter(output));
this.doNotWriteGenotypes = doNotWriteGenotypes;
}
// --------------------------------------------------------------------------------
//
// VCFWriter interface functions
//
// --------------------------------------------------------------------------------
@Override
public void writeHeader(VCFHeader header) {
mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header;
@ -158,44 +133,24 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.flush(); // necessary so that writing to an output stream will work
}
catch (IOException e) {
throw new TribbleException("IOException writing the VCF header to " + locationString(), e);
throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e);
}
}
private String locationString() {
return location == null ? mWriter.toString() : location.getAbsolutePath();
}
/**
* attempt to close the VCF file
*/
@Override
public void close() {
// try to close the vcf stream
try {
mWriter.flush();
mWriter.close();
} catch (IOException e) {
throw new TribbleException("Unable to close " + locationString() + " because of " + e.getMessage());
throw new ReviewedStingException("Unable to close " + getStreamName(), e);
}
// try to close the index stream (keep it separate to help debugging efforts)
if ( indexer != null ) {
try {
Index index = indexer.finalizeIndex(positionalStream.getPosition());
index.write(idxStream);
idxStream.close();
} catch (IOException e) {
throw new TribbleException("Unable to close index for " + locationString() + " because of " + e.getMessage());
}
}
}
protected static OutputStream openOutputStream(File location) {
try {
return new FileOutputStream(location);
} catch (FileNotFoundException e) {
throw new TribbleException("Unable to create VCF file at location: " + location);
}
super.close();
}
/**
@ -203,28 +158,17 @@ public class StandardVCFWriter implements VCFWriter {
*
* @param vc the Variant Context object
*/
@Override
public void add(VariantContext vc) {
add(vc, false);
}
/**
* add a record to the file
*
* @param vc the Variant Context object
* @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD)
*/
public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) {
if ( mHeader == null )
throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString());
throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName());
if ( doNotWriteGenotypes )
vc = VariantContext.modifyGenotypes(vc, null);
try {
vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles);
// if we are doing on the fly indexing, add the record ***before*** we write any bytes
if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition());
vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false);
super.add(vc);
Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
@ -275,7 +219,7 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
// FILTER
String filters = vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (filtersWereAppliedToContext || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
String filters = getFilterString(vc, filtersWereAppliedToContext);
mWriter.write(filters);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
@ -317,9 +261,22 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.write("\n");
mWriter.flush(); // necessary so that writing to an output stream will work
} catch (IOException e) {
throw new RuntimeException("Unable to write the VCF object to " + locationString());
throw new RuntimeException("Unable to write the VCF object to " + getStreamName());
}
}
// --------------------------------------------------------------------------------
//
// implementation functions
//
// --------------------------------------------------------------------------------
public static final String getFilterString(final VariantContext vc) {
return getFilterString(vc, false);
}
public static final String getFilterString(final VariantContext vc, boolean forcePASS) {
return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
}
private String getQualValue(double qual) {
@ -462,7 +419,7 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.write(encoding);
}
private static String formatVCFField(Object val) {
public static String formatVCFField(Object val) {
String result;
if ( val == null )
result = VCFConstants.MISSING_VALUE_v4;
@ -524,12 +481,11 @@ public class StandardVCFWriter implements VCFWriter {
}
public static int countOccurrences(char c, String s) {
private static int countOccurrences(char c, String s) {
int count = 0;
for (int i = 0; i < s.length(); i++) {
count += s.charAt(i) == c ? 1 : 0;
}
return count;
}
}

View File

@ -105,34 +105,37 @@ public class VCFCodec extends AbstractVCFCodec {
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
*/
protected Set<String> parseFilters(String filterString) {
return parseFilters(filterHash, lineNo, filterString);
}
public static Set<String> parseFilters(final Map<String, LinkedHashSet<String>> cache, final int lineNo, final String filterString) {
// null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null;
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
return fFields;
return Collections.emptySet();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4");
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status");
generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
// do we have the filter string cached?
if ( filterHash.containsKey(filterString) )
return filterHash.get(filterString);
if ( cache != null && cache.containsKey(filterString) )
return Collections.unmodifiableSet(cache.get(filterString));
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
fFields.add(filterString);
else
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
filterHash.put(filterString, fFields);
fFields = fFields;
if ( cache != null ) cache.put(filterString, fFields);
return fFields;
return Collections.unmodifiableSet(fFields);
}

View File

@ -24,6 +24,7 @@ public class VCFHeader {
private final Set<VCFHeaderLine> mMetaData;
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();
// the list of auxillary tags
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
@ -110,6 +111,9 @@ public class VCFHeader {
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
mFormatMetaData.put(formatLine.getName(), formatLine);
}
else {
mOtherMetaData.put(line.getKey(), line);
}
}
}
@ -185,6 +189,14 @@ public class VCFHeader {
public VCFFormatHeaderLine getFormatHeaderLine(String key) {
return mFormatMetaData.get(key);
}
/**
* @param key the header key name
* @return the meta data line, or null if there is none
*/
public VCFHeaderLine getOtherHeaderLine(String key) {
return mOtherMetaData.get(key);
}
}

View File

@ -0,0 +1,256 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.*;
/**
* GATK binary VCF record
*
* @author Your Name
* @since Date created
*/
public class GCF {
private final static int RECORD_TERMINATOR = 123456789;
private int chromOffset;
private int start, stop;
private String id;
private List<Allele> alleleMap;
private int alleleOffsets[];
private float qual;
private byte refPad;
private String info;
private int filterOffset;
private List<GCFGenotype> genotypes = Collections.emptyList();
public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boolean skipGenotypes) {
chromOffset = GCFHeaderBuilder.encodeString(vc.getChr());
start = vc.getStart();
stop = vc.getEnd();
refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0;
id = vc.getID();
// encode alleles
alleleMap = new ArrayList<Allele>(vc.getNAlleles());
alleleOffsets = new int[vc.getNAlleles()];
alleleMap.add(vc.getReference());
alleleOffsets[0] = GCFHeaderBuilder.encodeAllele(vc.getReference());
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
alleleMap.add(vc.getAlternateAllele(i));
alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i));
}
qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual());
info = infoFieldString(vc, GCFHeaderBuilder);
filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc));
if ( ! skipGenotypes ) {
genotypes = encodeGenotypes(GCFHeaderBuilder, vc);
}
}
public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException {
chromOffset = inputStream.readInt();
// have we reached the footer?
if ( chromOffset == GCFHeader.FOOTER_START_MARKER )
throw new EOFException();
start = inputStream.readInt();
stop = inputStream.readInt();
id = inputStream.readUTF();
refPad = inputStream.readByte();
alleleOffsets = readIntArray(inputStream);
qual = inputStream.readFloat();
info = inputStream.readUTF();
filterOffset = inputStream.readInt();
int nGenotypes = inputStream.readInt();
int sizeOfGenotypes = inputStream.readInt();
if ( skipGenotypes ) {
genotypes = Collections.emptyList();
inputStream.skipBytes(sizeOfGenotypes);
} else {
genotypes = new ArrayList<GCFGenotype>(nGenotypes);
for ( int i = 0; i < nGenotypes; i++ )
genotypes.add(new GCFGenotype(this, inputStream));
}
int recordDone = inputStream.readInt();
if ( recordDone != RECORD_TERMINATOR )
throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key");
}
public int write(DataOutputStream outputStream) throws IOException {
int startSize = outputStream.size();
outputStream.writeInt(chromOffset);
outputStream.writeInt(start);
outputStream.writeInt(stop);
outputStream.writeUTF(id);
outputStream.writeByte(refPad);
writeIntArray(alleleOffsets, outputStream, true);
outputStream.writeFloat(qual);
outputStream.writeUTF(info);
outputStream.writeInt(filterOffset);
int nGenotypes = genotypes.size();
int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes;
outputStream.writeInt(nGenotypes);
outputStream.writeInt(expectedSizeOfGenotypes);
int obsSizeOfGenotypes = 0;
for ( GCFGenotype g : genotypes )
obsSizeOfGenotypes += g.write(outputStream);
if ( obsSizeOfGenotypes != expectedSizeOfGenotypes )
throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes);
outputStream.writeInt(RECORD_TERMINATOR);
return outputStream.size() - startSize;
}
public VariantContext decode(final String source, final GCFHeader header) {
final String contig = header.getString(chromOffset);
alleleMap = header.getAlleles(alleleOffsets);
double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual);
Set<String> filters = header.getFilters(filterOffset);
Map<String, Object> attributes = new HashMap<String, Object>();
attributes.put("INFO", info);
Byte refPadByte = refPad == 0 ? null : refPad;
Map<String, Genotype> genotypes = decodeGenotypes(header);
return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte);
}
private Map<String, Genotype> decodeGenotypes(final GCFHeader header) {
if ( genotypes.isEmpty() )
return VariantContext.NO_GENOTYPES;
else {
Map<String, Genotype> map = new TreeMap<String, Genotype>();
for ( int i = 0; i < genotypes.size(); i++ ) {
final String sampleName = header.getSample(i);
final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap);
map.put(sampleName, g);
}
return map;
}
}
private List<GCFGenotype> encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc) {
int nGenotypes = vc.getNSamples();
if ( nGenotypes > 0 ) {
List<GCFGenotype> genotypes = new ArrayList<GCFGenotype>(nGenotypes);
for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null);
for ( Genotype g : vc.getGenotypes().values() ) {
int i = GCFHeaderBuilder.encodeSample(g.getSampleName());
genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g));
}
return genotypes;
} else {
return Collections.emptyList();
}
}
public int getNAlleles() { return alleleOffsets.length; }
private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) {
StringBuilder s = new StringBuilder();
boolean first = true;
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
String key = field.getKey();
if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) )
continue;
int stringIndex = GCFHeaderBuilder.encodeString(key);
String outputValue = StandardVCFWriter.formatVCFField(field.getValue());
if ( outputValue != null ) {
if ( ! first ) s.append(";");
s.append(stringIndex).append("=").append(outputValue);
first = false;
}
}
return s.toString();
}
protected final static int BUFFER_SIZE = 1048576; // 2**20
public static DataInputStream createDataInputStream(final InputStream stream) {
return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE));
}
public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException {
return new FileInputStream(file);
}
protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException {
return readIntArray(inputStream, inputStream.readInt());
}
protected final static int[] readIntArray(final DataInputStream inputStream, int size) throws IOException {
int[] array = new int[size];
for ( int i = 0; i < array.length; i++ )
array[i] = inputStream.readInt();
return array;
}
protected final static void writeIntArray(int[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
if ( writeSize ) outputStream.writeInt(array.length);
for ( int i : array )
outputStream.writeInt(i);
}
protected final static byte[] readByteArray(final DataInputStream inputStream) throws IOException {
return readByteArray(inputStream, inputStream.readInt());
}
protected final static byte[] readByteArray(final DataInputStream inputStream, int size) throws IOException {
byte[] array = new byte[size];
for ( int i = 0; i < array.length; i++ )
array[i] = inputStream.readByte();
return array;
}
protected final static void writeByteArray(byte[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
if ( writeSize ) outputStream.writeInt(array.length);
for ( byte i : array )
outputStream.writeByte(i);
}
protected final static byte qualToByte(double phredScaledQual) {
return (byte)Math.round(Math.min(phredScaledQual, 255));
}
}

View File

@ -0,0 +1,147 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.*;
/**
* GATK binary VCF record
*
* @author Your Name
* @since Date created
*/
public class GCFGenotype {
private byte gq;
private int gt;
private int dp;
private int ad[];
private byte[] pl;
// todo -- what to do about phasing? Perhaps we shouldn't support it
// todo -- is the FL field generic or just a flag? Should we even support per sample filtering?
public GCFGenotype(final GCFHeaderBuilder GCFHeaderBuilder, final List<Allele> allAlleles, Genotype genotype) {
gq = GCF.qualToByte(genotype.getPhredScaledQual());
gt = encodeAlleles(genotype.getAlleles(), allAlleles);
dp = genotype.getAttributeAsInt("DP", 0);
int nAlleles = allAlleles.size();
ad = new int[nAlleles];
int npls = nAllelesToNPls(nAlleles);
pl = new byte[npls];
}
private int nAllelesToNPls( int nAlleles ) {
return nAlleles*(nAlleles+1) / 2;
}
public GCFGenotype(GCF GCF, DataInputStream inputStream) throws IOException {
int gqInt = inputStream.readUnsignedByte();
gq = (byte)gqInt;
gt = inputStream.readInt();
dp = inputStream.readInt();
ad = GCF.readIntArray(inputStream, GCF.getNAlleles());
pl = GCF.readByteArray(inputStream, nAllelesToNPls(GCF.getNAlleles()));
}
// 2 alleles => 1 + 8 + 8 + 3 => 20
protected int sizeInBytes() {
return 1 // gq
+ 4 * 2 // gt + dp
+ 4 * ad.length // ad
+ 1 * pl.length; // pl
}
public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List<Allele> alleleIndex) {
final List<Allele> alleles = decodeAlleles(gt, alleleIndex);
final double negLog10PError = gq / 10.0;
final Set<String> filters = Collections.emptySet();
final Map<String, Object> attributes = new HashMap<String, Object>();
attributes.put("DP", dp);
attributes.put("AD", ad);
attributes.put("PL", pl);
return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false);
}
private static int encodeAlleles(List<Allele> gtList, List<Allele> allAlleles) {
final int nAlleles = gtList.size();
if ( nAlleles > 4 )
throw new IllegalArgumentException("encodeAlleles doesn't support more than 4 alt alleles, but I saw " + gtList);
int gtInt = 0;
for ( int i = 0; i < nAlleles ; i++ ) {
final int bitOffset = i * 8;
final int allelei = getAlleleIndex(gtList.get(i), allAlleles);
final int gti = (allelei + 1) << bitOffset;
gtInt = gtInt | gti;
}
return gtInt;
}
private static int getAlleleIndex(Allele q, List<Allele> allAlleles) {
if ( q.isNoCall() )
return 254;
for ( int i = 0; i < allAlleles.size(); i++ )
if ( q.equals(allAlleles.get(i)) )
return i;
throw new IllegalStateException("getAlleleIndex passed allele not in map! allele " + q + " allAlleles " + allAlleles);
}
private static List<Allele> decodeAlleles(int gtInt, List<Allele> alleleIndex) {
List<Allele> alleles = new ArrayList<Allele>(4);
for ( int i = 0; i < 32; i += 8 ) {
final int gi = (gtInt & (0x000000FF << i)) >> i;
if ( gi != 0 ) {
final int allelei = gi - 1;
alleles.add( allelei == 254 ? Allele.NO_CALL : alleleIndex.get(allelei) );
} else {
break;
}
}
return alleles;
}
public int write(DataOutputStream outputStream) throws IOException {
int startSize = outputStream.size();
outputStream.writeByte(gq);
outputStream.writeInt(gt);
outputStream.writeInt(dp);
GCF.writeIntArray(ad, outputStream, false);
GCF.writeByteArray(pl, outputStream, false);
return outputStream.size() - startSize;
}
}

View File

@ -0,0 +1,205 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.io.*;
import java.util.*;
/**
* [Short one sentence description of this walker]
* <p/>
* <p>
* [Functionality of this walker]
* </p>
* <p/>
* <h2>Input</h2>
* <p>
* [Input description]
* </p>
* <p/>
* <h2>Output</h2>
* <p>
* [Output description]
* </p>
* <p/>
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T $WalkerName
* </pre>
*
* @author Your Name
* @since Date created
*/
public class GCFHeader {
final protected static Logger logger = Logger.getLogger(GCFHeader.class);
public final static int GCF_VERSION = 1;
public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes();
public final static int FOOTER_START_MARKER = -1;
public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version
final int version;
long footerPosition;
final List<Allele> alleles;
final List<String> strings;
final List<String> samples;
final List<Set<String>> filters;
public GCFHeader(final Map<Allele, Integer> allelesIn, final Map<String, Integer> stringIn, final Map<String, Integer> samplesIn) {
version = GCF_VERSION;
footerPosition = 0;
this.alleles = linearize(allelesIn);
this.strings = linearize(stringIn);
this.samples = linearize(samplesIn);
this.filters = null; // not used with this constructor
}
public GCFHeader(FileInputStream fileInputStream) throws IOException {
DataInputStream inputStream = new DataInputStream(fileInputStream);
byte[] headerTest = new byte[GCF_FILE_START_MARKER.length];
inputStream.read(headerTest);
if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) {
throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest));
} else {
version = inputStream.readInt();
logger.info("Read GCF version " + version);
footerPosition = inputStream.readLong();
logger.info("Read footer position of " + footerPosition);
long lastPos = fileInputStream.getChannel().position();
logger.info(" Last position is " + lastPos);
// seek to the footer
fileInputStream.getChannel().position(footerPosition);
if ( inputStream.readInt() != FOOTER_START_MARKER )
throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker");
alleles = stringsToAlleles(readStrings(inputStream));
strings = readStrings(inputStream);
samples = readStrings(inputStream);
logger.info(String.format("Allele map of %d elements", alleles.size()));
logger.info(String.format("String map of %d elements", strings.size()));
logger.info(String.format("Sample map of %d elements", samples.size()));
filters = initializeFilterCache();
fileInputStream.getChannel().position(lastPos);
}
}
public static int writeHeader(final DataOutputStream outputStream) throws IOException {
int startBytes = outputStream.size();
outputStream.write(GCF_FILE_START_MARKER);
outputStream.writeInt(GCF_VERSION);
outputStream.writeLong(0);
return outputStream.size() - startBytes;
}
public int writeFooter(final DataOutputStream outputStream) throws IOException {
int startBytes = outputStream.size();
outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding
write(outputStream, allelesToStrings(alleles));
write(outputStream, strings);
write(outputStream, samples);
return outputStream.size() - startBytes;
}
private void write(DataOutputStream outputStream, List<String> l) throws IOException {
outputStream.writeInt(l.size());
for ( String elt : l ) outputStream.writeUTF(elt);
}
private List<String> allelesToStrings(List<Allele> alleles) {
List<String> strings = new ArrayList<String>(alleles.size());
for ( Allele allele : alleles ) strings.add(allele.toString());
return strings;
}
private List<Set<String>> initializeFilterCache() {
// required to allow offset -> set lookup
List<Set<String>> l = new ArrayList<Set<String>>(strings.size());
for ( int i = 0; i < strings.size(); i++ ) l.add(null);
return l;
}
private static List<Allele> stringsToAlleles(final List<String> strings) {
final List<Allele> alleles = new ArrayList<Allele>(strings.size());
for ( String string : strings ) {
boolean isRef = string.endsWith("*");
if ( isRef ) string = string.substring(0, string.length() - 1);
alleles.add(Allele.create(string, isRef));
}
return alleles;
}
private static List<String> readStrings(final DataInputStream inputStream) throws IOException {
final int nStrings = inputStream.readInt();
final List<String> strings = new ArrayList<String>(nStrings);
for ( int i = 0; i < nStrings; i++ ) {
strings.add(inputStream.readUTF());
}
return strings;
}
private static <T> List<T> linearize(final Map<T, Integer> map) {
final ArrayList<T> l = new ArrayList<T>(map.size());
for ( int i = 0; i < map.size(); i++ ) l.add(null);
for ( final Map.Entry<T, Integer> elt : map.entrySet() )
l.set(elt.getValue(), elt.getKey());
return l;
}
public String getSample(final int offset) { return samples.get(offset); }
public String getString(final int offset) { return strings.get(offset); }
public Allele getAllele(final int offset) { return alleles.get(offset); }
public List<Allele> getAlleles(final int[] offsets) {
final List<Allele> alleles = new ArrayList<Allele>(offsets.length);
for ( int i : offsets ) alleles.add(getAllele(i));
return alleles;
}
public Set<String> getFilters(final int offset) {
Set<String> cached = filters.get(offset);
if ( cached != null )
return cached;
else {
final String filterString = getString(offset);
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null; // UNFILTERED records are represented by null
else {
Set<String> set = VCFCodec.parseFilters(null, -1, filterString);
filters.set(offset, set); // remember the result
return set;
}
}
}
}

View File

@ -0,0 +1,80 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.HashMap;
import java.util.Map;
/**
* [Short one sentence description of this walker]
* <p/>
* <p>
* [Functionality of this walker]
* </p>
* <p/>
* <h2>Input</h2>
* <p>
* [Input description]
* </p>
* <p/>
* <h2>Output</h2>
* <p>
* [Output description]
* </p>
* <p/>
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T $WalkerName
* </pre>
*
* @author Your Name
* @since Date created
*/
public class GCFHeaderBuilder {
Map<Allele, Integer> alleles = new HashMap<Allele, Integer>();
Map<String, Integer> strings = new HashMap<String, Integer>();
Map<String, Integer> samples = new HashMap<String, Integer>();
public GCFHeader createHeader() {
return new GCFHeader(alleles, strings, samples);
}
public int encodeString(final String chr) { return encode(strings, chr); }
public int encodeAllele(final Allele allele) { return encode(alleles, allele); }
public int encodeSample(final String sampleName) { return encode(samples, sampleName); }
private <T> int encode(Map<T, Integer> map, T key) {
Integer v = map.get(key);
if ( v == null ) {
v = map.size();
map.put(key, v);
}
return v;
}
}

View File

@ -0,0 +1,123 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
/**
* GCFWriter implementing the VCFWriter interface
* @author Your Name
* @since Date created
*/
public class GCFWriter extends IndexingVCFWriter {
final boolean skipGenotypes;
final FileOutputStream fileOutputStream;
final DataOutputStream dataOutputStream;
final GCFHeaderBuilder gcfHeaderBuilder;
int nbytes = 0;
VCFHeader header = null;
File location;
// --------------------------------------------------------------------------------
//
// Constructors
//
// --------------------------------------------------------------------------------
public GCFWriter(final File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
super(writerName(location, null), location, null, refDict, enableOnTheFlyIndexing);
this.location = location;
this.skipGenotypes = doNotWriteGenotypes;
// write the output
try {
fileOutputStream = new FileOutputStream(location);
dataOutputStream = createDataOutputStream(fileOutputStream);
gcfHeaderBuilder = new GCFHeaderBuilder();
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(location, e);
}
}
// --------------------------------------------------------------------------------
//
// VCFWriter interface functions
//
// --------------------------------------------------------------------------------
@Override
public void writeHeader(VCFHeader header) {
this.header = header;
try {
nbytes += GCFHeader.writeHeader(dataOutputStream);
} catch ( IOException e ) {
throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e);
}
}
@Override
public void add(VariantContext vc) {
super.add(vc);
GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes);
try {
nbytes += gcf.write(dataOutputStream);
} catch ( IOException e ) {
throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e);
}
}
@Override
public void close() {
// todo -- write out VCF header lines
GCFHeader gcfHeader = gcfHeaderBuilder.createHeader();
try {
long headerPosition = nbytes;
nbytes += gcfHeader.writeFooter(dataOutputStream);
dataOutputStream.close();
//System.out.println("Writing forward reference to " + headerPosition);
RandomAccessFile raFile = new RandomAccessFile(location, "rw");
raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET);
raFile.writeLong(headerPosition);
raFile.close();
} catch ( IOException e ) {
throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e);
}
super.close();
}
private static final DataOutputStream createDataOutputStream(final OutputStream stream) {
return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE));
}
}

View File

@ -334,24 +334,44 @@ public class IntervalUtils {
}
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
* Splits an interval list into multiple sublists.
* @param locs The genome locs to split.
* @param splits The stop points for the genome locs returned by splitFixedIntervals.
* @param scatterParts The output interval lists to write to.
* @return A list of lists of genome locs, split according to splits
*/
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<Integer> splits, List<File> scatterParts) {
if (splits.size() != scatterParts.size())
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
int fileIndex = 0;
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
int locIndex = 1;
int start = 0;
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
for (Integer stop: splits) {
IntervalList intervalList = new IntervalList(fileHeader);
List<GenomeLoc> curList = new ArrayList<GenomeLoc>();
for (int i = start; i < stop; i++)
intervalList.add(toInterval(locs.get(i), locIndex++));
intervalList.write(scatterParts.get(fileIndex++));
curList.add(locs.get(i));
start = stop;
sublists.add(curList);
}
return sublists;
}
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
* @param splits Pre-divided genome locs returned by splitFixedIntervals.
* @param scatterParts The output interval lists to write to.
*/
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<List<GenomeLoc>> splits, List<File> scatterParts) {
if (splits.size() != scatterParts.size())
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
int fileIndex = 0;
int locIndex = 1;
for (final List<GenomeLoc> split : splits) {
IntervalList intervalList = new IntervalList(fileHeader);
for (final GenomeLoc loc : split)
intervalList.add(toInterval(loc, locIndex++));
intervalList.write(scatterParts.get(fileIndex++));
}
}
@ -361,17 +381,15 @@ public class IntervalUtils {
* @param numParts Number of parts to split the locs into.
* @return The stop points to split the genome locs.
*/
public static List<Integer> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
public static List<List<GenomeLoc>> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
if (locs.size() < numParts)
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
long locsSize = 0;
for (GenomeLoc loc: locs)
locsSize += loc.size();
List<Integer> splitPoints = new ArrayList<Integer>();
final long locsSize = intervalSize(locs);
final List<Integer> splitPoints = new ArrayList<Integer>();
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
Collections.sort(splitPoints);
splitPoints.add(locs.size());
return splitPoints;
return splitIntervalsToSubLists(locs, splitPoints);
}
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
@ -441,4 +459,11 @@ public class IntervalUtils {
return merged;
}
}
public static final long intervalSize(final List<GenomeLoc> locs) {
long size = 0;
for ( final GenomeLoc loc : locs )
size += loc.size();
return size;
}
}

View File

@ -118,31 +118,40 @@ public class ReadUtils {
/**
* This enum represents all the different ways in which a read can overlap an interval.
*
* NO_OVERLAP:
* NO_OVERLAP_CONTIG:
* read and interval are in different contigs.
*
* NO_OVERLAP_LEFT:
* the read does not overlap the interval.
*
* |----------------| (interval)
* <----------------> (read)
*
* NO_OVERLAP_RIGHT:
* the read does not overlap the interval.
*
* |----------------| (interval)
* <----------------> (read)
*
* LEFT_OVERLAP:
* OVERLAP_LEFT:
* the read starts before the beginning of the interval but ends inside of it
*
* |----------------| (interval)
* <----------------> (read)
*
* RIGHT_OVERLAP:
* OVERLAP_RIGHT:
* the read starts inside the interval but ends outside of it
*
* |----------------| (interval)
* <----------------> (read)
*
* FULL_OVERLAP:
* OVERLAP_LEFT_AND_RIGHT:
* the read starts before the interval and ends after the interval
*
* |-----------| (interval)
* <-------------------> (read)
*
* CONTAINED:
* OVERLAP_CONTAINED:
* the read starts and ends inside the interval
*
* |----------------| (interval)
@ -658,7 +667,7 @@ public class ReadUtils {
return ReadAndIntervalOverlap.OVERLAP_RIGHT;
}
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"})
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"})
public static int getRefCoordSoftUnclippedStart(SAMRecord read) {
int start = read.getUnclippedStart();
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
@ -670,9 +679,13 @@ public class ReadUtils {
return start;
}
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"})
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"})
public static int getRefCoordSoftUnclippedEnd(SAMRecord read) {
int stop = read.getUnclippedStart();
if (readIsEntirelyInsertion(read))
return stop;
int shift = 0;
CigarOperator lastOperator = null;
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
@ -686,6 +699,14 @@ public class ReadUtils {
return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ;
}
private static boolean readIsEntirelyInsertion(SAMRecord read) {
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
if (cigarElement.getOperator() != CigarOperator.INSERTION)
return false;
}
return true;
}
/**
* Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before
* the alignment start of the read.

View File

@ -293,17 +293,8 @@ public class Genotype {
return commonInfo.getAttribute(key, defaultValue);
}
public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); }
public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); }
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); }
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); }
public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); }
public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); }
public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); }
}

View File

@ -1,6 +1,8 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import java.util.*;
@ -204,27 +206,40 @@ public final class InferredGeneticContext {
return defaultValue;
}
// public AttributedObject getAttributes(Collection<Object> keys) {
// AttributedObject selected = new AttributedObject();
//
// for ( Object key : keys )
// selected.putAttribute(key, this.getAttribute(key));
//
// return selected;
// }
public String getAttributeAsString(String key, String defaultValue) {
Object x = getAttribute(key);
if ( x == null ) return defaultValue;
if ( x instanceof String ) return (String)x;
return String.valueOf(x); // throws an exception if this isn't a string
}
public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
public int getAttributeAsInt(String key, int defaultValue) {
Object x = getAttribute(key);
if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue;
if ( x instanceof Integer ) return (Integer)x;
return Integer.valueOf((String)x); // throws an exception if this isn't a string
}
public String getAttributeAsString(String key, String defaultValue) { return (String)getAttribute(key, defaultValue); }
public int getAttributeAsInt(String key, int defaultValue) { return (Integer)getAttribute(key, defaultValue); }
public double getAttributeAsDouble(String key, double defaultValue) { return (Double)getAttribute(key, defaultValue); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue){ return (Boolean)getAttribute(key, defaultValue); }
public double getAttributeAsDouble(String key, double defaultValue) {
Object x = getAttribute(key);
if ( x == null ) return defaultValue;
if ( x instanceof Double ) return (Double)x;
return Double.valueOf((String)x); // throws an exception if this isn't a string
}
public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); }
public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
public boolean getAttributeAsBoolean(String key, boolean defaultValue) {
Object x = getAttribute(key);
if ( x == null ) return defaultValue;
if ( x instanceof Boolean ) return (Boolean)x;
return Boolean.valueOf((String)x); // throws an exception if this isn't a string
}
// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); }
// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
}

View File

@ -666,21 +666,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati
return commonInfo.getAttribute(key, defaultValue);
}
public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); }
public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); }
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); }
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); }
public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); }
public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); }
public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); }
// ---------------------------------------------------------------------------------------------------------
//
// Working with alleles
@ -817,6 +807,28 @@ public class VariantContext implements Feature { // to enable tribble intergrati
throw new IllegalArgumentException("Requested " + i + " alternative allele but there are only " + n + " alternative alleles " + this);
}
/**
* @param other VariantContext whose alternate alleles to compare against
* @return true if this VariantContext has the same alternate alleles as other,
* regardless of ordering. Otherwise returns false.
*/
public boolean hasSameAlternateAllelesAs ( VariantContext other ) {
Set<Allele> thisAlternateAlleles = getAlternateAlleles();
Set<Allele> otherAlternateAlleles = other.getAlternateAlleles();
if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) {
return false;
}
for ( Allele allele : thisAlternateAlleles ) {
if ( ! otherAlternateAlleles.contains(allele) ) {
return false;
}
}
return true;
}
// ---------------------------------------------------------------------------------------------------------
//
// Working with genotypes
@ -983,7 +995,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
* @return true if it's monomorphic
*/
public boolean isMonomorphic() {
return ! isVariant() || getChromosomeCount(getReference()) == getChromosomeCount();
return ! isVariant() || (hasGenotypes() && getHomRefCount() + getNoCallCount() == getNSamples());
}
/**
@ -1085,14 +1097,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati
}
public void validateReferenceBases(Allele reference, Byte paddedRefBase) {
// don't validate if we're an insertion or complex event
if ( !reference.isNull() && getReference().length() == 1 && !reference.basesMatch(getReference()) ) {
throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
// don't validate if we're a complex event
if ( !isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference()) ) {
throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
}
// we also need to validate the padding base for simple indels
if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) )
throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), (char)getReferenceBaseForIndel().byteValue(), (char)paddedRefBase.byteValue()));
if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) {
throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), (char)paddedRefBase.byteValue(), (char)getReferenceBaseForIndel().byteValue()));
}
}
public void validateRSIDs(Set<String> rsIDs) {

View File

@ -565,11 +565,11 @@ public class VariantContextUtils {
// special case DP (add it up) and ID (just preserve it)
//
if (vc.hasAttribute(VCFConstants.DEPTH_KEY))
depth += Integer.valueOf(vc.getAttributeAsString(VCFConstants.DEPTH_KEY));
depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
if (rsID == null && vc.hasID())
rsID = vc.getID();
if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY);
String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null);
// lets see if the string contains a , separator
if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) {
List<String> alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
@ -663,6 +663,18 @@ public class VariantContextUtils {
return merged;
}
public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) {
// if all alleles of vc1 are a contained in alleles of vc2, return true
if (!vc1.getReference().equals(vc2.getReference()))
return false;
for (Allele a :vc1.getAlternateAlleles()) {
if (!vc2.getAlternateAlleles().contains(a))
return false;
}
return true;
}
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
// see if we need to trim common reference base from all alleles
boolean trimVC;
@ -1147,9 +1159,7 @@ public class VariantContextUtils {
for (String orAttrib : MERGE_OR_ATTRIBS) {
boolean attribVal = false;
for (VariantContext vc : vcList) {
Boolean val = vc.getAttributeAsBooleanNoException(orAttrib);
if (val != null)
attribVal = (attribVal || val);
attribVal = vc.getAttributeAsBoolean(orAttrib, false);
if (attribVal) // already true, so no reason to continue:
break;
}
@ -1159,7 +1169,7 @@ public class VariantContextUtils {
// Merge ID fields:
String iDVal = null;
for (VariantContext vc : vcList) {
String val = vc.getAttributeAsStringNoException(VariantContext.ID_KEY);
String val = vc.getAttributeAsString(VariantContext.ID_KEY, null);
if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) {
if (iDVal == null)
iDVal = val;
@ -1239,8 +1249,10 @@ public class VariantContextUtils {
public PhaseAndQuality(Genotype gt) {
this.isPhased = gt.isPhased();
if (this.isPhased)
this.PQ = gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY);
if (this.isPhased) {
this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
if ( this.PQ == -1 ) this.PQ = null;
}
}
}

View File

@ -75,7 +75,7 @@ public class WalkerTest extends BaseTest {
Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec());
Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath());
if ( ! indexFromOutputFile.equalsIgnoreTimestamp(dynamicIndex) ) {
if ( ! indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) {
Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n",
indexFromOutputFile.getProperties(),
dynamicIndex.getProperties()));

View File

@ -29,7 +29,6 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.Index;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -45,7 +44,6 @@ import org.testng.annotations.Test;
import java.io.*;
import java.nio.channels.FileChannel;
import java.util.Map;
/**
@ -164,7 +162,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
try {
Index idx = builder.loadIndex(vcfFile, new VCFCodec());
// catch any exception; this call should pass correctly
SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx);
SAMSequenceDictionary dict = IndexDictionaryUtils.getSequenceDictionaryFromProperties(idx);
} catch (IOException e) {
e.printStackTrace();
Assert.fail("IO exception unexpected" + e.getMessage());

View File

@ -127,6 +127,7 @@ public class TraverseReadsUnitTest extends BaseTest {
Object accumulator = countReadWalker.reduceInit();
while (shardStrategy.hasNext()) {
traversalEngine.startTimersIfNecessary();
Shard shard = shardStrategy.next();
if (shard == null) {

Some files were not shown because too many files have changed in this diff Show More