Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/stable

This commit is contained in:
Matt Hanna 2011-11-16 09:57:13 -05:00
commit 6a5d5e7ac9
375 changed files with 13496 additions and 9533 deletions

20
.gitignore vendored 100644
View File

@ -0,0 +1,20 @@
/*.bam
/*.bai
/*.bed
*.idx
*~
/*.vcf
/*.txt
/*.csh
/.*
/*.pdf
/*.eval
*.ipr
*.iws
*.iml
.DS_Store
queueScatterGather
/foo*
/bar*
integrationtests/
public/testdata/onTheFlyOutputTest.vcf

156
build.xml
View File

@ -28,6 +28,8 @@
<property name="build.dir" value="build" />
<property name="dist.dir" value="dist" />
<property name="contract.dump.dir" value="dump" />
<property name="pipelinetest.dir" value="pipelinetests" />
<property name="lib.dir" value="lib" />
<property name="external.dir" value="external" />
<property name="public.dir" value="public" />
@ -35,18 +37,26 @@
<property name="java.public.source.dir" value="${public.dir}/java/src" />
<property name="java.private.source.dir" value="${private.dir}/java/src" />
<property name="java.classes" value="${build.dir}/java/classes" />
<property name="R.public.scripts.dir" value="${public.dir}/R/scripts" />
<property name="R.private.scripts.dir" value="${private.dir}/R/scripts" />
<property name="R.public.src.dir" value="${public.dir}/R/src" />
<!-- Legacy: Installing libraries back into the source directory
instead of the build or dist directory... intentionally avoids ant clean?? -->
<property name="R.library.dir" value="${public.dir}/R" />
<property name="R.tar.dir" value="${build.dir}/R/src" />
<property name="R.package.path" value="org/broadinstitute/sting/utils/R" />
<property name="resource.file" value="StingText.properties" />
<property name="resource.path" value="${java.classes}/StingText.properties" />
<property name="scala.public.source.dir" value="${public.dir}/scala/src" />
<property name="scala.private.source.dir" value="${private.dir}/scala/src" />
<property name="scala.private.source.dir" value="${private.dir}/scala/src" />
<property name="scala.classes" value="${build.dir}/scala/classes" />
<property name="queue-extensions.source.dir" value="${build.dir}/queue-extensions/src" />
<property name="javadoc.dir" value="javadoc" />
<property name="scaladoc.dir" value="scaladoc" />
<!-- Contracts for Java -->
<!-- By default, enabled only for test targets -->
<!-- To disable for test targets, run with -Duse.contracts=false -->
@ -60,7 +70,7 @@
<!-- do we want to halt on failure of a unit test? default to yes (Bamboo uses 'no') -->
<property name="halt" value="yes" />
<!-- should our unit test output go to a file or the screen?
false means it goes to the screen (default) true to file -->
<property name="usefile" value="false" />
@ -82,7 +92,7 @@
<patternset refid="java.source.pattern" />
</fileset>
<!-- terrible hack to get gatkdocs to see all files -->
<!-- terrible hack to get gatkdocs to see all files -->
<patternset id="all.java.source.pattern">
<include name="${java.public.source.dir}/**/*.java" />
<include name="${java.private.source.dir}/**/*.java" />
@ -113,7 +123,7 @@
<exclude name="testng*.jar" />
<exclude name="bcel*.jar" />
</patternset>
<path id="external.dependencies">
<fileset dir="${lib.dir}">
<patternset refid="dependency.mask" />
@ -154,16 +164,18 @@
<property name="ivy.jar.file" value="ivy-${ivy.install.version}.jar"/>
<property name="ivy.settings.dir" value="settings"/>
<property file="${ivy.settings.dir}/ivysettings.properties"/>
<mkdir dir="${lib.dir}"/>
<mkdir dir="${ivy.jar.dir}"/>
<!-- Comment out the following two lines to build the GATK without a network connection, assuming you have all of the libraries cached already -->
<get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/${ivy.jar.file}"
dest="${ivy.jar.dir}/${ivy.jar.file}"
usetimestamp="true"/>
<taskdef resource="org/apache/ivy/ant/antlib.xml"
uri="antlib:org.apache.ivy.ant"
classpath="${ivy.jar.dir}/${ivy.jar.file}"/>
<ivy:settings file="${ivy.settings.dir}/ivysettings.xml"/>
<property name="init.resolve.done" value="true"/>
</target>
@ -209,11 +221,11 @@
<equals arg1="${git.describe.exit.value}" arg2="0" />
</condition>
</target>
<target name="tagged.build.version" depends="git.describe" if="git.describe.succeeded">
<property name="build.version" value="${git.describe.output}" />
</target>
<target name="git.rev-parse" depends="git.describe" unless="git.describe.succeeded">
<exec executable="git" outputproperty="git.rev-parse.output" resultproperty="git.rev-parse.exit.value" failonerror="false">
<arg line="rev-parse HEAD" />
@ -222,11 +234,11 @@
<equals arg1="${git.rev-parse.exit.value}" arg2="0" />
</condition>
</target>
<target name="untagged.build.version" depends="git.rev-parse" if="git.rev-parse.succeeded">
<property name="build.version" value="${git.rev-parse.output}" />
</target>
<target name="generate.build.version" depends="tagged.build.version, untagged.build.version">
<!-- Set build.version to exported if no other value has been set -->
<property name="build.version" value="exported" />
@ -264,7 +276,7 @@
<echo message="Scala build : ${scala.target}"/>
<echo message="source revision : ${build.version}"/>
<echo message="build time : ${build.timestamp}" />
<condition property="include.private">
<equals arg1="${gatk.target}" arg2="private" casesensitive="false" />
</condition>
@ -310,13 +322,13 @@
<target name="gatk.compile.public.source" depends="init,resolve">
<javac fork="true" srcdir="${java.public.source.dir}" memoryMaximumSize="512m" destdir="${java.classes}" debug="true" debuglevel="lines,vars,source" classpathref="external.dependencies" tempdir="${java.io.tmpdir}">
<compilerarg value="-proc:none"/>
</javac>
</javac>
</target>
<target name="gatk.compile.private.source" depends="gatk.compile.public.source" if="include.private">
<javac fork="true" srcdir="${java.private.source.dir}" memoryMaximumSize="512m" destdir="${java.classes}" debug="true" debuglevel="lines,vars,source" classpathref="external.dependencies" tempdir="${java.io.tmpdir}">
<compilerarg value="-proc:none"/>
</javac>
</javac>
</target>
<target name="gatk.compile.external.source" depends="gatk.compile.public.source,gatk.compile.private.source">
@ -325,11 +337,11 @@
<property name="dist.dir" value="${external.dist.dir}" />
<property name="gatk.classpath" value="${external.gatk.classpath}" />
<fileset dir="${external.dir}" includes="*/build.xml" erroronmissingdir="false" />
</subant>
</subant>
</target>
<target name="gatk.compile.source"
depends="gatk.compile.public.source,gatk.compile.private.source,gatk.compile.external.source"
<target name="gatk.compile.source"
depends="gatk.compile.public.source,gatk.compile.private.source,gatk.compile.external.source"
description="compile the GATK source" />
<target name="gatk.contracts.public" depends="gatk.compile.source" if="include.contracts">
@ -339,9 +351,9 @@
<pathelement path="${java.classes}" />
</classpath>
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
<compilerarg value="-Acom.google.java.contract.dump=${contract.dump.dir}"/>
<compilerarg value="-proc:only"/>
</javac>
</javac>
</target>
<target name="check.contracts.private" depends="gatk.contracts.public">
@ -360,14 +372,14 @@
<pathelement path="${java.classes}" />
</classpath>
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
<compilerarg value="-Acom.google.java.contract.dump=${contract.dump.dir}"/>
<compilerarg value="-proc:only"/>
</javac>
</javac>
</target>
<target name="gatk.contracts" depends="gatk.contracts.public,gatk.contracts.private"
<target name="gatk.contracts" depends="gatk.contracts.public,gatk.contracts.private"
description="create GATK contracts" if="include.contracts" />
<target name="gatk.compile" depends="init,resolve,gatk.compile.source,gatk.contracts" />
<target name="init.queue-extensions.generate" depends="gatk.compile">
@ -411,9 +423,9 @@
<src path="${scala.public.source.dir}" />
<src path="${queue-extensions.source.dir}" />
<include name="**/*.scala"/>
</scalac>
</scalac>
</target>
<target name="check.scala.private" depends="scala.compile.public">
<condition property="include.scala.private">
<and>
@ -422,12 +434,12 @@
</and>
</condition>
</target>
<target name="scala.compile.private" depends="check.scala.private" if="include.scala.private">
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.classes}" classpathref="scala.dependencies" deprecation="yes" unchecked="yes">
<src path="${scala.private.source.dir}" />
<include name="**/*.scala"/>
</scalac>
</scalac>
</target>
<target name="scala.compile" depends="scala.compile.public,scala.compile.private" if="scala.include" description="compile Scala" />
@ -530,6 +542,11 @@
<target name="sting.compile" depends="gatk.compile, scala.compile" />
<target name="R.public.tar">
<mkdir dir="${R.tar.dir}/${R.package.path}" />
<tar compression="gzip" basedir="${R.public.src.dir}/${R.package.path}" includes="gsalib/**" destfile="${R.tar.dir}/${R.package.path}/gsalib.tar.gz" />
</target>
<target name="init.jar" depends="sting.compile,extracthelp">
<mkdir dir="${dist.dir}"/>
<copy todir="${dist.dir}">
@ -537,7 +554,7 @@
</copy>
</target>
<target name="sting-utils.jar" depends="gatk.compile, init.jar">
<target name="sting-utils.jar" depends="gatk.compile, init.jar, R.public.tar">
<jar jarfile="${dist.dir}/StingUtils.jar">
<fileset dir="${java.classes}">
<include name="**/utils/**/*.class"/>
@ -549,6 +566,15 @@
<fileset dir="${java.classes}" includes="**/sting/jna/**/*.class"/>
<fileset dir="${java.classes}" includes="net/sf/picard/**/*.class"/>
<fileset dir="${java.classes}" includes="net/sf/samtools/**/*.class"/>
<fileset dir="${R.tar.dir}">
<include name="**/${R.package.path}/**/*.tar.gz"/>
</fileset>
<fileset dir="${R.public.scripts.dir}">
<include name="**/utils/**/*.R"/>
</fileset>
<fileset dir="${R.private.scripts.dir}" erroronmissingdir="false">
<include name="**/utils/**/*.R"/>
</fileset>
<manifest>
<attribute name="Premain-Class" value="org.broadinstitute.sting.utils.instrumentation.Sizeof" />
</manifest>
@ -577,6 +603,14 @@
<include name="**/gatk/**/*.class" />
<include name="**/alignment/**/*.class"/>
</fileset>
<fileset dir="${R.public.scripts.dir}">
<include name="**/gatk/**/*.R"/>
<include name="**/alignment/**/*.R"/>
</fileset>
<fileset dir="${R.private.scripts.dir}" erroronmissingdir="false">
<include name="**/gatk/**/*.R"/>
<include name="**/alignment/**/*.R"/>
</fileset>
<manifest>
<attribute name="Main-Class" value="org.broadinstitute.sting.gatk.CommandLineGATK" />
</manifest>
@ -591,6 +625,14 @@
<include name="**/analyzecovariates/**/*.class" />
<include name="**/gatk/walkers/recalibration/*.class" />
</fileset>
<fileset dir="${R.public.scripts.dir}">
<include name="**/analyzecovariates/**/*.R"/>
<include name="**/gatk/walkers/recalibration/**/*.R"/>
</fileset>
<fileset dir="${R.private.scripts.dir}" erroronmissingdir="false">
<include name="**/analyzecovariates/**/*.R"/>
<include name="**/gatk/walkers/recalibration/**/*.R"/>
</fileset>
<manifest>
<attribute name="Main-Class" value="org.broadinstitute.sting.analyzecovariates.AnalyzeCovariates" />
</manifest>
@ -603,28 +645,7 @@
<fileset dir="${external.dir}" includes="*/build.xml" erroronmissingdir="false" />
</subant>
</target>
<!--
<target name="gatk.oneoffs.jar" depends="gatk.compile, init.jar"
description="generate the GATK oneoffs distribution" if="include.oneoffs">
<jar jarfile="${dist.dir}/CompareBAMAlignments.jar" whenmanifestonly="skip">
<fileset dir="${java.classes}">
<include name="**/tools/**/*.class" />
</fileset>
<manifest>
<attribute name="Main-Class" value="org.broadinstitute.sting.oneoffprojects.tools.CompareBAMAlignments" />
</manifest>
</jar>
<jar jarfile="${dist.dir}/SliceBams.jar" whenmanifestonly="skip">
<fileset dir="${java.classes}">
<include name="**/tools/**/*.class" />
</fileset>
<manifest>
<attribute name="Main-Class" value="org.broadinstitute.sting.playground.tools.SliceBams" />
</manifest>
</jar>
</target>
-->
<target name="scala.jar" depends="scala.compile, init.jar" if="scala.include">
<jar jarfile="${dist.dir}/GATKScala.jar">
<fileset dir="${scala.classes}">
@ -641,6 +662,12 @@
<fileset dir="${java.classes}">
<include name="org/broadinstitute/sting/queue/**/*.class" />
</fileset>
<fileset dir="${R.public.scripts.dir}">
<include name="org/broadinstitute/sting/queue/**/*.R"/>
</fileset>
<fileset dir="${R.private.scripts.dir}" erroronmissingdir="false">
<include name="org/broadinstitute/sting/queue/**/*.R"/>
</fileset>
<manifest>
<attribute name="Main-Class" value="org.broadinstitute.sting.queue.QCommandLine" />
</manifest>
@ -680,20 +707,7 @@
</jar>
</target>
<!--
<target name="gatk.oneoffs.manifests" depends="gatk.oneoffs.jar, init.manifests" if="include.oneoffs">
<jar jarfile="${dist.dir}/CompareBAMAlignments.jar" update="true" whenmanifestonly="skip">
<manifest>
<attribute name="Class-Path" value="${jar.classpath}" />
</manifest>
</jar>
<jar jarfile="${dist.dir}/SliceBams.jar" update="true" whenmanifestonly="skip">
<manifest>
<attribute name="Class-Path" value="${jar.classpath}" />
</manifest>
</jar>
</target>
-->
<target name="queue.manifests" depends="queue.jar, init.manifests" if="scala.include">
<jar jarfile="${dist.dir}/Queue.jar" update="true" >
<manifest>
@ -778,10 +792,6 @@
<pathelement location="${testng.jar}"/>
</classpath>
<compilerarg value="-proc:none"/>
<!--
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
-->
</javac>
</target>
@ -798,10 +808,6 @@
<pathelement location="${testng.jar}"/>
</classpath>
<compilerarg value="-proc:none"/>
<!--
<compilerarg value="-Acom.google.java.contract.debug"/>
<compilerarg value="-Acom.google.java.contract.dump=dump/"/>
-->
</javac>
</target>
@ -849,6 +855,9 @@
<pathelement location="${java.private.test.classes}" />
<pathelement location="${scala.public.test.classes}" />
<pathelement location="${scala.private.test.classes}" />
<pathelement location="${R.tar.dir}" />
<pathelement location="${R.public.scripts.dir}" />
<pathelement location="${R.private.scripts.dir}" />
</path>
<path id="testng.gatk.releasetest.classpath">
@ -1185,19 +1194,18 @@
</target>
<target name="clean" description="clean up" depends="clean.javadoc,clean.scaladoc,clean.gatkdocs">
<delete dir="out"/>
<delete dir="${build.dir}"/>
<delete dir="${lib.dir}"/>
<delete dir="dump"/>
<delete dir="${contract.dump.dir}"/>
<delete dir="${staging.dir}"/>
<delete dir="${dist.dir}"/>
<delete dir="pipelinetests"/>
<delete dir="${pipelinetest.dir}"/>
</target>
<!-- Build gsalib R module -->
<target name="gsalib">
<exec executable="R" failonerror="true">
<arg line="R CMD INSTALL -l public/R/ public/R/src/gsalib/" />
<arg line="R CMD INSTALL -l ${R.library.dir} ${R.public.src.dir}/${R.package.path}/gsalib" />
</exec>
</target>
</project>

View File

@ -1,5 +1,7 @@
#!/bin/env Rscript
library(tools)
args <- commandArgs(TRUE)
verbose = TRUE
@ -47,6 +49,9 @@ if( is.numeric(c$Covariate) ) {
}
dev.off()
if (exists('compactPDF')) {
compactPDF(outfile)
}
#
# Plot mean quality versus the covariate
@ -69,6 +74,10 @@ if( is.numeric(c$Covariate) ) {
}
dev.off()
if (exists('compactPDF')) {
compactPDF(outfile)
}
#
# Plot histogram of the covariate
#
@ -106,3 +115,7 @@ if( is.numeric(c$Covariate) ) {
axis(2,axTicks(2), format(axTicks(2), scientific=F))
}
dev.off()
if (exists('compactPDF')) {
compactPDF(outfile)
}

View File

@ -1,5 +1,7 @@
#!/bin/env Rscript
library(tools)
args <- commandArgs(TRUE)
input = args[1]
@ -33,6 +35,10 @@ points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
abline(0,1, lty=2)
dev.off()
if (exists('compactPDF')) {
compactPDF(outfile)
}
#
# Plot Q empirical histogram
#
@ -52,6 +58,10 @@ points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
if (exists('compactPDF')) {
compactPDF(outfile)
}
#
# Plot Q reported histogram
#
@ -68,3 +78,7 @@ plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yM
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
if (exists('compactPDF')) {
compactPDF(outfile)
}

View File

@ -1,5 +1,7 @@
#!/bin/env Rscript
library(tools)
args <- commandArgs(TRUE)
verbose = TRUE
@ -85,3 +87,7 @@ if ( ! is.null(sensitivity) ) {
}
dev.off()
if (exists('compactPDF')) {
compactPDF(outfile)
}

View File

@ -12,20 +12,20 @@ if ( onCMDLine ) {
inputFileName = args[1]
outputPDF = args[2]
} else {
#inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
inputFileName = "~/Desktop/broadLocal/GATK/unstable/wgs.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
outputPDF = NA
}
RUNTIME_UNITS = "(sec)"
ORIGINAL_UNITS_TO_SECONDS = 1/1000
RUNTIME_UNITS = "(hours)"
ORIGINAL_UNITS_TO_RUNTIME_UNITS = 1/1000/60/60
#
# Helper function to aggregate all of the jobs in the report across all tables
#
allJobsFromReport <- function(report) {
names <- c("jobName", "startTime", "analysisName", "doneTime", "exechosts")
names <- c("jobName", "startTime", "analysisName", "doneTime", "exechosts", "runtime")
sub <- lapply(report, function(table) table[,names])
do.call("rbind", sub)
}
@ -33,7 +33,7 @@ allJobsFromReport <- function(report) {
#
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
#
plotJobsGantt <- function(gatkReport, sortOverall) {
plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
allJobs = allJobsFromReport(gatkReport)
if ( sortOverall ) {
title = "All jobs, by analysis, by start time"
@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) {
}
allJobs$index = 1:nrow(allJobs)
minTime = min(allJobs$startTime)
allJobs$relStartTime = allJobs$startTime - minTime
allJobs$relStartTime = allJobs$startTime - minTime
allJobs$relDoneTime = allJobs$doneTime - minTime
allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
maxRelTime = max(allJobs$relDoneTime)
p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + theme_bw()
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
if ( includeText )
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + xlim(0, maxRelTime * 1.1)
p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
p <- p + ylab("Job")
p <- p + xlab(paste("Start time, relative to first job", RUNTIME_UNITS))
p <- p + ylab("Job number")
p <- p + opts(title=title)
print(p)
}
@ -119,7 +121,7 @@ plotGroup <- function(groupTable) {
if ( length(groupAnnotations) == 1 && dim(sub)[1] > 1 ) {
# todo -- how do we group by annotations?
p <- ggplot(data=sub, aes(x=runtime)) + geom_histogram()
p <- p + xlab("runtime in seconds") + ylab("No. of jobs")
p <- p + xlab(paste("runtime", RUNTIME_UNITS)) + ylab("No. of jobs")
p <- p + opts(title=paste("Job runtime histogram for", name))
print(p)
}
@ -139,9 +141,9 @@ print(paste("Project :", inputFileName))
convertUnits <- function(gatkReportData) {
convertGroup <- function(g) {
g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS
g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS
g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS
g$runtime = g$runtime * ORIGINAL_UNITS_TO_RUNTIME_UNITS
g$startTime = g$startTime * ORIGINAL_UNITS_TO_RUNTIME_UNITS
g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_RUNTIME_UNITS
g
}
lapply(gatkReportData, convertGroup)
@ -157,8 +159,8 @@ if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}
plotJobsGantt(gatkReportData, T)
plotJobsGantt(gatkReportData, F)
plotJobsGantt(gatkReportData, T, F)
plotJobsGantt(gatkReportData, F, F)
plotProgressByTime(gatkReportData)
for ( group in gatkReportData ) {
plotGroup(group)

View File

@ -99,5 +99,5 @@ gsa.read.gatkreport <- function(filename) {
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
}
gatkreport = as.list(tableEnv);
gatkreport = as.list(tableEnv, all.names=TRUE);
}

View File

Before

Width:  |  Height:  |  Size: 49 KiB

After

Width:  |  Height:  |  Size: 49 KiB

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.alignment;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
@ -35,6 +34,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Iterator;
@ -72,12 +72,13 @@ public class AlignmentValidationWalker extends ReadWalker<Integer,Integer> {
/**
* Aligns a read to the given reference.
*
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
* @param read Read to align.
* @return Number of reads aligned by this map (aka 1).
*/
@Override
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
//logger.info(String.format("examining read %s", read.getReadName()));
byte[] bases = read.getReadBases();

View File

@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.File;
@ -92,12 +93,13 @@ public class AlignmentWalker extends ReadWalker<Integer,Integer> {
/**
* Aligns a read to the given reference.
*
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
* @param read Read to align.
* @return Number of alignments found for this read.
*/
@Override
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
SAMRecord alignedRead = aligner.align(read,header);
out.addAlignment(alignedRead);
return 1;

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.alignment;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
@ -34,6 +33,7 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
import java.util.Iterator;
@ -79,12 +79,13 @@ public class CountBestAlignmentsWalker extends ReadWalker<Integer,Integer> {
/**
* Aligns a read to the given reference.
*
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
* @param read Read to align.
* @return Number of alignments found for this read.
*/
@Override
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
if(alignmentIterator.hasNext()) {
int numAlignments = alignmentIterator.next().length;

View File

@ -25,6 +25,9 @@
package org.broadinstitute.sting.analyzecovariates;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.CommandLineProgram;
@ -33,14 +36,16 @@ import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.sting.utils.R.RScriptExecutor;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.io.Resource;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.regex.Pattern;
@ -71,15 +76,13 @@ import java.util.regex.Pattern;
* </ul>
*
* <p>
* NOTE: For those running this tool externally from the Broad, it is crucial to note that both the -Rscript and -resources options
* must be changed from the default. -Rscript needs to point to your installation of Rscript (this is the scripting version of R,
* not the interactive version) while -resources needs to point to the folder holding the R scripts that are used. For those using
* this tool as part of the Binary Distribution the -resources should point to the resources folder that is part of the tarball.
* For those using this tool by building from the git repository the -resources should point to the R/ subdirectory of the Sting checkout.
* NOTE: Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version).
* See <a target="r-project" href="http://www.r-project.org">http://www.r-project.org</a> for more info on how to download and install R.
*
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
* <a target="gatkwiki" href="http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration"
* >http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration</a>
*
* <h2>Input</h2>
* <p>
@ -91,7 +94,6 @@ import java.util.regex.Pattern;
* java -Xmx4g -jar AnalyzeCovariates.jar \
* -recalFile /path/to/recal.table.csv \
* -outputDir /path/to/output_dir/ \
* -resources resources/ \
* -ignoreQ 5
* </pre>
*
@ -101,6 +103,11 @@ import java.util.regex.Pattern;
groupName = "AnalyzeCovariates",
summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
public class AnalyzeCovariates extends CommandLineProgram {
final private static Logger logger = Logger.getLogger(AnalyzeCovariates.class);
private static final String PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE = "plot_residualError_QualityScoreCovariate.R";
private static final String PLOT_RESDIUAL_ERROR_OTHER_COVARIATE = "plot_residualError_OtherCovariate.R";
private static final String PLOT_INDEL_QUALITY_RSCRIPT = "plot_indelQuality.R";
/////////////////////////////
// Command Line Arguments
@ -114,11 +121,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
@Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false)
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
private String OUTPUT_DIR = "analyzeCovariates/";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/software/free/Linux/redhat_5_x86_64/pkgs/r_2.12.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "public/R/";
private File OUTPUT_DIR = new File("analyzeCovariates");
@Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false)
private int IGNORE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
@ -154,29 +157,26 @@ public class AnalyzeCovariates extends CommandLineProgram {
protected int execute() {
// create the output directory where all the data tables and plots will go
try {
Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR);
} catch (IOException e) {
System.out.println("Couldn't create directory: " + OUTPUT_DIR);
System.out.println("User is responsible for making sure the output directory exists.");
}
if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; }
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
if (!OUTPUT_DIR.exists() && !OUTPUT_DIR.mkdirs())
throw new UserException.BadArgumentValue("--output_dir/-outDir", "Unable to create output directory: " + OUTPUT_DIR);
if (!RScriptExecutor.RSCRIPT_EXISTS)
Utils.warnUser(logger, "Rscript not found in environment path. Plots will not be generated.");
// initialize all the data from the csv file and allocate the list of covariates
System.out.println("Reading in input csv file...");
logger.info("Reading in input csv file...");
initializeData();
System.out.println("...Done!");
logger.info("...Done!");
// output data tables for Rscript to read in
System.out.println("Writing out intermediate tables for R...");
logger.info("Writing out intermediate tables for R...");
writeDataTables();
System.out.println("...Done!");
logger.info("...Done!");
// perform the analysis using Rscript and output the plots
System.out.println("Calling analysis R scripts and writing out figures...");
logger.info("Calling analysis R scripts and writing out figures...");
callRScripts();
System.out.println("...Done!");
logger.info("...Done!");
return 0;
}
@ -287,37 +287,40 @@ public class AnalyzeCovariates extends CommandLineProgram {
if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) {
String readGroup = readGroupKey.toString();
RecalDatum readGroupDatum = (RecalDatum) dataManager.getCollapsedTable(0).data.get(readGroupKey);
System.out.print("Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum.getNumObservations() + " observations" );
System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported()));
logger.info(String.format(
"Writing out data tables for read group: %s\twith %s observations\tand aggregate residual error = %.3f",
readGroup, readGroupDatum.getNumObservations(),
readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported()));
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
Covariate cov = requestedCovariates.get(iii);
// Create a PrintStream
PrintStream output = null;
File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
PrintStream output;
try {
output = new PrintStream(new FileOutputStream(OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat"));
} catch (FileNotFoundException e) {
System.err.println("Can't create file: " + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
System.exit(-1);
output = new PrintStream(FileUtils.openOutputStream(outputFile));
} catch (IOException e) {
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
}
// Output the header
output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");
try {
// Output the header
output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");
for( Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet()) {
output.print( covariateKey.toString() + "\t" ); // Covariate
RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey);
output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical
output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
output.println( thisDatum.getNumObservations() ); // nBases
for( Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet()) {
output.print( covariateKey.toString() + "\t" ); // Covariate
RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey);
output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical
output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
output.println( thisDatum.getNumObservations() ); // nBases
}
} finally {
// Close the PrintStream
IOUtils.closeQuietly(output);
}
// Close the PrintStream
output.close();
}
} else {
break;
@ -327,10 +330,6 @@ public class AnalyzeCovariates extends CommandLineProgram {
}
private void callRScripts() {
RScriptExecutor.RScriptArgumentCollection argumentCollection =
new RScriptExecutor.RScriptArgumentCollection(PATH_TO_RSCRIPT, Arrays.asList(PATH_TO_RESOURCES));
RScriptExecutor executor = new RScriptExecutor(argumentCollection, true);
int numReadGroups = 0;
// for each read group
@ -338,23 +337,32 @@ public class AnalyzeCovariates extends CommandLineProgram {
if(++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) {
String readGroup = readGroupKey.toString();
System.out.println("Analyzing read group: " + readGroup);
logger.info("Analyzing read group: " + readGroup);
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
Covariate cov = requestedCovariates.get(iii);
final String outputFilename = OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat";
final File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
if (DO_INDEL_QUALITY) {
executor.callRScripts("plot_indelQuality.R", outputFilename,
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
RScriptExecutor executor = new RScriptExecutor();
executor.addScript(new Resource(PLOT_INDEL_QUALITY_RSCRIPT, AnalyzeCovariates.class));
// The second argument is the name of the covariate in order to make the plots look nice
executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
executor.exec();
} else {
if( iii == 1 ) {
// Analyze reported quality
executor.callRScripts("plot_residualError_QualityScoreCovariate.R", outputFilename,
IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
RScriptExecutor executor = new RScriptExecutor();
executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE, AnalyzeCovariates.class));
// The second argument is the Q scores that should be turned pink in the plot because they were ignored
executor.addArgs(outputFile, IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE);
executor.exec();
} else { // Analyze all other covariates
executor.callRScripts("plot_residualError_OtherCovariate.R", outputFilename,
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
RScriptExecutor executor = new RScriptExecutor();
executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_OTHER_COVARIATE, AnalyzeCovariates.class));
// The second argument is the name of the covariate in order to make the plots look nice
executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
executor.exec();
}
}
}

View File

@ -46,7 +46,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
/**
* Maps indices of command line arguments to values paired with that argument.
*/
public final SortedMap<Integer,List<String>> indices = new TreeMap<Integer,List<String>>();
public final SortedMap<ArgumentMatchSite,List<String>> sites = new TreeMap<ArgumentMatchSite,List<String>>();
/**
* An ordered, freeform collection of tags.
@ -72,32 +72,32 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
}
/**
* A simple way of indicating that an argument with the given label and definition exists at this index.
* A simple way of indicating that an argument with the given label and definition exists at this site.
* @param label Label of the argument match. Must not be null.
* @param definition The associated definition, if one exists. May be null.
* @param index Position of the argument. Must not be null.
* @param site Position of the argument. Must not be null.
* @param tags ordered freeform text tags associated with this argument.
*/
public ArgumentMatch(final String label, final ArgumentDefinition definition, final int index, final Tags tags) {
this( label, definition, index, null, tags );
public ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final Tags tags) {
this( label, definition, site, null, tags );
}
/**
* A simple way of indicating that an argument with the given label and definition exists at this index.
* A simple way of indicating that an argument with the given label and definition exists at this site.
* @param label Label of the argument match. Must not be null.
* @param definition The associated definition, if one exists. May be null.
* @param index Position of the argument. Must not be null.
* @param site Position of the argument. Must not be null.
* @param value Value for the argument at this position.
* @param tags ordered freeform text tags associated with this argument.
*/
private ArgumentMatch(final String label, final ArgumentDefinition definition, final int index, final String value, final Tags tags) {
private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final String value, final Tags tags) {
this.label = label;
this.definition = definition;
ArrayList<String> values = new ArrayList<String>();
if( value != null )
values.add(value);
indices.put(index,values );
sites.put(site,values );
this.tags = tags;
}
@ -117,7 +117,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
ArgumentMatch otherArgumentMatch = (ArgumentMatch)other;
return this.definition.equals(otherArgumentMatch.definition) &&
this.label.equals(otherArgumentMatch.label) &&
this.indices.equals(otherArgumentMatch.indices) &&
this.sites.equals(otherArgumentMatch.sites) &&
this.tags.equals(otherArgumentMatch.tags);
}
@ -129,16 +129,17 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
* @param key Key which specifies the transform.
* @return A variant of this ArgumentMatch with all keys transformed.
*/
@SuppressWarnings("unchecked")
ArgumentMatch transform(Multiplexer multiplexer, Object key) {
SortedMap<Integer,List<String>> newIndices = new TreeMap<Integer,List<String>>();
for(Map.Entry<Integer,List<String>> index: indices.entrySet()) {
SortedMap<ArgumentMatchSite,List<String>> newIndices = new TreeMap<ArgumentMatchSite,List<String>>();
for(Map.Entry<ArgumentMatchSite,List<String>> site: sites.entrySet()) {
List<String> newEntries = new ArrayList<String>();
for(String entry: index.getValue())
for(String entry: site.getValue())
newEntries.add(multiplexer.transformArgument(key,entry));
newIndices.put(index.getKey(),newEntries);
newIndices.put(site.getKey(),newEntries);
}
ArgumentMatch newArgumentMatch = new ArgumentMatch(label,definition);
newArgumentMatch.indices.putAll(newIndices);
newArgumentMatch.sites.putAll(newIndices);
return newArgumentMatch;
}
@ -157,9 +158,9 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
public Iterator<ArgumentMatch> iterator() {
return new Iterator<ArgumentMatch>() {
/**
* Iterate over each the available index.
* Iterate over each available site.
*/
private Iterator<Integer> indexIterator = null;
private Iterator<ArgumentMatchSite> siteIterator = null;
/**
* Iterate over each available token.
@ -167,9 +168,9 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
private Iterator<String> tokenIterator = null;
/**
* The next index to return. Null if none remain.
* The next site to return. Null if none remain.
*/
Integer nextIndex = null;
ArgumentMatchSite nextSite = null;
/**
* The next token to return. Null if none remain.
@ -177,7 +178,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
String nextToken = null;
{
indexIterator = indices.keySet().iterator();
siteIterator = sites.keySet().iterator();
prepareNext();
}
@ -186,7 +187,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
* @return True if there's another token waiting in the wings. False otherwise.
*/
public boolean hasNext() {
return nextToken != null;
return nextToken != null;
}
/**
@ -194,32 +195,32 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
* @return The next ArgumentMatch in the series. Should never be null.
*/
public ArgumentMatch next() {
if( nextIndex == null || nextToken == null )
if( nextSite == null || nextToken == null )
throw new IllegalStateException( "No more ArgumentMatches are available" );
ArgumentMatch match = new ArgumentMatch( label, definition, nextIndex, nextToken, tags );
ArgumentMatch match = new ArgumentMatch( label, definition, nextSite, nextToken, tags );
prepareNext();
return match;
}
/**
* Initialize the next ArgumentMatch to return. If no ArgumentMatches are available,
* initialize nextIndex / nextToken to null.
* initialize nextSite / nextToken to null.
*/
private void prepareNext() {
if( tokenIterator != null && tokenIterator.hasNext() ) {
nextToken = tokenIterator.next();
}
else {
nextIndex = null;
nextSite = null;
nextToken = null;
// Do a nested loop. While more data is present in the inner loop, grab that data.
// Otherwise, troll the outer iterator looking for more data.
while( indexIterator.hasNext() ) {
nextIndex = indexIterator.next();
if( indices.get(nextIndex) != null ) {
tokenIterator = indices.get(nextIndex).iterator();
while( siteIterator.hasNext() ) {
nextSite = siteIterator.next();
if( sites.get(nextSite) != null ) {
tokenIterator = sites.get(nextSite).iterator();
if( tokenIterator.hasNext() ) {
nextToken = tokenIterator.next();
break;
@ -245,29 +246,29 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
* @param other The other match to merge into.
*/
public void mergeInto( ArgumentMatch other ) {
indices.putAll(other.indices);
sites.putAll(other.sites);
}
/**
* Associate a value with this merge mapping.
* @param index index of the command-line argument to which this value is mated.
* @param site site of the command-line argument to which this value is mated.
* @param value Text representation of value to add.
*/
public void addValue( int index, String value ) {
if( !indices.containsKey(index) || indices.get(index) == null )
indices.put(index, new ArrayList<String>() );
indices.get(index).add(value);
public void addValue( ArgumentMatchSite site, String value ) {
if( !sites.containsKey(site) || sites.get(site) == null )
sites.put(site, new ArrayList<String>() );
sites.get(site).add(value);
}
/**
* Does this argument already have a value at the given site?
* Arguments are only allowed to be single-valued per site, and
* flags aren't allowed a value at all.
* @param index Index at which to check for values.
* @param site Site at which to check for values.
* @return True if the argument has a value at the given site. False otherwise.
*/
public boolean hasValueAtSite( int index ) {
return (indices.get(index) != null && indices.get(index).size() >= 1) || isArgumentFlag();
public boolean hasValueAtSite( ArgumentMatchSite site ) {
return (sites.get(site) != null && sites.get(site).size() >= 1) || isArgumentFlag();
}
/**
@ -276,9 +277,9 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
*/
public List<String> values() {
List<String> values = new ArrayList<String>();
for( int index: indices.keySet() ) {
if( indices.get(index) != null )
values.addAll(indices.get(index));
for( ArgumentMatchSite site: sites.keySet() ) {
if( sites.get(site) != null )
values.addAll(sites.get(site));
}
return values;
}

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.commandline;
/**
 * Which source and the index within the source where an argument match was found.
 *
 * Immutable value object. Sites are ordered by source first (command line
 * before files, per ArgumentMatchSource), then by index within the source.
 */
public class ArgumentMatchSite implements Comparable<ArgumentMatchSite> {
    /** Where the argument was found: the command line or an argument file. */
    private final ArgumentMatchSource source;

    /** Position of the argument within its source. */
    private final int index;

    /**
     * @param source Source in which the argument was found.
     * @param index Position of the argument within that source.
     */
    public ArgumentMatchSite(ArgumentMatchSource source, int index) {
        this.source = source;
        this.index = index;
    }

    public ArgumentMatchSource getSource() {
        return source;
    }

    public int getIndex() {
        return index;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        ArgumentMatchSite that = (ArgumentMatchSite) o;
        return (index == that.index) && (source == null ? that.source == null : source.equals(that.source));
    }

    @Override
    public int hashCode() {
        int result = source != null ? source.hashCode() : 0;
        // Generated by intellij. No other special reason to this implementation. -ks
        result = 31 * result + index;
        return result;
    }

    /**
     * Orders sites by source, then by index within the source.
     *
     * Null-safe on the source: equals() permits a null source, so compareTo()
     * must not throw on one to keep the ordering consistent with equals.
     * A null source sorts before any non-null source.
     */
    @Override
    public int compareTo(ArgumentMatchSite that) {
        int comp;
        if (this.source == null || that.source == null)
            comp = (this.source == null ? 0 : 1) - (that.source == null ? 0 : 1);
        else
            comp = this.source.compareTo(that.source);
        if (comp != 0)
            return comp;

        // Both sources are the same; break the tie on the index.
        if (this.index == that.index)
            return 0;
        return this.index < that.index ? -1 : 1;
    }
}

View File

@ -0,0 +1,98 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.commandline;
import java.io.File;
/**
 * Where an argument match originated, via the commandline or a file.
 *
 * Immutable. The well-known COMMAND_LINE instance represents arguments typed
 * directly on the command line; all other instances wrap an argument file.
 */
public class ArgumentMatchSource implements Comparable<ArgumentMatchSource> {
    /** The single source representing the command line itself (no backing file). */
    public static final ArgumentMatchSource COMMAND_LINE = new ArgumentMatchSource(ArgumentMatchSourceType.CommandLine, null);

    private final ArgumentMatchSourceType type;
    private final File file;

    /**
     * Creates an argument match source from the specified file.
     * @param file File specifying the arguments. Must not be null.
     */
    public ArgumentMatchSource(File file) {
        this(ArgumentMatchSourceType.File, file);
    }

    private ArgumentMatchSource(ArgumentMatchSourceType type, File file) {
        // A file-typed source without a file would make compareTo/equals meaningless.
        if (type == ArgumentMatchSourceType.File && file == null)
            throw new IllegalArgumentException("An argument match source of type File cannot have a null file.");
        this.type = type;
        this.file = file;
    }

    public ArgumentMatchSourceType getType() {
        return type;
    }

    public File getFile() {
        return file;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o)
            return true;
        if (o == null || getClass() != o.getClass())
            return false;

        ArgumentMatchSource other = (ArgumentMatchSource) o;
        if (type != other.type)
            return false;
        if (file == null)
            return other.file == null;
        return file.equals(other.file);
    }

    @Override
    public int hashCode() {
        int hash = (type == null) ? 0 : type.hashCode();
        hash = 31 * hash + ((file == null) ? 0 : file.hashCode());
        return hash;
    }

    /**
     * Compares two sources, putting the command line first, then files.
     * Within the same type, a null file sorts before any non-null file.
     */
    @Override
    public int compareTo(ArgumentMatchSource that) {
        int byType = this.type.compareTo(that.type);
        if (byType != 0)
            return byType;

        // Same type: order by backing file, null file first.
        if (this.file == null && that.file == null)
            return 0;
        if (this.file == null)
            return -1;
        if (that.file == null)
            return 1;
        return this.file.compareTo(that.file);
    }
}

View File

@ -0,0 +1,32 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.commandline;
/**
 * Type of where an argument match originated, via the commandline or a file.
 *
 * Used by ArgumentMatchSource to distinguish arguments passed directly on the
 * command line from arguments read out of an argument file.
 */
public enum ArgumentMatchSourceType {
    // CommandLine: the argument was typed directly on the command line.
    // File: the argument was read from an argument file.
    CommandLine, File
}

View File

@ -37,7 +37,7 @@ public class ArgumentMatches implements Iterable<ArgumentMatch> {
* Collection matches from argument definition to argument value.
* Package protected access is deliberate.
*/
Map<Integer,ArgumentMatch> argumentMatches = new TreeMap<Integer,ArgumentMatch>();
Map<ArgumentMatchSite,ArgumentMatch> argumentMatches = new TreeMap<ArgumentMatchSite,ArgumentMatch>();
/**
* Provide a place to put command-line argument values that don't seem to belong to
@ -80,7 +80,7 @@ public class ArgumentMatches implements Iterable<ArgumentMatch> {
* @param site Site at which to check.
* @return True if the site has a match. False otherwise.
*/
boolean hasMatch( int site ) {
boolean hasMatch( ArgumentMatchSite site ) {
return argumentMatches.containsKey( site );
}
@ -90,7 +90,7 @@ public class ArgumentMatches implements Iterable<ArgumentMatch> {
* @return The match present at the given site.
* @throws IllegalArgumentException if site does not contain a match.
*/
ArgumentMatch getMatch( int site ) {
ArgumentMatch getMatch( ArgumentMatchSite site ) {
if( !argumentMatches.containsKey(site) )
throw new IllegalArgumentException( "Site does not contain an argument: " + site );
return argumentMatches.get(site);
@ -107,6 +107,7 @@ public class ArgumentMatches implements Iterable<ArgumentMatch> {
/**
* Return all argument matches of this source.
* @param parsingEngine Parsing engine.
* @param argumentSource Argument source to match.
* @return List of all matches.
*/
@ -167,6 +168,7 @@ public class ArgumentMatches implements Iterable<ArgumentMatch> {
* TODO: Generify this.
* @param multiplexer Multiplexer that controls the transformation process.
* @param key Key which specifies the transform.
* @return new argument matches.
*/
ArgumentMatches transform(Multiplexer multiplexer, Object key) {
ArgumentMatches newArgumentMatches = new ArgumentMatches();
@ -187,15 +189,15 @@ public class ArgumentMatches implements Iterable<ArgumentMatch> {
for( ArgumentMatch argumentMatch: getUniqueMatches() ) {
if( argumentMatch.definition == match.definition && argumentMatch.tags.equals(match.tags) ) {
argumentMatch.mergeInto( match );
for( int index: match.indices.keySet() )
argumentMatches.put( index, argumentMatch );
for( ArgumentMatchSite site: match.sites.keySet() )
argumentMatches.put( site, argumentMatch );
definitionExists = true;
}
}
if( !definitionExists ) {
for( int index: match.indices.keySet() )
argumentMatches.put( index, match );
for( ArgumentMatchSite site: match.sites.keySet() )
argumentMatches.put( site, match );
}
}

View File

@ -336,6 +336,28 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
@Override
public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) {
return parse(parsingEngine, source, type, matches, false);
}
/**
* The actual argument parsing method.
*
* IMPORTANT NOTE: the createIntervalBinding argument is a bit of a hack, but after discussions with SE we've decided
* that it's the best way to proceed for now. IntervalBindings can either be proper RodBindings (hence the use of
* this parse() method) or can be Strings (representing raw intervals or the files containing them). If createIntervalBinding
* is true, we do not call parsingEngine.addRodBinding() because we don't want walkers to assume that these are the
* usual set of RodBindings. It also allows us in the future to be smart about tagging rods as intervals. One other
* side point is that we want to continue to allow the usage of non-Feature intervals so that users can theoretically
* continue to input them out of order (whereas Tribble Features are ordered).
*
* @param parsingEngine parsing engine
* @param source source
* @param type type to check
* @param matches matches
* @param createIntervalBinding should we attempt to create an IntervalBinding instead of a RodBinding?
* @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding.
*/
public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches, boolean createIntervalBinding) {
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
String value = getArgumentValue( defaultDefinition, matches );
Class<? extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type);
@ -348,7 +370,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
if ( tags.getPositionalTags().size() > 2 ) {
throw new UserException.CommandLineException(
String.format("Unexpected number of positional tags for argument %s : %s. " +
"Rod bindings only suport -X:type and -X:name,type argument styles",
"Rod bindings only support -X:type and -X:name,type argument styles",
value, source.field.getName()));
} if ( tags.getPositionalTags().size() == 2 ) {
// -X:name,type style
@ -378,7 +400,12 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
}
if ( tribbleType == null )
if ( tribbleType == null ) {
// IntervalBindings allow streaming conversion of Strings
if ( createIntervalBinding ) {
return new IntervalBinding(value);
}
if ( ! file.exists() ) {
throw new UserException.CouldNotReadInputFile(file, "file does not exist");
} else if ( ! file.canRead() || ! file.isFile() ) {
@ -389,13 +416,20 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
"Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
manager.userFriendlyListOfAvailableFeatures(parameterType)));
}
}
}
}
Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
RodBinding result = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags);
parsingEngine.addTags(result,tags);
parsingEngine.addRodBinding(result);
Object result;
if ( createIntervalBinding ) {
result = ctor.newInstance(parameterType, name, value, tribbleType, tags);
} else {
RodBinding rbind = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags);
parsingEngine.addTags(rbind, tags);
parsingEngine.addRodBinding(rbind);
result = rbind;
}
return result;
} catch (InvocationTargetException e) {
throw new UserException.CommandLineException(
@ -409,6 +443,39 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
}
/**
* Parser for RodBinding objects
*/
class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
/**
* We only want IntervalBinding class objects
* @param type The type to check.
* @return true if the provided class is an IntervalBinding.class
*/
@Override
public boolean supports( Class type ) {
return isIntervalBinding(type);
}
public static boolean isIntervalBinding( Class type ) {
return IntervalBinding.class.isAssignableFrom(type);
}
/**
* See note from RodBindingArgumentTypeDescriptor.parse().
*
* @param parsingEngine parsing engine
* @param source source
* @param type type to check
* @param matches matches
* @return the IntervalBinding object.
*/
@Override
public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) {
return new RodBindingArgumentTypeDescriptor().parse(parsingEngine, source, type, matches, true);
}
}
/**
* Parse simple argument types: java primitives, wrapper classes, and anything that has
* a simple String constructor.
@ -416,7 +483,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
@Override
public boolean supports( Class type ) {
if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) ) return false;
if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) || IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) ) return false;
if ( type.isPrimitive() ) return true;
if ( type.isEnum() ) return true;
if ( primitiveToWrapperMap.containsValue(type) ) return true;

View File

@ -35,10 +35,7 @@ import org.broadinstitute.sting.utils.help.ApplicationDetails;
import org.broadinstitute.sting.utils.help.HelpFormatter;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Locale;
import java.util.*;
public abstract class CommandLineProgram {
@ -155,6 +152,7 @@ public abstract class CommandLineProgram {
*
* @param clp the command line program to execute
* @param args the command line arguments passed in
* @param dryRun dry run
* @throws Exception when an exception occurs
*/
@SuppressWarnings("unchecked")
@ -176,6 +174,8 @@ public abstract class CommandLineProgram {
ParsingEngine parser = clp.parser = new ParsingEngine(clp);
parser.addArgumentSource(clp.getClass());
Map<ArgumentMatchSource, List<String>> parsedArgs;
// process the args
if (clp.canAddArgumentsDynamically()) {
// if the command-line program can toss in extra args, fetch them and reparse the arguments.
@ -196,14 +196,14 @@ public abstract class CommandLineProgram {
Class[] argumentSources = clp.getArgumentSources();
for (Class argumentSource : argumentSources)
parser.addArgumentSource(clp.getArgumentSourceName(argumentSource), argumentSource);
parser.parse(args);
parsedArgs = parser.parse(args);
if (isHelpPresent(parser))
printHelpAndExit(clp, parser);
if ( ! dryRun ) parser.validate();
} else {
parser.parse(args);
parsedArgs = parser.parse(args);
if ( ! dryRun ) {
if (isHelpPresent(parser))
@ -230,7 +230,7 @@ public abstract class CommandLineProgram {
}
// regardless of what happens next, generate the header information
HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), args);
HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), parsedArgs);
// call the execute
CommandLineProgram.result = clp.execute();

View File

@ -0,0 +1,108 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.commandline;
import com.google.java.contract.Requires;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
/**
 * An IntervalBinding representing a walker argument that gets bound to either a ROD track or interval string.
 *
 * The IntervalBinding<T> is a formal GATK argument that bridges between a walker and
 * the engine to construct intervals for traversal at runtime.  The IntervalBinding can
 * either be a RodBinding<T>, a string of one or more intervals, or a file with interval strings.
 * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it.
 *
 * Note that this class is immutable.
 */
public final class IntervalBinding<T extends Feature> {

    // Exactly one of these two fields is non-null, depending on which constructor was invoked.
    private RodBinding<T> featureIntervals;
    private String stringIntervals;

    @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"})
    public IntervalBinding(Class<T> type, final String rawName, final String source, final String tribbleType, final Tags tags) {
        featureIntervals = new RodBinding<T>(type, rawName, source, tribbleType, tags);
    }

    @Requires({"intervalArgument != null"})
    public IntervalBinding(String intervalArgument) {
        stringIntervals = intervalArgument;
    }

    /**
     * @return the source of this binding: the feature file path when bound to a ROD track,
     *         otherwise the raw interval string supplied on the command line.
     */
    public String getSource() {
        if ( featureIntervals != null )
            return featureIntervals.getSource();
        return stringIntervals;
    }

    /**
     * Materializes this binding into concrete genome locations.
     *
     * When bound to a feature track, the track's file is decoded record-by-record and each
     * record's location is converted via the engine's GenomeLocParser.  When bound to an
     * interval string, the string is parsed directly.
     *
     * @param toolkit the engine providing the GenomeLocParser used to build locations
     * @return the list of intervals described by this binding
     * @throws UserException if the backing interval file cannot be read
     */
    public List<GenomeLoc> getIntervals(GenomeAnalysisEngine toolkit) {
        if ( featureIntervals == null ) {
            return IntervalUtils.parseIntervalArguments(toolkit.getGenomeLocParser(), stringIntervals);
        }

        List<GenomeLoc> intervals = new ArrayList<GenomeLoc>();

        // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
        FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
        if ( codec instanceof ReferenceDependentFeatureCodec )
            ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());

        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(featureIntervals.getSource()));
            AsciiLineReader lineReader = new AsciiLineReader(fis);
            codec.readHeader(lineReader);
            String line = lineReader.readLine();
            while ( line != null ) {
                intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(codec.decodeLoc(line)));
                line = lineReader.readLine();
            }
        } catch (IOException e) {
            throw new UserException("Problem reading the interval file " + featureIntervals.getSource() + "; " + e.getMessage());
        } finally {
            // Fix: the original leaked the file handle on every path; always release it.
            if ( fis != null ) {
                try {
                    fis.close();
                } catch (IOException e) {
                    // non-fatal: the intervals have already been read at this point
                }
            }
        }

        return intervals;
    }
}

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.commandline;
import com.google.java.contract.Requires;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.JVMUtils;
@ -35,6 +36,8 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.ApplicationDetails;
import org.broadinstitute.sting.utils.help.HelpFormatter;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.*;
@ -75,6 +78,7 @@ public class ParsingEngine {
* The type of set used must be ordered (but not necessarily sorted).
*/
private static final Set<ArgumentTypeDescriptor> STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet<ArgumentTypeDescriptor>( Arrays.asList(new SimpleArgumentTypeDescriptor(),
new IntervalBindingArgumentTypeDescriptor(),
new RodBindingArgumentTypeDescriptor(),
new CompoundArgumentTypeDescriptor(),
new MultiplexArgumentTypeDescriptor()) );
@ -100,6 +104,8 @@ public class ParsingEngine {
if(clp != null)
argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors());
argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS);
addArgumentSource(ParsingEngineArgumentFiles.class);
}
/**
@ -148,21 +154,43 @@ public class ParsingEngine {
* command-line arguments to the arguments that are actually
* required.
* @param tokens Tokens passed on the command line.
* @return The parsed arguments by file.
*/
public void parse( String[] tokens ) {
public SortedMap<ArgumentMatchSource, List<String>> parse( String[] tokens ) {
argumentMatches = new ArgumentMatches();
SortedMap<ArgumentMatchSource, List<String>> parsedArgs = new TreeMap<ArgumentMatchSource, List<String>>();
int lastArgumentMatchSite = -1;
List<String> cmdLineTokens = Arrays.asList(tokens);
parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs);
for( int i = 0; i < tokens.length; i++ ) {
String token = tokens[i];
ParsingEngineArgumentFiles argumentFiles = new ParsingEngineArgumentFiles();
// Load the arguments ONLY into the argument files.
// Validation may optionally run on the rest of the arguments.
loadArgumentsIntoObject(argumentFiles);
for (File file: argumentFiles.files) {
List<String> fileTokens = getArguments(file);
parse(new ArgumentMatchSource(file), fileTokens, argumentMatches, parsedArgs);
}
return parsedArgs;
}
private void parse(ArgumentMatchSource matchSource, List<String> tokens,
ArgumentMatches argumentMatches, SortedMap<ArgumentMatchSource, List<String>> parsedArgs) {
ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1);
int i = 0;
for (String token: tokens) {
// If the token is of argument form, parse it into its own argument match.
// Otherwise, pair it with the most recently used argument discovered.
ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i);
if( isArgumentForm(token) ) {
ArgumentMatch argumentMatch = parseArgument( token, i );
ArgumentMatch argumentMatch = parseArgument( token, site );
if( argumentMatch != null ) {
argumentMatches.mergeInto( argumentMatch );
lastArgumentMatchSite = i;
lastArgumentMatchSite = site;
}
}
else {
@ -170,10 +198,31 @@ public class ParsingEngine {
!argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite))
argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, token );
else
argumentMatches.MissingArgument.addValue( i, token );
argumentMatches.MissingArgument.addValue( site, token );
}
i++;
}
parsedArgs.put(matchSource, tokens);
}
/**
 * Reads the argument tokens stored in an argument file.
 * Only files whose path ends in ".list" are accepted; anything else is rejected.
 *
 * @param file an argument file named via the arg_file option
 * @return the tokens read from the file
 * @throws UserException.CouldNotReadInputFile if the extension is wrong or reading fails
 */
private List<String> getArguments(File file) {
    if (!file.getAbsolutePath().endsWith(".list"))
        throw new UserException.CouldNotReadInputFile(file, "file extension is not .list");
    try {
        return getListArguments(file);
    } catch (IOException e) {
        throw new UserException.CouldNotReadInputFile(file, e);
    }
}
/**
 * Tokenizes each line of a .list argument file.
 * Lines are split with Utils.escapeExpressions so quoted expressions survive intact.
 *
 * @param file the .list file to read
 * @return every token from every line, in file order
 * @throws IOException if the file cannot be read
 */
private List<String> getListArguments(File file) throws IOException {
    final List<String> arguments = new ArrayList<String>();
    for (final String line : FileUtils.readLines(file)) {
        Collections.addAll(arguments, Utils.escapeExpressions(line));
    }
    return arguments;
}
public enum ValidationType { MissingRequiredArgument,
@ -494,7 +543,7 @@ public class ParsingEngine {
* @param position The position of the token in question.
* @return ArgumentMatch associated with this token, or null if no match exists.
*/
private ArgumentMatch parseArgument( String token, int position ) {
private ArgumentMatch parseArgument( String token, ArgumentMatchSite position ) {
if( !isArgumentForm(token) )
throw new IllegalArgumentException( "Token is not recognizable as an argument: " + token );
@ -579,9 +628,21 @@ class UnmatchedArgumentException extends ArgumentException {
private static String formatArguments( ArgumentMatch invalidValues ) {
StringBuilder sb = new StringBuilder();
for( int index: invalidValues.indices.keySet() )
for( String value: invalidValues.indices.get(index) ) {
sb.append( String.format("%nInvalid argument value '%s' at position %d.", value, index) );
for( ArgumentMatchSite site: invalidValues.sites.keySet() )
for( String value: invalidValues.sites.get(site) ) {
switch (site.getSource().getType()) {
case CommandLine:
sb.append( String.format("%nInvalid argument value '%s' at position %d.",
value, site.getIndex()) );
break;
case File:
sb.append( String.format("%nInvalid argument value '%s' in file %s at position %d.",
value, site.getSource().getFile().getAbsolutePath(), site.getIndex()) );
break;
default:
throw new RuntimeException( String.format("Unexpected argument match source type: %s",
site.getSource().getType()));
}
if(value != null && Utils.dupString(' ',value.length()).equals(value))
sb.append(" Please make sure any line continuation backslashes on your command line are not followed by whitespace.");
}
@ -634,4 +695,13 @@ class UnknownEnumeratedValueException extends ArgumentException {
private static String formatArguments(ArgumentDefinition definition, String argumentPassed) {
return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions));
}
}
}
/**
 * Container class to store the list of argument files.
 * The files will be parsed after the command line arguments.
 */
class ParsingEngineArgumentFiles {
    // Zero or more files named via --arg_file/-args; the parsing engine reads each one
    // for additional argument tokens after the command line itself has been parsed.
    @Argument(fullName = "arg_file", shortName = "args", doc = "Reads arguments from the specified file", required = false)
    public List<File> files = new ArrayList<File>();
}

View File

@ -68,7 +68,7 @@ public abstract class ParsingMethod {
* @return An argument match. Definition field will be populated if a match was found or
* empty if no appropriate definition could be found.
*/
public ArgumentMatch match( ArgumentDefinitions definitions, String token, int position ) {
public ArgumentMatch match( ArgumentDefinitions definitions, String token, ArgumentMatchSite position ) {
// If the argument is valid, parse out the argument.
Matcher matcher = pattern.matcher(token);
@ -102,9 +102,7 @@ public abstract class ParsingMethod {
// Try to find a matching argument. If found, label that as the match. If not found, add the argument
// with a null definition.
ArgumentMatch argumentMatch = new ArgumentMatch(argument,argumentDefinition,position,tags);
return argumentMatch;
return new ArgumentMatch(argument,argumentDefinition,position,tags);
}
/**

View File

@ -28,34 +28,30 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.util.*;
@ -92,7 +88,7 @@ public class GenomeAnalysisEngine {
/**
* Accessor for sample metadata
*/
private SampleDataSource sampleDataSource = null;
private SampleDB sampleDB = null;
/**
* Accessor for sharded reference-ordered data.
@ -206,6 +202,9 @@ public class GenomeAnalysisEngine {
// Prepare the data for traversal.
initializeDataSources();
// initialize sampleDB
initializeSampleDB();
// initialize and validate the interval list
initializeIntervals();
validateSuppliedIntervals();
@ -222,12 +221,12 @@ public class GenomeAnalysisEngine {
ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
// execute the microscheduler, storing the results
Object result = microScheduler.execute(this.walker, shardStrategy);
return microScheduler.execute(this.walker, shardStrategy);
//monitor.stop();
//logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));
return result;
//return result;
}
/**
@ -259,13 +258,12 @@ public class GenomeAnalysisEngine {
* @return A collection of available filters.
*/
public Collection<ReadFilter> createFilters() {
Set<ReadFilter> filters = new HashSet<ReadFilter>();
filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager()));
final List<ReadFilter> filters = WalkerManager.getReadFilters(walker,this.getFilterManager());
if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0)
filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList));
for(String filterName: this.getArguments().readFilters)
for(final String filterName: this.getArguments().readFilters)
filters.add(this.getFilterManager().createByName(filterName));
return Collections.unmodifiableSet(filters);
return Collections.unmodifiableList(filters);
}
/**
@ -299,10 +297,14 @@ public class GenomeAnalysisEngine {
else if(WalkerManager.getDownsamplingMethod(walker) != null)
method = WalkerManager.getDownsamplingMethod(walker);
else
method = argCollection.getDefaultDownsamplingMethod();
method = GATKArgumentCollection.getDefaultDownsamplingMethod();
return method;
}
protected void setDownsamplingMethod(DownsamplingMethod method) {
argCollection.setDownsamplingMethod(method);
}
public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); }
public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }
@ -381,18 +383,18 @@ public class GenomeAnalysisEngine {
// If intervals is non-null and empty at this point, it means that the list of intervals to process
// was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since
// this was very likely unintentional, the user should be informed of this. Note that this is different
// from the case where intervals == null, which indicates either that there were no interval arguments,
// or that -L all was specified.
// from the case where intervals == null, which indicates that there were no interval arguments.
if ( intervals != null && intervals.isEmpty() ) {
throw new ArgumentException("The given combination of -L and -XL options results in an empty set. " +
"No intervals to process.");
logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process.");
}
}
/**
* Get the sharding strategy given a driving data source.
*
* @param readsDataSource readsDataSource
* @param drivingDataSource Data on which to shard.
* @param intervals intervals
* @return the sharding strategy
*/
protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
@ -429,7 +431,7 @@ public class GenomeAnalysisEngine {
return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region);
}
ShardStrategy shardStrategy = null;
ShardStrategy shardStrategy;
ShardStrategyFactory.SHATTER_STRATEGY shardType;
long SHARD_SIZE = 100000L;
@ -438,6 +440,8 @@ public class GenomeAnalysisEngine {
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
if (intervals != null && !intervals.isEmpty()) {
if (readsDataSource == null)
throw new IllegalArgumentException("readsDataSource is null");
if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
@ -501,7 +505,8 @@ public class GenomeAnalysisEngine {
*/
private void initializeTempDirectory() {
File tempDir = new File(System.getProperty("java.io.tmpdir"));
tempDir.mkdirs();
if (!tempDir.exists() && !tempDir.mkdirs())
throw new UserException.BadTmpDir("Unable to create directory");
}
/**
@ -566,34 +571,23 @@ public class GenomeAnalysisEngine {
protected void initializeIntervals() {
// return if no interval arguments at all
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null))
if ( argCollection.intervals == null && argCollection.excludeIntervals == null )
return;
// if '-L all' was specified, verify that it was the only -L specified and return if so.
if(argCollection.intervals != null) {
for(String interval: argCollection.intervals) {
if(interval.trim().equals("all")) {
if(argCollection.intervals.size() > 1)
throw new UserException("'-L all' was specified along with other intervals or interval lists; the GATK cannot combine '-L all' with other intervals.");
// '-L all' was specified and seems valid. Return.
return;
}
}
}
// Note that the use of '-L all' is no longer supported.
// if include argument isn't given, create new set of all possible intervals
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
loadIntervals(argCollection.intervals, IntervalUtils.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging)));
loadIntervals(argCollection.intervals, argCollection.intervalSetRule));
// if no exclude arguments, can return parseIntervalArguments directly
if (argCollection.excludeIntervals == null)
if ( argCollection.excludeIntervals == null )
intervals = includeSortedSet;
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, null);
GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, IntervalSetRule.UNION);
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
// logging messages only printed when exclude (-XL) arguments are given
@ -608,47 +602,26 @@ public class GenomeAnalysisEngine {
/**
* Loads the intervals relevant to the current execution
* @param argList String representation of arguments; might include 'all', filenames, intervals in samtools
* notation, or a combination of the above
* @param rodIntervals a list of ROD intervals to add to the returned set. Can be empty or null.
* @param argList argument bindings; might include filenames, intervals in samtools notation, or a combination of the above
* @param rule interval merging rule
* @return A sorted, merged list of all intervals specified in this arg list.
*/
protected GenomeLocSortedSet loadIntervals( List<String> argList, List<GenomeLoc> rodIntervals ) {
protected GenomeLocSortedSet loadIntervals( List<IntervalBinding<Feature>> argList, IntervalSetRule rule ) {
boolean allowEmptyIntervalList = (argCollection.unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST ||
argCollection.unsafe == ValidationExclusion.TYPE.ALL);
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
for ( IntervalBinding intervalBinding : argList ) {
List<GenomeLoc> intervals = intervalBinding.getIntervals(this);
List<GenomeLoc> nonRODIntervals = IntervalUtils.parseIntervalArguments(genomeLocParser, argList, allowEmptyIntervalList);
List<GenomeLoc> allIntervals = IntervalUtils.mergeListsBySetOperator(rodIntervals, nonRODIntervals, argCollection.BTIMergeRule);
if ( intervals.isEmpty() ) {
logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed.");
}
allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, rule);
}
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging);
}
/**
* if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs
* @return ROD intervals as GenomeLocs
*/
private List<GenomeLoc> getRODIntervals() {
Map<String, ReferenceOrderedDataSource> rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources);
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
if (rodNames != null && argCollection.RODToInterval != null) {
String rodName = argCollection.RODToInterval;
// check to make sure we have a rod of that name
if (!rodNames.containsKey(rodName))
throw new UserException.CommandLineException("--rodToIntervalTrackName (-BTI) was passed the name '"+rodName+"', which wasn't given as a ROD name in the -B option");
for (String str : rodNames.keySet())
if (str.equals(rodName)) {
logger.info("Adding interval list from track (ROD) named " + rodName);
RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str));
ret.addAll(intervalGenerator.toGenomeLocList());
}
}
return ret;
}
/**
* Add additional, externally managed IO streams for inputs.
*
@ -692,12 +665,22 @@ public class GenomeAnalysisEngine {
for (ReadFilter filter : filters)
filter.initialize(this);
sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles);
// set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe);
}
/**
 * Entry-point function to initialize the samples database from input data and pedigree arguments
 */
private void initializeSampleDB() {
    SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType);
    // Accumulate samples from every available source: the merged SAM header, the sample
    // names referenced by the ROD tracks, pedigree files, and inline pedigree strings.
    sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader());
    sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this));
    sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles);
    sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings);
    // Finalize (and, per the configured validation type, validate) the assembled database.
    sampleDB = sampleDBBuilder.getFinalSampleDB();
}
/**
* Gets a unique identifier for the reader sourcing this read.
* @param read Read to examine.
@ -716,106 +699,13 @@ public class GenomeAnalysisEngine {
return getReadsDataSource().getSAMFile(id);
}
/**
* Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
* returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names
* found in the corresponding bam file.
*
* @return Sets of samples in the merged input SAM stream, grouped by readers
*/
public List<Set<String>> getSamplesByReaders() {
Collection<SAMReaderID> readers = getReadsDataSource().getReaderIDs();
List<Set<String>> sample_sets = new ArrayList<Set<String>>(readers.size());
for (SAMReaderID r : readers) {
Set<String> samples = new HashSet<String>(1);
sample_sets.add(samples);
for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) {
samples.add(g.getSample());
}
}
return sample_sets;
}
/**
* Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
* returned by this method will contain 3 elements (one for each reader), with each element being a set of library names
* found in the corresponding bam file.
*
* @return Sets of libraries present in the (merged) input SAM stream, grouped by readers
*/
public List<Set<String>> getLibrariesByReaders() {
Collection<SAMReaderID> readers = getReadsDataSource().getReaderIDs();
List<Set<String>> lib_sets = new ArrayList<Set<String>>(readers.size());
for (SAMReaderID r : readers) {
Set<String> libs = new HashSet<String>(2);
lib_sets.add(libs);
for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) {
libs.add(g.getLibrary());
}
}
return lib_sets;
}
/**
* **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD ****
*
* Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
* returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups
* (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file.
*
* @return sets of (merged) read group ids in order of input bams
*/
public List<Set<String>> getMergedReadGroupsByReaders() {
Collection<SAMReaderID> readers = getReadsDataSource().getReaderIDs();
List<Set<String>> rg_sets = new ArrayList<Set<String>>(readers.size());
for (SAMReaderID r : readers) {
Set<String> groups = new HashSet<String>(5);
rg_sets.add(groups);
for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) {
if (getReadsDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
// use HeaderMerger to translate original read group id from the reader into the read group id in the
// merged stream, and save that remapped read group id to associate it with specific reader
groups.add(getReadsDataSource().getReadGroupId(r, g.getReadGroupId()));
} else {
// otherwise, pass through the unmapped read groups since this is what Picard does as well
groups.add(g.getReadGroupId());
}
}
}
return rg_sets;
}
/**
* Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available).
*
* @param reads Reads data source.
* @param reference Reference data source.
* @param rods a collection of the reference ordered data tracks
* @param manager manager
*/
private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, RMDTrackBuilder manager) {
if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null )
@ -844,15 +734,22 @@ public class GenomeAnalysisEngine {
/**
* Gets a data source for the given set of reads.
*
* @param argCollection arguments
* @param genomeLocParser parser
* @param refReader reader
* @return A data source for the given set of reads.
*/
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
DownsamplingMethod method = getDownsamplingMethod();
// Synchronize the method back into the collection so that it shows up when
// interrogating for the downsample method during command line recreation.
setDownsamplingMethod(method);
if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");
SAMDataSource dataSource = new SAMDataSource(
return new SAMDataSource(
samReaderIDs,
genomeLocParser,
argCollection.useOriginalBaseQualities,
@ -868,14 +765,12 @@ public class GenomeAnalysisEngine {
refReader,
argCollection.defaultBaseQualities,
!argCollection.disableLowMemorySharding);
return dataSource;
}
/**
* Opens a reference sequence file paired with an index. Only public for testing purposes
*
* @param refFile Handle to a reference sequence file. Non-null.
* @return A thread-safe file wrapper.
*/
public void setReferenceDataSource(File refFile) {
this.referenceDataSource = new ReferenceDataSource(refFile);
@ -929,6 +824,26 @@ public class GenomeAnalysisEngine {
return readsDataSource.getHeader(reader);
}
/**
 * Returns an ordered list of the unmerged SAM file headers known to this engine.
 * @return list of header for each input SAM file, in command line order
 */
public List<SAMFileHeader> getSAMFileHeaders() {
    final SAMDataSource readsSource = getReadsDataSource();
    final List<SAMFileHeader> unmergedHeaders = new ArrayList<SAMFileHeader>();
    for ( final SAMReaderID readerID : readsSource.getReaderIDs() ) {
        unmergedHeaders.add(readsSource.getHeader(readerID));
    }
    return unmergedHeaders;
}
/**
 * Gets the master sequence dictionary for this GATK engine instance
 * @return a never-null dictionary listing all of the contigs known to this engine instance
 */
public SAMSequenceDictionary getMasterSequenceDictionary() {
    // Delegates to the reference data source: the reference's sequence dictionary is the
    // authoritative contig list for this engine instance.
    return getReferenceDataSource().getReference().getSequenceDictionary();
}
/**
* Returns data source object encapsulating all essential info and handlers used to traverse
* reads; header merger, individual file readers etc can be accessed through the returned data source object.
@ -939,8 +854,6 @@ public class GenomeAnalysisEngine {
return this.readsDataSource;
}
/**
* Sets the collection of GATK main application arguments.
*
@ -1027,140 +940,14 @@ public class GenomeAnalysisEngine {
return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
}
public SampleDataSource getSampleMetadata() {
return this.sampleDataSource;
}
// -------------------------------------------------------------------------------------
//
// code for working with Samples database
//
// -------------------------------------------------------------------------------------
/**
* Get a sample by its ID
* If an alias is passed in, return the main sample object
* @param id sample id
* @return sample Object with this ID
*/
public Sample getSampleById(String id) {
return sampleDataSource.getSampleById(id);
}
/**
* Get the sample for a given read group
* Must first look up ID for read group
* @param readGroup of sample
* @return sample object with ID from the read group
*/
public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
return sampleDataSource.getSampleByReadGroup(readGroup);
}
/**
* Get a sample for a given read
* Must first look up read group, and then sample ID for that read group
* @param read of sample
* @return sample object of this read
*/
public Sample getSampleByRead(SAMRecord read) {
return getSampleByReadGroup(read.getReadGroup());
}
/**
* Get number of sample objects
* @return size of samples map
*/
public int sampleCount() {
return sampleDataSource.sampleCount();
}
/**
* Return all samples with a given family ID
* Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this
* @param familyId family ID
* @return Samples with the given family ID
*/
public Set<Sample> getFamily(String familyId) {
return sampleDataSource.getFamily(familyId);
}
/**
* Returns all children of a given sample
* See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient
* @param sample parent sample
* @return children of the given sample
*/
public Set<Sample> getChildren(Sample sample) {
return sampleDataSource.getChildren(sample);
}
/**
* Gets all the samples
* @return
*/
public Collection<Sample> getSamples() {
return sampleDataSource.getSamples();
}
/**
* Takes a list of sample names and returns their corresponding sample objects
*
* @param sampleNameList List of sample names
* @return Corresponding set of samples
*/
public Set<Sample> getSamples(Collection<String> sampleNameList) {
return sampleDataSource.getSamples(sampleNameList);
}
/**
* Returns a set of samples that have any value (which could be null) for a given property
* @param key Property key
* @return Set of samples with the property
*/
public Set<Sample> getSamplesWithProperty(String key) {
return sampleDataSource.getSamplesWithProperty(key);
}
/**
* Returns a set of samples that have a property with a certain value
* Value must be a string for now - could add a similar method for matching any objects in the future
*
* @param key Property key
* @param value String property value
* @return Set of samples that match key and value
*/
public Set<Sample> getSamplesWithProperty(String key, String value) {
return sampleDataSource.getSamplesWithProperty(key, value);
}
/**
* Returns a set of sample objects for the sample names in a variant context
*
* @param context Any variant context
* @return a set of the sample objects
*/
public Set<Sample> getSamplesByVariantContext(VariantContext context) {
Set<Sample> samples = new HashSet<Sample>();
for (String sampleName : context.getSampleNames()) {
samples.add(sampleDataSource.getOrCreateSample(sampleName));
}
return samples;
}
/**
* Returns all samples that were referenced in the SAM file
*/
public Set<Sample> getSAMFileSamples() {
return sampleDataSource.getSAMFileSamples();
}
/**
* Return a subcontext restricted to samples with a given property key/value
* Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering
* @param context VariantContext to filter
* @param key property key
* @param value property value (must be string)
* @return subcontext
*/
public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
return sampleDataSource.subContextFromSampleProperty(context, key, value);
public SampleDB getSampleDB() {
return this.sampleDB;
}
public Map<String,String> getApproximateCommandLineArguments(Object... argumentProviders) {

View File

@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
/**
* Holds a bunch of basic information about the traversal.
@ -102,8 +103,12 @@ public class ReadMetrics implements Cloneable {
counter.put(filter.getClass(), c + 1L);
}
public Map<Class,Long> getCountsByFilter() {
return Collections.unmodifiableMap(counter);
public Map<String,Long> getCountsByFilter() {
final TreeMap<String, Long> sortedCounts = new TreeMap<String, Long>();
for(Map.Entry<Class,Long> counterEntry: counter.entrySet()) {
sortedCounts.put(counterEntry.getKey().getSimpleName(),counterEntry.getValue());
}
return sortedCounts;
}
/**

View File

@ -29,13 +29,11 @@ package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.simpleframework.xml.*;
/**
* @author ebanks
* @version 1.0
*/
@Root
public class DbsnpArgumentCollection {
/**

View File

@ -26,34 +26,26 @@
package org.broadinstitute.sting.gatk.arguments;
import net.sf.samtools.SAMFileReader;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.simpleframework.xml.*;
import org.simpleframework.xml.core.Persister;
import org.simpleframework.xml.stream.Format;
import org.simpleframework.xml.stream.HyphenStyle;
import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
* @author aaron
* @version 1.0
*/
@Root
public class GATKArgumentCollection {
/* our version number */
@ -64,58 +56,58 @@ public class GATKArgumentCollection {
public GATKArgumentCollection() {
}
@ElementMap(entry = "analysis_argument", key = "key", attribute = true, inline = true, required = false)
public Map<String, String> walkerArgs = new HashMap<String, String>();
// parameters and their defaults
@ElementList(required = false)
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false)
public List<String> samFiles = new ArrayList<String>();
// parameters and their defaults
@ElementList(required = false)
@Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false)
public List<File> sampleFiles = new ArrayList<File>();
@Element(required = false)
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null;
@Element(required = false)
@Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? Standard is the default, can be verbose or NO_ET so nothing is posted to the run repository", required = false)
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
@ElementList(required = false)
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually.", required = false)
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
public List<String> readFilters = new ArrayList<String>();
@ElementList(required = false)
@Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.", required = false)
public List<String> intervals = null;
/**
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
*/
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> intervals = null;
@ElementList(required = false)
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "A list of genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file.", required = false)
public List<String> excludeIntervals = null;
/**
* Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
* Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
*/
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> excludeIntervals = null;
/**
* How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
* for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION).
*/
@Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false)
public IntervalSetRule intervalSetRule = IntervalSetRule.UNION;
/**
* Should abutting (but not overlapping) intervals be treated as separate intervals?
*/
@Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
@Element(required = false)
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
@Deprecated
@Hidden
@ElementList(required = false)
@Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :<name>,<type> <file>", required = false)
public ArrayList<String> RODBindings = new ArrayList<String>();
@Element(required = false)
@Argument(fullName = "rodToIntervalTrackName", shortName = "BTI", doc = "Indicates that the named track should be converted into an interval list, to drive the traversal", required = false)
public String RODToInterval = null;
@Element(required = false)
@Argument(fullName = "BTI_merge_rule", shortName = "BTIMR", doc = "Indicates the merging approach the interval parser should use to combine the BTI track with other -L options", required = false)
public IntervalSetRule BTIMergeRule = IntervalSetRule.UNION;
@Element(required = false)
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
@ -128,22 +120,19 @@ public class GATKArgumentCollection {
private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000;
@Element(required = false)
@Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here.", required = false)
@Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
public DownsampleType downsamplingType = null;
@Element(required = false)
@Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
public Double downsampleFraction = null;
@Element(required = false)
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
public Integer downsampleCoverage = null;
/**
* Gets the downsampling method explicitly specified by the user. If the user didn't specify
* a default downsampling mechanism, return null.
* @return The explicitly specified downsampling mechanism, or null if none exists.
* a default downsampling mechanism, return the default.
* @return The explicitly specified downsampling mechanism, or the default if none exists.
*/
public DownsamplingMethod getDownsamplingMethod() {
if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null)
@ -153,16 +142,26 @@ public class GATKArgumentCollection {
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
}
/**
* Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments.
* @param method The downsampling mechanism.
*/
public void setDownsamplingMethod(DownsamplingMethod method) {
if (method == null)
throw new IllegalArgumentException("method is null");
downsamplingType = method.type;
downsampleCoverage = method.toCoverage;
downsampleFraction = method.toFraction;
}
// --------------------------------------------------------------------------------------------------------------
//
// BAQ arguments
//
// --------------------------------------------------------------------------------------------------------------
@Element(required = false)
@Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false)
public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF;
@Element(required = false)
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
public double BAQGOP = BAQ.DEFAULT_GOP;
@ -171,7 +170,6 @@ public class GATKArgumentCollection {
// performance log arguments
//
// --------------------------------------------------------------------------------------------------------------
@Element(required = false)
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
public File performanceLog = null;
@ -184,67 +182,117 @@ public class GATKArgumentCollection {
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
}
@Element(required = false)
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
public Boolean useOriginalBaseQualities = false;
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1;
@Element(required = false)
@Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false)
public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT;
@Element(required = false)
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe;
/** How many threads should be allocated to this analysis. */
@Element(required = false)
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false)
public int numberOfThreads = 1;
/** What rule should we use when merging intervals */
@Element(required = false)
@Argument(fullName = "interval_merging", shortName = "im", doc = "What interval merging rule should we use.", required = false)
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
@ElementList(required = false)
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line.", required = false)
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line", required = false)
public List<String> readGroupBlackList = null;
// --------------------------------------------------------------------------------------------------------------
//
// distributed GATK arguments
// PED (pedigree) support
//
// --------------------------------------------------------------------------------------------------------------
@Element(required=false)
@Argument(fullName="processingTracker",shortName="C",doc="A lockable, shared file for coordinating distributed GATK runs",required=false)
@Hidden
public File processingTrackerFile = null;
@Element(required=false)
@Argument(fullName="restartProcessingTracker",shortName="RPT",doc="Should we delete the processing tracker file at startup?",required=false)
@Hidden
public boolean restartProcessingTracker = false;
/**
* <p>Reads PED file-formatted tabular text files describing meta-data about the samples being
* processed in the GATK.</p>
*
* <ul>
* <li>see <a href="http://www.broadinstitute.org/mpg/tagger/faq.html">http://www.broadinstitute.org/mpg/tagger/faq.html</a></li>
* <li>see <a href="http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped">http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped</a></li>
* </ul>
*
* <p>The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:</p>
*
* <ul>
* <li>Family ID</li>
* <li>Individual ID</li>
* <li>Paternal ID</li>
* <li>Maternal ID</li>
* <li>Sex (1=male; 2=female; other=unknown)</li>
* <li>Phenotype</li>
* </ul>
*
* <p>The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person.
* A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a
* quantitative trait or an affection status column: GATK will automatically detect which type
* (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).</p>
*
* <p>If an individual's sex is unknown, then any character other than 1 or 2 can be used.</p>
*
* <p>You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that
* line will be ignored. Do not start any family IDs with this character therefore.</p>
*
* <p>Affection status should be coded:</p>
*
* <ul>
* <li>-9 missing</li>
* <li>0 missing</li>
* <li>1 unaffected</li>
* <li>2 affected</li>
* </ul>
*
* <p>If any value outside of -9,0,1,2 is detected than the samples are assumed
* to phenotype values are interpreted as string phenotype values. In this case -9 uniquely
* represents the missing value.</p>
*
* <p>Genotypes (column 7 onwards) cannot be specified to the GATK.</p>
*
* <p>For example, here are two individuals (one row = one person):</p>
*
* <pre>
* FAM001 1 0 0 1 2
* FAM001 2 0 0 1 2
* </pre>
*
* <p>Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to
* tell the GATK PED parser that the corresponding fields are missing from the ped file.</p>
*
* <p>Note that most GATK walkers do not use pedigree information. Walkers that require pedigree
* data should clearly indicate so in their arguments and will throw errors if required pedigree
* information is missing.</p>
*/
@Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false)
public List<File> pedigreeFiles = Collections.emptyList();
@Element(required=false)
@Argument(fullName="processingTrackerStatusFile",shortName="CSF",doc="If provided, a detailed accounting of the state of the process tracker is written to this file. For debugging, only",required=false)
@Hidden
public File processingTrackerStatusFile = null;
/**
* Inline PED records (see -ped argument). Each -pedString STRING can contain one or more
* valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString
* as -ped supports
*/
@Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false)
public List<String> pedigreeStrings = Collections.emptyList();
@Element(required=false)
@Argument(fullName="processingTrackerID",shortName="CID",doc="If provided, an integer ID (starting at 1) indicating a unique id for this process within the distributed GATK group",required=false)
@Hidden
public int processTrackerID = -1;
/**
* How strict should we be in parsing the PED files?
*/
@Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false)
public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT;
// --------------------------------------------------------------------------------------------------------------
//
// BAM indexing and sharding arguments
//
// --------------------------------------------------------------------------------------------------------------
@Element(required = false)
@Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. Use at your own risk.",required=false)
@Hidden
public boolean allowIntervalsWithUnindexedBAM = false;
@Element(required = false)
@Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality.",required=false)
@Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
public boolean disableLowMemorySharding = false;
// --------------------------------------------------------------------------------------------------------------
@ -253,69 +301,6 @@ public class GATKArgumentCollection {
//
// --------------------------------------------------------------------------------------------------------------
/**
* marshal the data out to a object
*
* @param collection the GATKArgumentCollection to load into
* @param outputFile the file to write to
*/
public static void marshal(GATKArgumentCollection collection, String outputFile) {
Serializer serializer = new Persister(new Format(new HyphenStyle()));
File result = new File(outputFile);
try {
serializer.write(collection, result);
} catch (Exception e) {
throw new ReviewedStingException("Failed to marshal the data to the file " + outputFile, e);
}
}
/**
* marshal the data out to a object
*
* @param collection the GATKArgumentCollection to load into
* @param outputFile the stream to write to
*/
public static void marshal(GATKArgumentCollection collection, PrintStream outputFile) {
Serializer serializer = new Persister(new Format(new HyphenStyle()));
try {
serializer.write(collection, outputFile);
} catch (Exception e) {
throw new ReviewedStingException("Failed to marshal the data to the file " + outputFile, e);
}
}
/**
* unmashall the object from a configuration file
*
* @param filename the filename to marshal from
*/
public static GATKArgumentCollection unmarshal(String filename) {
Serializer serializer = new Persister(new Format(new HyphenStyle()));
File source = new File(filename);
try {
GATKArgumentCollection example = serializer.read(GATKArgumentCollection.class, source);
return example;
} catch (Exception e) {
throw new ReviewedStingException("Failed to marshal the data from file " + filename, e);
}
}
/**
* unmashall the object from a configuration file
*
* @param file the inputstream to marshal from
*/
public static GATKArgumentCollection unmarshal(InputStream file) {
Serializer serializer = new Persister(new Format(new HyphenStyle()));
try {
GATKArgumentCollection example = serializer.read(GATKArgumentCollection.class, file);
return example;
} catch (Exception e) {
throw new ReviewedStingException("Failed to marshal the data from file " + file.toString(), e);
}
}
/**
* test equality between two arg collections. This function defines the statement:
* "not fun to write"
@ -363,7 +348,7 @@ public class GATKArgumentCollection {
if (!other.referenceFile.equals(this.referenceFile)) {
return false;
}
if (!other.intervals.equals(this.intervals)) {
if ((other.intervals == null && this.intervals != null) || !other.intervals.equals(this.intervals)) {
return false;
}
if (!other.excludeIntervals.equals(this.excludeIntervals)) {
@ -386,39 +371,21 @@ public class GATKArgumentCollection {
if (other.intervalMerging != this.intervalMerging) {
return false;
}
if ((other.RODToInterval == null && RODToInterval != null) ||
(other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) {
return false;
}
if (other.phoneHomeType != this.phoneHomeType) {
return false;
}
if (BTIMergeRule != other.BTIMergeRule)
if (intervalSetRule != other.intervalSetRule)
return false;
if ( BAQMode != other.BAQMode) return false;
if ( BAQMode != other.BAQMode ) return false;
if ( BAQGOP != other.BAQGOP ) return false;
if ((other.performanceLog == null && this.performanceLog != null) ||
(other.performanceLog != null && !other.performanceLog.equals(this.performanceLog)))
return false;
if ((other.processingTrackerFile == null && this.processingTrackerFile != null) ||
(other.processingTrackerFile != null && !other.processingTrackerFile.equals(this.processingTrackerFile)))
return false;
if ((other.processingTrackerStatusFile == null && this.processingTrackerStatusFile != null) ||
(other.processingTrackerStatusFile != null && !other.processingTrackerStatusFile.equals(this.processingTrackerStatusFile)))
return false;
if ( restartProcessingTracker != other.restartProcessingTracker )
return false;
if ( processTrackerID != other.processTrackerID )
return false;
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
return false;

View File

@ -28,13 +28,11 @@ package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.simpleframework.xml.Root;
/**
* @author ebanks
* @version 1.0
*/
@Root
public class StandardVariantContextInputArgumentCollection {
/**

View File

@ -37,7 +37,6 @@ public class ValidationExclusion {
public enum TYPE {
ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard
ALLOW_EMPTY_INTERVAL_LIST, // allow the user to pass in an empty interval list
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities

View File

@ -25,12 +25,12 @@
package org.broadinstitute.sting.gatk.contexts;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.List;
@ -130,7 +130,7 @@ public class AlignmentContext implements HasGenomeLocation {
*/
@Deprecated
//todo: unsafe and tailored for current usage only; both pileups can be null or worse, bot can be not null in theory
public List<SAMRecord> getReads() { return ( basePileup.getReads() ); }
public List<GATKSAMRecord> getReads() { return ( basePileup.getReads() ); }
/**
* Are there any reads associated with this locus?
@ -138,7 +138,7 @@ public class AlignmentContext implements HasGenomeLocation {
* @return
*/
public boolean hasReads() {
return basePileup != null && basePileup.size() > 0 ;
return basePileup != null && basePileup.getNumberOfElements() > 0 ;
}
/**
@ -146,7 +146,7 @@ public class AlignmentContext implements HasGenomeLocation {
* @return
*/
public int size() {
return basePileup.size();
return basePileup.getNumberOfElements();
}
/**

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.contexts;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -76,14 +75,6 @@ public class AlignmentContextUtils {
return splitContextBySampleName(context, null);
}
public static Map<Sample, AlignmentContext> splitContextBySample(AlignmentContext context) {
Map<Sample, AlignmentContext> m = new HashMap<Sample, AlignmentContext>();
for ( Map.Entry<String, AlignmentContext> entry : splitContextBySampleName(context, null).entrySet() ) {
m.put(new Sample(entry.getKey()), entry.getValue());
}
return m;
}
/**
* Splits the given AlignmentContext into a StratifiedAlignmentContext per sample, but referencd by sample name instead
* of sample object.
@ -97,11 +88,11 @@ public class AlignmentContextUtils {
GenomeLoc loc = context.getLocation();
HashMap<String, AlignmentContext> contexts = new HashMap<String, AlignmentContext>();
for(String sample: context.getPileup().getSampleNames()) {
ReadBackedPileup pileupBySample = context.getPileup().getPileupForSampleName(sample);
for(String sample: context.getPileup().getSamples()) {
ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample);
// Don't add empty pileups to the split context.
if(pileupBySample.size() == 0)
if(pileupBySample.getNumberOfElements() == 0)
continue;
if(sample != null)

View File

@ -1,10 +1,10 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.iterators.GenomeLocusIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Collections;
import java.util.List;
@ -132,7 +132,7 @@ public class AllLocusView extends LocusView {
* @param site Site at which to create the blank locus context.
* @return empty context.
*/
private final static List<SAMRecord> EMPTY_PILEUP_READS = Collections.emptyList();
private final static List<GATKSAMRecord> EMPTY_PILEUP_READS = Collections.emptyList();
private final static List<Integer> EMPTY_PILEUP_OFFSETS = Collections.emptyList();
private AlignmentContext createEmptyLocus( GenomeLoc site ) {
return new AlignmentContext(site,new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS));

View File

@ -59,7 +59,7 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
*/
public FilePointer next() {
FilePointer current = wrappedIterator.next();
while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
current = current.combine(parser,wrappedIterator.next());
return current;
}

View File

@ -43,6 +43,7 @@ import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
import java.io.File;
import java.lang.reflect.InvocationTargetException;
@ -57,6 +58,8 @@ import java.util.*;
* Converts shards to SAM iterators over the specified region
*/
public class SAMDataSource {
final private static GATKSamRecordFactory factory = new GATKSamRecordFactory();
/** Backing support for reads. */
protected final ReadProperties readProperties;
@ -235,6 +238,12 @@ public class SAMDataSource {
for(SAMFileReader reader: readers.values()) {
// Get the sort order, forcing it to coordinate if unsorted.
SAMFileHeader header = reader.getFileHeader();
if ( header.getReadGroups().isEmpty() ) {
throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
"SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
}
SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate;
// Validate that all input files are sorted in the same order.
@ -638,7 +647,9 @@ public class SAMDataSource {
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
byte defaultBaseQualities) {
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
if ( useOriginalBaseQualities || defaultBaseQualities >= 0 )
// only wrap if we are replacing the original qualities or using a default base quality
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end up throwing away
@ -750,6 +761,7 @@ public class SAMDataSource {
public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
for(SAMReaderID readerID: readerIDs) {
SAMFileReader reader = new SAMFileReader(readerID.samFile);
reader.setSAMRecordFactory(factory);
reader.enableFileSource(true);
reader.enableIndexMemoryMapping(false);
if(!enableLowMemorySharding)

View File

@ -97,7 +97,7 @@ public class FindLargeShards extends CommandLineProgram {
// intervals
GenomeLocSortedSet intervalSortedSet = null;
if(intervals != null)
intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals, true), IntervalMergingRule.ALL);
intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL);
else {
intervalSortedSet = new GenomeLocSortedSet(genomeLocParser);
for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences())

View File

@ -1,30 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.sample;
/**
* Created by IntelliJ IDEA.
* User: brett
* Date: Aug 12, 2010
* Time: 2:09:16 PM
*/
/**
 * Bean describing the set of legal values for a single sample property,
 * as declared in a sample metadata file. Instances are populated
 * reflectively by the YAML loader, so the mutable bean shape is required.
 */
public class PropertyDefinition {

    /** Name of the property being constrained. */
    String property;

    /** The complete list of values this property may take. */
    String[] values;

    /** @return the name of the constrained property. */
    public String getProperty() {
        return this.property;
    }

    /** @return every value the property is permitted to hold. */
    public String[] getValues() {
        return this.values;
    }

    /** @param property name of the property to constrain. */
    public void setProperty(String property) {
        this.property = property;
    }

    /** @param values the allowed values for the property. */
    public void setValues(String[] values) {
        this.values = values;
    }
}

View File

@ -1,203 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.sample;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: brett
* Date: Jul 26, 2010
* Time: 3:31:38 PM
*/
/**
 * A single sample and its associated metadata: arbitrary key/value properties
 * (e.g. "gender", "population") and relationships to other samples
 * (e.g. "mother", "father").
 */
public class Sample implements java.io.Serializable {

    /** Unique identifier for this sample; may be null (placeholder sample). */
    private final String id;

    /** True if this sample has an entry in a sample metadata file. */
    private boolean hasSampleFileEntry = false;

    /** True if this sample has an entry in the SAM file. */
    private boolean hasSAMFileEntry = false;

    /** Arbitrary key -> value metadata for this sample. */
    private HashMap<String, Object> properties = new HashMap<String, Object>();

    /** Relationship name -> related sample (e.g. "mother" -> mother's Sample). */
    private HashMap<String, Sample> relationships = new HashMap<String, Sample>();

    public enum Gender {
        MALE,
        FEMALE,
        UNKNOWN
    }

    /**
     * @param id sample identifier; null is deliberately permitted — the data
     *           source registers a placeholder sample under the null id.
     */
    public Sample(String id) {
        this.id = id;
    }

    /** @return this sample's unique id (may be null for the placeholder sample). */
    public String getId() {
        return this.id;
    }

    /**
     * @return the live (mutable) property map for this sample.
     */
    public Map<String, Object> getProperties() {
        return properties;
    }

    /**
     * Replaces all properties with a copy of the given map.
     * Copying (rather than the previous raw {@code (HashMap)} cast) avoids a
     * ClassCastException when a non-HashMap Map implementation is passed in.
     *
     * @param properties new property map; entries are copied
     */
    public void setProperties(Map<String, Object> properties) {
        this.properties = new HashMap<String, Object>(properties);
    }

    /** @return an unmodifiable view of this sample's relationships. */
    public Map<String,Sample> getRelationships() {
        return Collections.unmodifiableMap(this.relationships);
    }

    /** @param value whether this sample appears in a sample metadata file */
    public void setSampleFileEntry(boolean value) {
        this.hasSampleFileEntry = value;
    }

    /** @return true if this sample has an entry in the SAM file */
    public boolean hasSAMFileEntry() {
        return this.hasSAMFileEntry;
    }

    /** @param value whether this sample appears in the SAM file */
    public void setSAMFileEntry(boolean value) {
        this.hasSAMFileEntry = value;
    }

    /** @return true if this sample has an entry in a sample metadata file */
    public boolean hasSampleFileEntry() {
        return this.hasSampleFileEntry;
    }

    /**
     * Get one property
     * @param key key of property
     * @return value of property as generic object, or null if not set
     */
    public Object getProperty(String key) {
        return properties.get(key);
    }

    /**
     * Set a property.
     * If the property already exists, it is overwritten.
     *
     * @param key key of property
     * @param value object to be stored in the properties map
     * @throws StingException if the key collides with a relationship, or a
     *         special property ("gender", "population") has the wrong type
     */
    public void setProperty(String key, Object value) {
        if (relationships.containsKey(key)) {
            throw new StingException("The same key cannot exist as a property and a relationship");
        }
        if (key.equals("gender") && value.getClass() != Gender.class) {
            throw new StingException("'gender' property must be of type Sample.Gender");
        }
        if (key.equals("population") && value.getClass() != String.class) {
            throw new StingException("'population' property must be of type String");
        }
        properties.put(key, value);
    }

    /**
     * Get one relationship
     * @param key of relationship
     * @return Sample object that this relationship points to, or null
     */
    public Sample getRelationship(String key) {
        return relationships.get(key);
    }

    /**
     * Set one relationship.
     * If already set, it is overwritten.
     *
     * @param key key of the relationship
     * @param value Sample object this relationship points to
     * @throws StingException if the key collides with an existing property
     */
    public void setRelationship(String key, Sample value) {
        if (properties.containsKey(key)) {
            throw new StingException("The same key cannot exist as a property and a relationship");
        }
        relationships.put(key, value);
    }

    /**
     * Get the sample's mother
     * @return sample object with relationship "mother", if it exists, or null
     */
    public Sample getMother() {
        return getRelationship("mother");
    }

    /**
     * Get the sample's father
     * @return sample object with relationship "father", if it exists, or null
     */
    public Sample getFather() {
        return getRelationship("father");
    }

    /**
     * Get gender of the sample
     * @return property of key "gender" - must be of type Gender
     */
    public Gender getGender() {
        return (Gender) properties.get("gender");
    }

    /** @return the "population" property, or null if unset */
    public String getPopulation() {
        return (String) properties.get("population");
    }

    /** @return the "familyId" property, or null if unset */
    public String getFamilyId() {
        return (String) properties.get("familyId");
    }

    /**
     * @return True if sample is male, false if female, unknown, or null
     */
    public boolean isMale() {
        return properties.get("gender") == Gender.MALE;
    }

    /**
     * @return True if sample is female, false if male, unknown or null
     */
    public boolean isFemale() {
        // Bug fix: this previously compared against Gender.MALE, so it could
        // never return true for a female sample.
        return properties.get("gender") == Gender.FEMALE;
    }

    /**
     * @param key property key
     * @return true if sample has this property (even if its value is null)
     */
    public boolean hasProperty(String key) {
        return properties.containsKey(key);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        Sample sample = (Sample) o;
        if (hasSAMFileEntry != sample.hasSAMFileEntry) return false;
        if (hasSampleFileEntry != sample.hasSampleFileEntry) return false;
        if (id != null ? !id.equals(sample.id) : sample.id != null) return false;
        if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false;
        if (relationships != null ? !relationships.equals(sample.relationships) : sample.relationships != null)
            return false;
        return true;
    }

    @Override
    public int hashCode() {
        // "".hashCode() is 0; kept for byte-compatibility with prior behavior.
        return id != null ? id.hashCode() : "".hashCode();
    }
}

View File

@ -1,31 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.sample;
/**
* Created by IntelliJ IDEA.
* User: brett
* Date: Aug 13, 2010
* Time: 5:13:46 PM
*/
public class SampleAlias {
String mainId;
String[] otherIds;
public String getMainId() {
return mainId;
}
public void setMainId(String mainId) {
this.mainId = mainId;
}
public String[] getOtherIds() {
return otherIds;
}
public void setOtherIds(String[] otherIds) {
this.otherIds = otherIds;
}
}

View File

@ -1,590 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.sample;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.yaml.snakeyaml.TypeDescription;
import org.yaml.snakeyaml.Yaml;
import org.yaml.snakeyaml.constructor.Constructor;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: brett
* Date: Jul 26, 2010
* Time: 3:30:09 PM
*
* This class stores and manages sample metadata. This data is encoded in a sample file, which can be included
* in the GATK by the "--samples" argument. This class reads and parses those files.
*
* Although there are a set of public methods for accessing sample data, they aren't used by walkers - they are really
* only used by GenomeAnalysisEngine. An instance of GenomeAnalysisEngine has one SampleDataSource. When a walker
* wants to access sample data, it asks GenomeAnalysis to fetch this data from its SampleDataSource.
*
*/
public class SampleDataSource {
    /**
     * SAMFileHeader that has been created for this analysis.
     */
    private SAMFileHeader header;

    /**
     * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so
     * this is stored as a HashMap.
     */
    private final HashMap<String, Sample> samples = new HashMap<String, Sample>();

    /**
     * Maps ID => primary ID for all sample ID strings - both primary IDs and aliases.
     * NOTE(review): aliases are validated by validateAliases() but this map is never
     * populated from the parsed files, so aliases currently have no effect - confirm intent.
     */
    private HashMap<String, String> sampleAliases = new HashMap<String, String>();

    /**
     * While loading sample files, we must be aware of "special" properties and relationships that are always allowed
     */
    public static final String[] specialProperties = new String[] {"familyId", "population", "gender"};

    public static final String[] specialRelationships = new String[] {"mother", "father"};

    /**
     * Constructor takes both a SAM header and sample files because the two must be integrated.
     * @param header SAMFileHeader that has been created for this analysis
     * @param sampleFiles Sample files that were included on the command line (may be null)
     */
    public SampleDataSource(SAMFileHeader header, List<File> sampleFiles) {
        this();
        this.header = header;

        // create an empty sample object for each sample referenced in the SAM header
        for (String sampleName : SampleUtils.getSAMFileSamples(header)) {
            if (!hasSample(sampleName)) {
                Sample newSample = new Sample(sampleName);
                newSample.setSAMFileEntry(true);
                samples.put(sampleName, newSample);
            }
        }

        // add files consecutively
        if (sampleFiles != null) {
            for (File file : sampleFiles) {
                addFile(file);
            }
        }
    }

    public SampleDataSource() {
        // placeholder sample for reads/genotypes that carry no sample name
        samples.put(null, new Sample(null));
    }

    /**
     * Hallucinates sample objects for all the samples in the SAM file and stores them
     */
    public void addSamplesFromSAMHeader(SAMFileHeader header) {
        for (String sampleName : SampleUtils.getSAMFileSamples(header)) {
            if (!hasSample(sampleName)) {
                Sample newSample = new Sample(sampleName);
                newSample.setSAMFileEntry(true);
                samples.put(sampleName, newSample);
            }
        }
    }

    /**
     * Parse one sample file and integrate it with samples that are already there.
     * Fails quickly if any errors are found in the file.
     *
     * @param sampleFile YAML sample metadata file to load
     * @throws StingException on I/O errors, YAML syntax errors, or any
     *         property/relationship that is disallowed or conflicting
     */
    public void addFile(File sampleFile) {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new FileReader(sampleFile));
        }
        catch (IOException e) {
            throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e);
        }

        // set up YAML reader - a "Constructor" creates a java object from YAML and a "Loader" loads the file
        Constructor con = new Constructor(SampleFileParser.class);
        TypeDescription desc = new TypeDescription(SampleFileParser.class);
        desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class);
        desc.putListPropertyType("sampleAliases", SampleAlias.class);
        con.addTypeDescription(desc);
        Yaml yaml = new Yaml(con);

        // SampleFileParser stores an object representation of a sample file - this is what we'll parse
        SampleFileParser parser;
        try {
            parser = (SampleFileParser) yaml.load(reader);
        }
        catch (Exception e) {
            throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e);
        }
        finally {
            // Bug fix: the reader was previously never closed (file handle leak).
            try {
                reader.close();
            }
            catch (IOException e) {
                // best effort: the file has already been fully read (or failed to parse)
            }
        }

        // check to see which validation options were built into the file
        boolean restrictProperties = parser.getAllowedProperties() != null;
        boolean restrictRelationships = parser.getAllowedRelationships() != null;
        boolean restrictPropertyValues = parser.getPropertyDefinitions() != null;

        // propertyValues stores the values that are allowed for a given property
        HashMap<String, HashSet> propertyValues = null;
        if (restrictPropertyValues) {
            propertyValues = new HashMap<String, HashSet>();
            for (PropertyDefinition def : parser.getPropertyDefinitions()) {
                HashSet<String> set = new HashSet<String>(Arrays.asList(def.getValues()));
                propertyValues.put(def.getProperty(), set);
            }
        }

        // make sure the aliases are valid
        validateAliases(parser);

        // loop through each sample in the file - a SampleParser stores an object that will become a Sample
        for (SampleParser sampleParser : parser.getSamples()) {
            try {
                // step 1: add the sample if it doesn't already exist
                Sample sample = getSampleById(sampleParser.getId());
                if (sample == null) {
                    sample = new Sample(sampleParser.getId());
                }
                addSample(sample);
                sample.setSampleFileEntry(true);

                // step 2: add the properties
                if (sampleParser.getProperties() != null) {
                    for (String property : sampleParser.getProperties().keySet()) {
                        // check that the property is allowed
                        if (restrictProperties && !isPropertyValid(property, parser.getAllowedProperties())) {
                            throw new StingException(property + " is an invalid property. It is not included in the list " +
                                    "of allowed properties.");
                        }
                        // next check that the value is allowed
                        if (restrictPropertyValues && !isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) {
                            throw new StingException("The value of property '" + property + "' is invalid. " +
                                    "It is not included in the list of allowed values for this property.");
                        }
                        // next check that there isn't already a conflicting value for this property.
                        // Bug fix: conflicts were previously detected with reference equality (!=),
                        // which spuriously flagged equal-but-distinct strings as conflicts.
                        Object newValue = sampleParser.getProperties().get(property);
                        Object existingValue = sample.getProperty(property);
                        if (existingValue != null && !existingValue.equals(newValue)) {
                            throw new StingException(property + " is a conflicting property!");
                        }
                        // checks are passed - now add the property!
                        saveProperty(sample, property, newValue);
                    }
                }

                // step 3: add the relationships
                if (sampleParser.getRelationships() != null) {
                    for (String relationship : sampleParser.getRelationships().keySet()) {
                        String relativeId = sampleParser.getRelationships().get(relationship);
                        if (relativeId == null) {
                            throw new StingException("The relationship cannot be null");
                        }
                        // first check that it's not invalid
                        if (restrictRelationships && !isRelationshipValid(relationship, parser.getAllowedRelationships())) {
                            throw new StingException(relationship + " is an invalid relationship");
                        }
                        // next check that there isn't already a conflicting relationship.
                        // Bug fix: the conflict check previously consulted getProperties()
                        // (the wrong map) and compared the String ids by reference.
                        Sample existingRelative = sample.getRelationship(relationship);
                        if (existingRelative != null) {
                            if (!relativeId.equals(existingRelative.getId())) {
                                throw new StingException(relationship + " is a conflicting relationship!");
                            }
                            // the relationship is already set and consistent - no need to continue
                            continue;
                        }
                        // checks are passed - now save the relationship
                        saveRelationship(sample, relationship, relativeId);
                    }
                }
            } catch (Exception e) {
                throw new StingException("An error occurred while loading this sample from the sample file: " +
                        sampleParser.getId(), e);
            }
        }
    }

    /**
     * @param key property key
     * @param value candidate value for the property
     * @param valuesList map of property key => set of allowed String values
     * @return true if the value is permitted for this property
     */
    private boolean isValueAllowed(String key, Object value, HashMap<String, HashSet> valuesList) {
        // if the property values weren't specified for this property, then any value is okay
        if (!valuesList.containsKey(key)) {
            return true;
        }
        // if this property has enumerated values, it must be a string
        else if (value.getClass() != String.class)
            return false;
        // is the value specified or not?
        else if (!valuesList.get(key).contains(value))
            return false;
        return true;
    }

    /**
     * Makes sure that the aliases are valid.
     * Checks that 1) no string is used as both a main ID and an alias;
     * 2) no alias is used more than once
     * @param parser parsed sample file to validate
     */
    private void validateAliases(SampleFileParser parser) {
        // a file with no aliases is trivially valid
        if (parser.getSampleAliases() == null)
            return;

        HashSet<String> mainIds = new HashSet<String>();
        HashSet<String> otherIds = new HashSet<String>();

        for (SampleAlias sampleAlias : parser.getSampleAliases()) {
            mainIds.add(sampleAlias.getMainId());
            for (String otherId : sampleAlias.getOtherIds()) {
                if (mainIds.contains(otherId))
                    throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " +
                            "be both a main ID and an other ID", otherId));
                if (!otherIds.add(otherId))
                    throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " +
                            "alias more than once.", otherId));
            }
        }
    }

    /**
     * @param property property key to check
     * @param allowedProperties file-declared whitelist of property keys
     * @return true if the property is special (always allowed) or whitelisted
     */
    private boolean isPropertyValid(String property, String[] allowedProperties) {
        // is it a special property that is always allowed?
        for (String allowedProperty : specialProperties) {
            if (property.equals(allowedProperty))
                return true;
        }
        // is it in the allowed properties list?
        for (String allowedProperty : allowedProperties) {
            if (property.equals(allowedProperty))
                return true;
        }
        return false;
    }

    /**
     * @param relationship relationship key to check
     * @param allowedRelationships file-declared whitelist of relationship keys
     * @return true if the relationship is special (always allowed) or whitelisted
     */
    private boolean isRelationshipValid(String relationship, String[] allowedRelationships) {
        // is it a special relationship that is always allowed?
        for (String allowedRelationship : specialRelationships) {
            if (relationship.equals(allowedRelationship))
                return true;
        }
        // is it in the allowed relationships list?
        for (String allowedRelationship : allowedRelationships) {
            if (relationship.equals(allowedRelationship))
                return true;
        }
        return false;
    }

    /**
     * Saves a property, converting it to the correct type where necessary.
     * @param sample sample to store the property on
     * @param key property key
     * @param value property value, as read from the YAML parser
     */
    private void saveProperty(Sample sample, String key, Object value) {
        // convert gender to the right type, if it was stored as a String
        if (key.equals("gender")) {
            if (((String) value).toLowerCase().equals("male")) {
                value = Sample.Gender.MALE;
            }
            else if (((String) value).toLowerCase().equals("female")) {
                value = Sample.Gender.FEMALE;
            }
            else if (((String) value).toLowerCase().equals("unknown")) {
                value = Sample.Gender.UNKNOWN;
            }
            else if (value != null) {
                throw new StingException("'gender' property must be male, female, or unknown.");
            }
        }
        try {
            sample.setProperty(key, value);
        }
        catch (Exception e) {
            throw new StingException("Could not save property " + key, e);
        }
    }

    /**
     * Saves a relationship, creating the relative's Sample object if necessary.
     * @param sample sample to store the relationship on
     * @param key relationship key
     * @param relativeId sample ID string of the relative
     */
    private void saveRelationship(Sample sample, String key, String relativeId) {
        // get the reference that we'll store as the value
        Sample relative = getSampleById(relativeId);

        // create a sample object for the relative, if necessary
        if (relative == null) {
            relative = new Sample(relativeId);
            addSample(relative);
        }

        sample.setRelationship(key, relative);
    }

    /**
     * Filter a sample name in case it is an alias
     * @param sampleId to be filtered
     * @return ID of sample that stores data for this alias
     */
    private String aliasFilter(String sampleId) {
        if (!sampleAliases.containsKey(sampleId))
            return sampleId;
        else
            return sampleAliases.get(sampleId);
    }

    /**
     * Add a sample to the collection
     * @param sample to be added
     */
    private void addSample(Sample sample) {
        samples.put(sample.getId(), sample);
    }

    /**
     * Check if sample with this ID exists.
     * Note that this will return true if the name passed in is an alias.
     * @param id ID of sample to be checked
     * @return true if sample exists; false if not
     */
    public boolean hasSample(String id) {
        return samples.get(aliasFilter(id)) != null;
    }

    /**
     * Get a sample by its ID.
     * If an alias is passed in, return the main sample object.
     * @param id sample id or alias
     * @return sample object with this ID, or null if not found
     */
    public Sample getSampleById(String id) {
        return samples.get(aliasFilter(id));
    }

    /**
     * Get the sample for a given read group.
     * Must first look up the ID for the read group.
     * @param readGroup of sample
     * @return sample object with ID from the read group
     */
    public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
        String nameFromReadGroup = readGroup.getSample();
        return getSampleById(nameFromReadGroup);
    }

    /**
     * Get a sample for a given read.
     * Must first look up the read group, and then the sample ID for that read group.
     * @param read of sample
     * @return sample object of this read
     */
    public Sample getSampleByRead(SAMRecord read) {
        return getSampleByReadGroup(read.getReadGroup());
    }

    /**
     * Get number of sample objects
     * @return size of samples map
     */
    public int sampleCount() {
        return samples.size();
    }

    /**
     * Return all samples with a given family ID.
     * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this.
     * @param familyId family to look up
     * @return all samples whose "familyId" property equals the given id
     */
    public Set<Sample> getFamily(String familyId) {
        HashSet<Sample> familyMembers = new HashSet<Sample>();
        for (Sample sample : samples.values()) {
            if (sample.getFamilyId() != null) {
                if (sample.getFamilyId().equals(familyId))
                    familyMembers.add(sample);
            }
        }
        return familyMembers;
    }

    /**
     * Returns all children of a given sample.
     * See the note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient.
     * @param sample potential parent
     * @return all samples in the same family whose mother or father is this sample
     */
    public Set<Sample> getChildren(Sample sample) {
        HashSet<Sample> children = new HashSet<Sample>();
        for (Sample familyMember : getFamily(sample.getFamilyId())) {
            if (familyMember.getMother() == sample || familyMember.getFather() == sample) {
                children.add(familyMember);
            }
        }
        return children;
    }

    /** @return a new set containing every known sample (including the placeholder). */
    public Set<Sample> getSamples() {
        HashSet<Sample> set = new HashSet<Sample>();
        set.addAll(samples.values());
        return set;
    }

    /**
     * Takes a collection of sample names and returns their corresponding sample objects.
     * Note that, since a set is returned, if you pass in a list with duplicate names there will not be any duplicates in the returned set.
     * @param sampleNameList Set of sample names
     * @return Corresponding set of samples
     */
    public Set<Sample> getSamples(Collection<String> sampleNameList) {
        HashSet<Sample> samples = new HashSet<Sample>();
        for (String name : sampleNameList) {
            try {
                samples.add(getSampleById(name));
            }
            catch (Exception e) {
                throw new StingException("Could not get sample with the following ID: " + name, e);
            }
        }
        return samples;
    }

    /**
     * Returns a set of samples that have any value (which could be null) for a given property
     * @param key Property key
     * @return Set of samples with the property
     */
    public Set<Sample> getSamplesWithProperty(String key) {
        HashSet<Sample> toReturn = new HashSet<Sample>();
        for (Sample s : samples.values()) {
            if (s.hasProperty(key))
                toReturn.add(s);
        }
        return toReturn;
    }

    /**
     * Returns a set of samples that have a property with a certain value.
     * Value must be a string for now - could add a similar method for matching any objects in the future.
     *
     * @param key Property key
     * @param value String property value
     * @return Set of samples that match key and value
     */
    public Set<Sample> getSamplesWithProperty(String key, String value) {
        // Bug fix: this previously removed elements from a HashSet while
        // iterating over it with for-each, which throws
        // ConcurrentModificationException; build the result set instead.
        // Comparing value-first is also null-safe for properties stored as null.
        Set<Sample> toReturn = new HashSet<Sample>();
        for (Sample s : getSamplesWithProperty(key)) {
            if (value.equals(s.getProperty(key)))
                toReturn.add(s);
        }
        return toReturn;
    }

    /**
     * Get the sample with this id, creating and registering it first if it does not exist.
     * @param id sample id
     * @return existing or newly-created sample
     */
    public Sample getOrCreateSample(String id) {
        Sample sample = getSampleById(id);
        if (sample == null) {
            sample = new Sample(id);
            addSample(sample);
        }
        return sample;
    }

    /**
     * Returns all samples that were referenced in the SAM file
     */
    public Set<Sample> getSAMFileSamples() {
        Set<Sample> toReturn = new HashSet<Sample>();
        for (Sample sample : samples.values()) {
            if (sample.hasSAMFileEntry())
                toReturn.add(sample);
        }
        return toReturn;
    }

    /**
     * Returns a set of sample objects for the sample names in a variant context
     *
     * @param context Any variant context
     * @return a set of the sample objects
     */
    public Set<Sample> getSamplesByVariantContext(VariantContext context) {
        Set<Sample> samples = new HashSet<Sample>();
        for (String sampleName : context.getSampleNames()) {
            samples.add(getOrCreateSample(sampleName));
        }
        return samples;
    }

    /**
     * Return a subcontext restricted to samples with a given property key/value.
     * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering.
     * @param context VariantContext to filter
     * @param key property key
     * @param value property value (must be string)
     * @return subcontext
     */
    public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
        Set<String> samplesWithProperty = new HashSet<String>();
        for (String sampleName : context.getSampleNames()) {
            Sample s = samples.get(sampleName);
            if (s != null && s.hasProperty(key) && s.getProperty(key).equals(value))
                samplesWithProperty.add(sampleName);
        }
        Map<String, Genotype> genotypes = context.getGenotypes(samplesWithProperty);
        return context.subContextFromGenotypes(genotypes.values());
    }

    /** @return a data source backed by an empty SAM header and no sample files. */
    public static SampleDataSource createEmptyDataSource() {
        SAMFileHeader header = new SAMFileHeader();
        return new SampleDataSource(header, null);
    }
}

View File

@ -1,65 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.sample;
/**
* Created by IntelliJ IDEA.
* User: brett
* Date: Aug 12, 2010
* Time: 1:30:44 PM
*/
/**
 * Bean representation of an entire YAML sample metadata file: the declared
 * aliases, the validation whitelists, the property value definitions, and the
 * per-sample entries. Populated reflectively by the YAML loader, so the
 * mutable bean shape (no-arg constructor plus getters/setters) is required.
 */
public class SampleFileParser {

    /** Alias declarations mapping alternative ids onto primary ids. */
    private SampleAlias[] sampleAliases;

    /** Whitelist of property keys permitted in this file (null = unrestricted). */
    private String[] allowedProperties;

    /** Whitelist of relationship keys permitted in this file (null = unrestricted). */
    private String[] allowedRelationships;

    /** Per-property enumerations of permitted values (null = unrestricted). */
    private PropertyDefinition[] propertyDefinitions;

    /** The sample entries themselves. */
    private SampleParser[] samples;

    public SampleFileParser() {
    }

    /** @return the alias declarations, or null if the file declared none. */
    public SampleAlias[] getSampleAliases() {
        return this.sampleAliases;
    }

    /** @param sampleAliases alias declarations for this file. */
    public void setSampleAliases(SampleAlias[] sampleAliases) {
        this.sampleAliases = sampleAliases;
    }

    /** @return the whitelist of allowed property keys, or null. */
    public String[] getAllowedProperties() {
        return this.allowedProperties;
    }

    /** @param allowedProperties whitelist of allowed property keys. */
    public void setAllowedProperties(String[] allowedProperties) {
        this.allowedProperties = allowedProperties;
    }

    /** @return the whitelist of allowed relationship keys, or null. */
    public String[] getAllowedRelationships() {
        return this.allowedRelationships;
    }

    /** @param allowedRelationships whitelist of allowed relationship keys. */
    public void setAllowedRelationships(String[] allowedRelationships) {
        this.allowedRelationships = allowedRelationships;
    }

    /** @return the per-property value definitions, or null. */
    public PropertyDefinition[] getPropertyDefinitions() {
        return this.propertyDefinitions;
    }

    /** @param propertyDefinitions per-property value definitions. */
    public void setPropertyDefinitions(PropertyDefinition[] propertyDefinitions) {
        this.propertyDefinitions = propertyDefinitions;
    }

    /** @return the sample entries declared in this file. */
    public SampleParser[] getSamples() {
        return this.samples;
    }

    /** @param samples the sample entries declared in this file. */
    public void setSamples(SampleParser[] samples) {
        this.samples = samples;
    }
}

View File

@ -1,43 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.sample;
import java.util.HashMap;
/**
* Created by IntelliJ IDEA.
* User: brett
* Date: Aug 13, 2010
* Time: 2:09:43 PM
*/
/**
 * Bean representation of a single sample entry as it appears in a YAML sample
 * metadata file; later converted into a Sample object. Populated reflectively
 * by the YAML loader, so the mutable bean shape is required.
 */
public class SampleParser {

    /** The sample's identifier. */
    private String id;

    /** Raw property key -> value pairs for the sample. */
    private HashMap<String, Object> properties;

    /** Relationship name -> related sample id (e.g. "mother" -> "NA12892"). */
    private HashMap<String, String> relationships;

    /** @return the sample's identifier. */
    public String getId() {
        return this.id;
    }

    /** @param id the sample's identifier. */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the raw property map for this sample. */
    public HashMap<String, Object> getProperties() {
        return this.properties;
    }

    /** @param properties raw property map for this sample. */
    public void setProperties(HashMap<String, Object> properties) {
        this.properties = properties;
    }

    /** @return the relationship-name to related-sample-id map. */
    public HashMap<String, String> getRelationships() {
        return this.relationships;
    }

    /** @param relationships relationship-name to related-sample-id map. */
    public void setRelationships(HashMap<String, String> relationships) {
        this.relationships = relationships;
    }
}

View File

@ -85,12 +85,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
*/
protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse ) {
super(engine, walker, reads, reference, rods);
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
if (engine.getArguments().processingTrackerFile != null) {
throw new UserException.BadArgumentValue("-C", "Distributed GATK calculations currently not supported in multi-threaded mode. Complain to Mark depristo@broadinstitute.org to implement and test this code path");
}
}
public Object execute( Walker walker, ShardStrategy shardStrategy ) {

View File

@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.SampleUtils;
import java.util.Collection;
@ -56,7 +57,8 @@ public class LinearMicroScheduler extends MicroScheduler {
traversalEngine.startTimersIfNecessary();
if(shard.getShardType() == Shard.ShardType.LOCUS) {
LocusWalker lWalker = (LocusWalker)walker;
WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata());
WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {
ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods);
Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit());

View File

@ -62,16 +62,17 @@ public class ShardTraverser implements Callable {
Object accumulator = walker.reduceInit();
LocusWalker lWalker = (LocusWalker)walker;
WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),microScheduler.getReadIterator(shard),shard.getGenomeLocs(), microScheduler.engine.getSampleMetadata()); // todo: microScheduler.engine is protected - is it okay to user it here?
ShardDataProvider dataProvider = null;
WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),
microScheduler.getReadIterator(shard),
shard.getGenomeLocs(),
microScheduler.engine.getSampleDB().getSampleNames()); // todo: microScheduler.engine is protected - is it okay to user it here?
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {
dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),microScheduler.getEngine().getGenomeLocParser(),iterator.getLocus(),iterator,microScheduler.reference,microScheduler.rods);
final ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),microScheduler.getEngine().getGenomeLocParser(),iterator.getLocus(),iterator,microScheduler.reference,microScheduler.rods);
accumulator = traversalEngine.traverse( walker, dataProvider, accumulator );
dataProvider.close();
}
if (dataProvider != null) dataProvider.close();
windowMaker.close();
outputMergeTask = outputTracker.closeStorage();

View File

@ -4,7 +4,6 @@ import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
@ -12,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
@ -63,17 +63,20 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
* the given intervals.
* @param iterator The data source for this window.
* @param intervals The set of intervals over which to traverse.
* @param sampleData SampleDataSource that we can reference reads with
* @param sampleNames The complete set of sample names in the reads in shard
*/
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals, SampleDataSource sampleData ) {
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals, Collection<String> sampleNames) {
this.sourceInfo = shard.getReadProperties();
this.readIterator = iterator;
this.sourceIterator = new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData));
this.sourceIterator = new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
this.intervalIterator = intervals.size()>0 ? new PeekableIterator<GenomeLoc>(intervals.iterator()) : null;
}
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals ) {
this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups());
}
public Iterator<WindowMakerIterator> iterator() {
return this;
}

View File

@ -27,7 +27,9 @@ package org.broadinstitute.sting.gatk.filters;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Filter out malformed reads.
@ -37,14 +39,25 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
*/
public class MalformedReadFilter extends ReadFilter {
private SAMFileHeader header;
@Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false)
boolean filterMismatchingBaseAndQuals = false;
@Override
public void initialize(GenomeAnalysisEngine engine) {
this.header = engine.getSAMFileHeader();
}
public boolean filterOut(SAMRecord read) {
return !checkInvalidAlignmentStart(read) ||
// slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided
if (!checkMismatchingBasesAndQuals(read)) {
if (!filterMismatchingBaseAndQuals)
throw new UserException.MalformedBAM(read, "BAM file has a read with mismatching number of bases and base qualities. Offender: " + read.getReadName() +" [" + read.getReadLength() + " bases] [" +read.getBaseQualities().length +"] quals");
else
return true;
}
return !checkInvalidAlignmentStart(read) ||
!checkInvalidAlignmentEnd(read) ||
!checkAlignmentDisagreesWithHeader(this.header,read) ||
!checkCigarDisagreesWithAlignment(read);
@ -108,4 +121,13 @@ public class MalformedReadFilter extends ReadFilter {
return false;
return true;
}
/**
 * Tests whether a read's base sequence and its base-quality array have the same length.
 * @param read the read to validate
 * @return true if the base count equals the quality count; false otherwise
 */
private static boolean checkMismatchingBasesAndQuals(SAMRecord read) {
    final int baseCount = read.getReadLength();
    final int qualCount = read.getBaseQualities().length;
    return baseCount == qualCount;
}
}

View File

@ -0,0 +1,23 @@
package org.broadinstitute.sting.gatk.filters;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Argument;
/**
 * Read filter that keeps only reads whose name exactly equals the required
 * --readName argument; every other read is filtered out.
 *
 * author: chartl, 9/19/11
 */
public class ReadNameFilter extends ReadFilter {
// the single read name to keep; injected from the command line (required)
@Argument(fullName = "readName", shortName = "rn", doc="Filter out all reads except those with this read name", required=true)
private String readName;

/**
 * @param rec the read to test
 * @return true (i.e. filter the read out) unless its name equals the configured readName
 */
public boolean filterOut(final SAMRecord rec) {
return ! rec.getReadName().equals(readName);
}
}

View File

@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
else if ( stub.getOutputStream() != null ) {
this.file = null;
this.stream = stub.getOutputStream();
writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes());
writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes());
}
else
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
}
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.io.stubs;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
return isCompressed;
}
/**
* Gets the master sequence dictionary from the engine associated with this stub.
* {@link GenomeAnalysisEngine#getMasterSequenceDictionary}
* @return the master sequence dictionary reported by the associated engine
*/
public SAMSequenceDictionary getMasterSequenceDictionary() {
return engine.getMasterSequenceDictionary();
}
/**
* Should we tell the VCF writer not to write genotypes?
* @return true if the writer should not write genotypes.

View File

@ -35,26 +35,23 @@ import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.ReservoirDownsampler;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileupImpl;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
/** Iterator that traverses a SAM File, accumulating information on a per-locus basis */
public class LocusIteratorByState extends LocusIterator {
// private static long discarded_bases = 0L;
// private static long observed_bases = 0L;
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
@ -69,7 +66,7 @@ public class LocusIteratorByState extends LocusIterator {
* Used to create new GenomeLocs.
*/
private final GenomeLocParser genomeLocParser;
private final ArrayList<Sample> samples;
private final ArrayList<String> samples;
private final ReadStateManager readStates;
static private class SAMRecordState {
@ -278,15 +275,27 @@ public class LocusIteratorByState extends LocusIterator {
//
// -----------------------------------------------------------------------------------------------------------------
public LocusIteratorByState(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, SampleDataSource sampleData ) {
public LocusIteratorByState(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples ) {
this.readInfo = readInformation;
this.genomeLocParser = genomeLocParser;
this.samples = new ArrayList<String>(samples);
this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod());
// get the list of samples
this.samples = new ArrayList<Sample>(sampleData.getSamples());
readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod());
// currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
// there's no read data. So we need to throw this error only when samIterator.hasNext() is true
if ( this.samples.isEmpty() && samIterator.hasNext() ) {
throw new IllegalArgumentException("samples list must not be empty");
}
}
/**
 * For testing only: builds the placeholder sample list used when the incoming
 * SAMRecords carry no read groups — a list holding a single null "sample".
 */
public final static Collection<String> sampleListForSAMWithoutReadGroups() {
    final ArrayList<String> placeholder = new ArrayList<String>(1);
    placeholder.add(null);
    return placeholder;
}
public Iterator<AlignmentContext> iterator() {
@ -303,19 +312,6 @@ public class LocusIteratorByState extends LocusIterator {
//if ( DEBUG ) System.out.printf("hasNext() = %b%n", r);
}
public void printState() {
for(Sample sample: samples) {
Iterator<SAMRecordState> iterator = readStates.iterator(sample);
while(iterator.hasNext()) {
SAMRecordState state = iterator.next();
logger.debug(String.format("printState():"));
SAMRecord read = state.getRead();
int offset = state.getReadOffset();
logger.debug(String.format(" read: %s(%d)=%s, cigar=%s", read.getReadName(), offset, (char)read.getReadBases()[offset], read.getCigarString()));
}
}
}
private GenomeLoc getLocation() {
return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
}
@ -355,14 +351,14 @@ public class LocusIteratorByState extends LocusIterator {
// In this case, the subsequent call to next() will emit the normal pileup at the current base
// and shift the position.
if (readInfo.generateExtendedEvents() && hasExtendedEvents) {
Map<Sample,ReadBackedExtendedEventPileupImpl> fullExtendedEventPileup = new HashMap<Sample,ReadBackedExtendedEventPileupImpl>();
Map<String,ReadBackedExtendedEventPileupImpl> fullExtendedEventPileup = new HashMap<String,ReadBackedExtendedEventPileupImpl>();
// get current location on the reference and decrement it by 1: the indels we just stepped over
// are associated with the *previous* reference base
GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1);
boolean hasBeenSampled = false;
for(Sample sample: samples) {
for(final String sample: samples) {
Iterator<SAMRecordState> iterator = readStates.iterator(sample);
List<ExtendedEventPileupElement> indelPile = new ArrayList<ExtendedEventPileupElement>(readStates.size(sample));
hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample);
@ -382,10 +378,7 @@ public class LocusIteratorByState extends LocusIterator {
maxDeletionLength = Math.max(maxDeletionLength,state.getEventLength());
}
else nInsertions++;
indelPile.add ( new ExtendedEventPileupElement(state.getRead(),
state.getReadEventStartOffset(),
state.getEventLength(),
state.getEventBases()) );
indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadEventStartOffset(), state.getEventLength(), state.getEventBases()) );
} else {
// HACK: The readahead mechanism for LocusIteratorByState will effectively read past the current position
@ -407,9 +400,7 @@ public class LocusIteratorByState extends LocusIterator {
// we count such reads (with a longer deletion spanning over a deletion at the previous base we are
// about to report) only if includeReadsWithDeletionAtLoci is true.
size++;
indelPile.add ( new ExtendedEventPileupElement(state.getRead(),
state.getReadOffset()-1,
-1) // length=-1 --> noevent
indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()-1, -1) // length=-1 --> noevent
);
}
}
@ -426,10 +417,10 @@ public class LocusIteratorByState extends LocusIterator {
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
} else {
GenomeLoc location = getLocation();
Map<Sample,ReadBackedPileupImpl> fullPileup = new HashMap<Sample,ReadBackedPileupImpl>();
Map<String,ReadBackedPileupImpl> fullPileup = new HashMap<String,ReadBackedPileupImpl>();
boolean hasBeenSampled = false;
for(Sample sample: samples) {
for(final String sample: samples) {
Iterator<SAMRecordState> iterator = readStates.iterator(sample);
List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
@ -447,12 +438,12 @@ public class LocusIteratorByState extends LocusIterator {
continue;
} else {
//observed_bases++;
pile.add(new PileupElement(state.getRead(), state.getReadOffset()));
pile.add(new PileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()));
size++;
}
} else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) {
size++;
pile.add(new PileupElement(state.getRead(), -1));
pile.add(new PileupElement((GATKSAMRecord) state.getRead(), -1));
nDeletions++;
}
@ -495,7 +486,7 @@ public class LocusIteratorByState extends LocusIterator {
}
private void updateReadStates() {
for(Sample sample: samples) {
for(final String sample: samples) {
Iterator<SAMRecordState> it = readStates.iterator(sample);
while ( it.hasNext() ) {
SAMRecordState state = it.next();
@ -522,7 +513,7 @@ public class LocusIteratorByState extends LocusIterator {
private final PeekableIterator<SAMRecord> iterator;
private final DownsamplingMethod downsamplingMethod;
private final SamplePartitioner samplePartitioner;
private final Map<Sample,PerSampleReadStateManager> readStatesBySample = new HashMap<Sample,PerSampleReadStateManager>();
private final Map<String,PerSampleReadStateManager> readStatesBySample = new HashMap<String,PerSampleReadStateManager>();
private final int targetCoverage;
private int totalReadStates = 0;
@ -540,9 +531,9 @@ public class LocusIteratorByState extends LocusIterator {
}
Map<String,ReadSelector> readSelectors = new HashMap<String,ReadSelector>();
for(Sample sample: samples) {
for(final String sample: samples) {
readStatesBySample.put(sample,new PerSampleReadStateManager());
readSelectors.put(sample.getId(),downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector());
readSelectors.put(sample,downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector());
}
samplePartitioner = new SamplePartitioner(readSelectors);
@ -554,7 +545,7 @@ public class LocusIteratorByState extends LocusIterator {
* @param sample The sample.
* @return Iterator over the reads associated with that sample.
*/
public Iterator<SAMRecordState> iterator(final Sample sample) {
public Iterator<SAMRecordState> iterator(final String sample) {
return new Iterator<SAMRecordState>() {
private Iterator<SAMRecordState> wrappedIterator = readStatesBySample.get(sample).iterator();
@ -590,7 +581,7 @@ public class LocusIteratorByState extends LocusIterator {
* @param sample The sample.
* @return Total number of reads in the given sample.
*/
public int size(final Sample sample) {
public int size(final String sample) {
return readStatesBySample.get(sample).size();
}
@ -600,12 +591,12 @@ public class LocusIteratorByState extends LocusIterator {
* @param sample Sample, downsampled independently.
* @return Integer stop of the furthest undownsampled region.
*/
public int getDownsamplingExtent(final Sample sample) {
public int getDownsamplingExtent(final String sample) {
return readStatesBySample.get(sample).getDownsamplingExtent();
}
public SAMRecordState getFirst() {
for(Sample sample: samples) {
for(final String sample: samples) {
PerSampleReadStateManager reads = readStatesBySample.get(sample);
if(!reads.isEmpty())
return reads.peek();
@ -639,8 +630,8 @@ public class LocusIteratorByState extends LocusIterator {
}
samplePartitioner.complete();
for(Sample sample: samples) {
ReadSelector aggregator = samplePartitioner.getSelectedReads(sample.getId());
for(final String sample: samples) {
ReadSelector aggregator = samplePartitioner.getSelectedReads(sample);
Collection<SAMRecord> newReads = new ArrayList<SAMRecord>(aggregator.getSelectedReads());
@ -1072,6 +1063,3 @@ class SamplePartitioner implements ReadSelector {
}
}

View File

@ -2,7 +2,6 @@ package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* An iterator which does post-processing of a read, including potentially wrapping
@ -78,7 +77,30 @@ public class ReadFormattingIterator implements StingSAMIterator {
* no next exists.
*/
public SAMRecord next() {
return new GATKSAMRecord(wrappedIterator.next(), useOriginalBaseQualities, defaultBaseQualities);
SAMRecord rec = wrappedIterator.next();
// if we are using default quals, check if we need them, and add if necessary.
// 1. we need if reads are lacking or have incomplete quality scores
// 2. we add if defaultBaseQualities has a positive value
if (defaultBaseQualities >= 0) {
byte reads [] = rec.getReadBases();
byte quals [] = rec.getBaseQualities();
if (quals == null || quals.length < reads.length) {
byte new_quals [] = new byte [reads.length];
for (int i=0; i<reads.length; i++)
new_quals[i] = defaultBaseQualities;
rec.setBaseQualities(new_quals);
}
}
// if we are using original quals, set them now if they are present in the record
if ( useOriginalBaseQualities ) {
byte[] originalQuals = rec.getOriginalBaseQualities();
if ( originalQuals != null )
rec.setBaseQualities(originalQuals);
}
return rec;
}
/**

View File

@ -28,7 +28,6 @@ import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -101,9 +100,6 @@ public class GATKRunReport {
@Element(required = false, name = "exception")
private final ExceptionToXML mException;
@Element(required = false, name = "argument_collection")
private final GATKArgumentCollection mCollection;
@Element(required = true, name = "working_directory")
private String currentPath;
@ -187,7 +183,6 @@ public class GATKRunReport {
cmdLine = engine.createApproximateCommandLineArgumentString(engine, walker);
} catch (Exception ignore) { }
this.mCollection = engine.getArguments();
walkerName = engine.getWalkerName(walker.getClass());
svnVersion = CommandLineGATK.getVersionNumber();
@ -293,15 +288,16 @@ public class GATKRunReport {
* That is, postReport() is guarenteed not to fail for any reason.
*/
private File postReportToLocalDisk(File rootDir) {
String filename = getID() + ".report.xml.gz";
File file = new File(rootDir, filename);
try {
String filename = getID() + ".report.xml.gz";
File file = new File(rootDir, filename);
postReportToFile(file);
logger.debug("Wrote report to " + file);
return file;
} catch ( Exception e ) {
// we catch everything, and no matter what eat the error
exceptDuringRunReport("Couldn't read report file", e);
file.delete();
return null;
}
}
@ -312,6 +308,7 @@ public class GATKRunReport {
File localFile = postReportToLocalDisk(new File("./"));
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
if ( localFile != null ) { // we succeeded in creating the local file
localFile.deleteOnExit();
try {
// stop us from printing the annoying, and meaningless, mime types warning
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
@ -336,14 +333,13 @@ public class GATKRunReport {
//logger.info("Uploading " + localFile + " to AWS bucket");
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
logger.debug("Uploaded to AWS: " + s3Object);
logger.info("Uploaded run statistics report to AWS S3");
} catch ( S3ServiceException e ) {
exceptDuringRunReport("S3 exception occurred", e);
} catch ( NoSuchAlgorithmException e ) {
exceptDuringRunReport("Couldn't calculate MD5", e);
} catch ( IOException e ) {
exceptDuringRunReport("Couldn't read report file", e);
} finally {
localFile.delete();
}
}
}

View File

@ -1,130 +0,0 @@
package org.broadinstitute.sting.gatk.refdata.indexer;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.File;
import java.io.FileOutputStream;
/**
 * a utility class that can create an index, written to a target location. This is useful when you're unable to write to the directory
 * in which an index is located, or if you'd like to pre-index files to save time.
 */
public class RMDIndexer extends CommandLineProgram {
    @Argument(shortName="in", fullName="inputFile", doc="The reference meta data file to index", required = true)
    File inputFileSource = null;

    @Argument(shortName="t", fullName="type", doc="The reference meta data file format (e.g. vcf, bed)", required = true)
    String inputFileType = null;

    @Input(fullName = "referenceSequence", shortName = "R", doc = "The reference to use when indexing; this sequence will be set in the index", required = true)
    public File referenceFile = null;

    @Input(shortName = "i", fullName = "indexFile", doc = "Where to write the index to (as a file), if not supplied we write to <inputFile>.idx", required = false)
    public File indexFile = null;

    @Argument(shortName = "ba", fullName = "balanceApproach", doc="the index balancing approach to take", required=false)
    IndexFactory.IndexBalanceApproach approach = IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME;

    private static Logger logger = Logger.getLogger(RMDIndexer.class);

    // initialized in execute(): the reference sequence and the parser derived from it
    private IndexedFastaSequenceFile ref = null;
    private GenomeLocParser genomeLocParser = null;

    /**
     * Validates the arguments, builds the Tribble index for the input file, and writes it to
     * the requested location (default: inputFile + the standard index extension).
     *
     * @return 0 on success
     * @throws Exception on any failure reading the inputs or writing the index
     */
    @Override
    protected int execute() throws Exception {
        // ---------------------------------------------------------------------------------
        // check the input parameters
        if (referenceFile != null && !referenceFile.canRead())
            throw new IllegalArgumentException("We can't read the reference file: "
                    + referenceFile + ", check that it exists, and that you have permissions to read it");

        // set the index file to the default name if they didn't specify a file
        if (indexFile == null && inputFileSource != null)
            indexFile = new File(inputFileSource.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION);

        // check that we can create the output file; we refuse to overwrite an existing index
        if (indexFile == null || indexFile.exists())
            throw new IllegalArgumentException("We can't write to the index file location: "
                    + indexFile + ", the index exists");

        logger.info(String.format("attempting to index file: %s", inputFileSource));
        logger.info(String.format("using reference: %s", ((referenceFile != null) ? referenceFile.getAbsolutePath() : "(not supplied)")));
        logger.info(String.format("using type: %s", inputFileType));
        logger.info(String.format("writing to location: %s", indexFile.getAbsolutePath()));

        // try to index the file
        // ---------------------------------------------------------------------------------
        // setup the reference and a parser built from its dictionary
        ref = new CachingIndexedFastaSequenceFile(referenceFile);
        genomeLocParser = new GenomeLocParser(ref);

        // get a track builder
        RMDTrackBuilder builder = new RMDTrackBuilder(ref.getSequenceDictionary(),genomeLocParser, ValidationExclusion.TYPE.ALL);

        // find the types available to the track builders and check that the requested type is valid
        FeatureManager.FeatureDescriptor descriptor = builder.getFeatureManager().getByName(inputFileType);
        if (descriptor == null)
            throw new IllegalArgumentException("The type specified " + inputFileType + " is not a valid type. Valid type list: " + builder.getFeatureManager().userFriendlyListOfAvailableFeatures());

        // create the codec; reference-dependent codecs need the parser injected before use
        FeatureCodec codec = builder.getFeatureManager().createCodec(descriptor, "foo", genomeLocParser);
        if (codec instanceof ReferenceDependentFeatureCodec)
            ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser);

        // get some timing info for the summary message below
        long currentTime = System.currentTimeMillis();

        Index index = IndexFactory.createIndex(inputFileSource, codec, approach);

        // add writing of the sequence dictionary, if supplied
        builder.setIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary(), indexFile, false);

        // create the output stream, and write the index; close in a finally block so the
        // file handle is not leaked if index.write() throws
        LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
        try {
            index.write(stream);
        } finally {
            stream.close();
        }

        // report and exit
        logger.info("Successfully wrote the index to location: " + indexFile + " in " + ((System.currentTimeMillis() - currentTime)/1000) + " seconds");
        return 0; // return successfully
    }

    /**
     * the generic call execute main
     * @param argv the arguments from the command line
     */
    public static void main(String[] argv) {
        try {
            RMDIndexer instance = new RMDIndexer();
            start(instance, argv);
            System.exit(CommandLineProgram.result);
        } catch (Exception e) {
            exitSystemWithError(e);
        }
    }
}

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.refdata.tracks;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.index.Index;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
 * Utilities for working with sequence dictionaries embedded in Tribble indices.
 *
 * The dictionary is round-tripped through the index's property list: each contig is
 * stored as a property keyed by "DICT:" + contig name, with the contig length as its value.
 */
public class IndexDictionaryUtils {
    private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class);

    // a constant we use for marking sequence dictionary entries in the Tribble index property list
    public static final String SequenceDictionaryPropertyPredicate = "DICT:";

    /**
     * get the sequence dictionary from the track's index properties, if available
     * @param index the index file to use
     * @return a SAMSequenceDictionary built from the "DICT:" properties; empty (never null)
     *         when the index carries no dictionary entries
     */
    public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
        SAMSequenceDictionary dict = new SAMSequenceDictionary();
        for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
            if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
                // strip the "DICT:" prefix to recover the contig name; the property value is the length
                dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length()),
                                                       Integer.valueOf(entry.getValue())));
        }
        return dict;
    }

    /**
     * create the sequence dictionary with the contig list; a backup approach.
     * Contig lengths are unknown here, so each sequence is recorded with length 0.
     * @param index the index file to use
     * @param dict the sequence dictionary to add contigs to
     * @return the filled-in sequence dictionary
     */
    static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
        LinkedHashSet<String> seqNames = index.getSequenceNames();
        if (seqNames == null) {
            return dict;
        }
        for (String name : seqNames) {
            SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
            dict.addSequence(seq);
        }
        return dict;
    }

    /**
     * Stores the given sequence dictionary into the index's property list, one
     * "DICT:"-prefixed property per contig (value = contig length).
     * @param index the index to annotate
     * @param dict the dictionary whose sequences should be recorded
     */
    public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) {
        for ( SAMSequenceRecord seq : dict.getSequences() ) {
            final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName();
            final String length = String.valueOf(seq.getSequenceLength());
            index.addProperty(contig,length);
        }
    }

    /**
     * Validates a track's embedded sequence dictionary against the reference dictionary.
     * Tracks without a built-in dictionary (null or empty) are logged and skipped.
     * @param trackName name of the track, used in log/error messages
     * @param trackDict the track's dictionary; may be null or empty
     * @param referenceDict the reference dictionary to validate against
     * @param validationExclusionType exclusions that may relax the validation
     */
    public static void validateTrackSequenceDictionary(final String trackName,
                                                       final SAMSequenceDictionary trackDict,
                                                       final SAMSequenceDictionary referenceDict,
                                                       final ValidationExclusion.TYPE validationExclusionType ) {
        // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
        if (trackDict == null || trackDict.size() == 0)
            logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
        else {
            SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
        }
    }
}

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.refdata.tracks;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureSource;
@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
*
* @author aaron
*
* @author aaron
* `
* Class RMDTrackBuilder
*
@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false;
// a constant we use for marking sequence dictionary entries in the Tribble index property list
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
// private sequence dictionary we use to set our tracks with
private SAMSequenceDictionary dict = null;
@ -150,7 +140,7 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass);
if (descriptor == null)
throw new ReviewedStingException("Unable to find type name for codex class " + codecClass.getName());
throw new ReviewedStingException("Unable to find type name for codec class " + codecClass.getName());
return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags()));
}
@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); }
catch (ReviewedStingException e) { }
sequenceDictionary = getSequenceDictionaryFromProperties(index);
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
if (sequenceDictionary.size() == 0 && dict != null) {
File indexFile = Tribble.indexFile(inputFile);
setIndexSequenceDictionary(inputFile,index,dict,indexFile,true);
sequenceDictionary = getSequenceDictionaryFromProperties(index);
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
try { // re-write the index
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
} catch (IOException e) {
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
}
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
}
if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE )
@ -363,88 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
// this can take a while, let them know what we're doing
logger.info("Creating Tribble index in memory for file " + inputFile);
Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
setIndexSequenceDictionary(inputFile, idx, dict, null, false);
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict);
return idx;
}
// ---------------------------------------------------------------------------------------------------------
// static functions to work with the sequence dictionaries of indexes
// ---------------------------------------------------------------------------------------------------------
/**
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
* @param index the index file to use
* @return a SAMSequenceDictionary if available, null if unavailable
*/
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
SAMSequenceDictionary dict = new SAMSequenceDictionary();
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
Integer.valueOf(entry.getValue())));
}
return dict;
}
/**
* create the sequence dictionary with the contig list; a backup approach
* @param index the index file to use
* @param dict the sequence dictionary to add contigs to
* @return the filled-in sequence dictionary
*/
private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
LinkedHashSet<String> seqNames = index.getSequenceNames();
if (seqNames == null) {
return dict;
}
for (String name : seqNames) {
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
dict.addSequence(seq);
}
return dict;
}
/**
* set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible.
* (that each contig in the index is in the sequence dictionary).
* @param inputFile for proper error message formatting.
* @param dict the sequence dictionary
* @param index the index file
* @param indexFile the index file
* @param rewriteIndex should we rewrite the index when we're done?
*
*/
public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) {
if (dict == null) return;
SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict);
public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) {
if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null");
// check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set
for (SAMSequenceRecord seq : currentDict.getSequences()) {
if (dict.getSequence(seq.getSequenceName()) == null)
continue;
index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength()));
}
// re-write the index
if (rewriteIndex) try {
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
} catch (IOException e) {
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
}
final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict);
// actually update the dictionary in the index
IndexDictionaryUtils.setIndexSequenceDictionary(index, dict);
}
public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) {
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
if (trackDict == null || trackDict.size() == 0)
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
else {
Set<String> trackSequences = new TreeSet<String>();
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
trackSequences.add(dictionaryEntry.getSequenceName());
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
}
public void validateTrackSequenceDictionary(final String trackName,
final SAMSequenceDictionary trackDict,
final SAMSequenceDictionary referenceDict ) {
IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType);
}
}

View File

@ -1,57 +0,0 @@
package org.broadinstitute.sting.gatk.refdata.utils;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.*;
/**
*
* @author aaron
*
* Class RMDIntervalGenerator
*
* Creates an interval list, given an RMDTrack
*/
/**
 * Derives genome interval lists from a reference-ordered data (ROD) track.
 */
public class RMDIntervalGenerator {
    public ReferenceOrderedDataSource dataSource;

    /**
     * Wraps a ROD track so its records can be turned into intervals.
     * @param dataSource the track to generate intervals from; must not be null
     */
    public RMDIntervalGenerator(ReferenceOrderedDataSource dataSource) {
        if (dataSource == null) throw new IllegalArgumentException("Data source cannot be null");
        this.dataSource = dataSource;
    }

    /**
     * Walks the entire track and collects the location of every record.
     * @return the genome locations of all records that have one
     */
    public List<GenomeLoc> toGenomeLocList() {
        final List<GenomeLoc> result = new ArrayList<GenomeLoc>();
        for (Iterator<RODRecordList> records = dataSource.seek((GenomeLoc)null); records.hasNext(); ) {
            final GenomeLoc location = records.next().getLocation();
            if (location != null)
                result.add(location);
        }
        return result;
    }

    /**
     * Indexes the given reference-ordered data sources by their track name.
     * @param sources the reference ordered data sources to index
     * @return a map from track name to its data source
     */
    public static Map<String,ReferenceOrderedDataSource> getRMDTrackNames(List<ReferenceOrderedDataSource> sources) {
        final Map<String,ReferenceOrderedDataSource> byName = new HashMap<String,ReferenceOrderedDataSource>();
        for (final ReferenceOrderedDataSource source : sources)
            byName.put(source.getName(), source);
        return byName;
    }
}

View File

@ -24,12 +24,14 @@
package org.broadinstitute.sting.gatk.report;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.*;
/**
* Tracks a linked list of GATKReportColumn in order by name.
*/
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> {
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> implements Iterable<GATKReportColumn> {
private List<String> columnNames = new ArrayList<String>();
/**
@ -52,4 +54,14 @@ public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> {
columnNames.add(key);
return super.put(key, value);
}
/**
 * Iterates over the columns in insertion order (the order recorded in columnNames).
 * The returned iterator is read-only: remove() is not supported.
 */
@Override
public Iterator<GATKReportColumn> iterator() {
    return new Iterator<GATKReportColumn>() {
        private int offset = 0;

        @Override
        public boolean hasNext() { return offset < columnNames.size(); }

        @Override
        public GATKReportColumn next() {
            // fix: honor the Iterator contract — fail explicitly when exhausted
            if (!hasNext()) throw new NoSuchElementException();
            return getByIndex(offset++);
        }

        @Override
        public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKReportColumn iterator"); }
    };
}
}

View File

@ -286,6 +286,10 @@ public class GATKReportTable {
}
}
/**
 * Returns true if the table has a row for the given primary key.
 * @param primaryKey the primary key to look up in the primary key column
 */
public boolean containsKey(Object primaryKey) {
return primaryKeyColumn.contains(primaryKey);
}
/**
* Set the value for a given position in the table
*

View File

@ -0,0 +1,46 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.samples;
/**
* Categorical sample trait for association and analysis
*
* Samples can have unknown status, be affected or unaffected by the
* categorical trait, or they can be marked as actually having an
* other trait value (stored in an associated value in the Sample class)
*
* @author Mark DePristo
* @since Sept. 2011
*/
public enum Affection {
/** Affection status has not been determined. */
UNKNOWN,
/** Sample is affected by the categorical trait (suffers from the disease). */
AFFECTED,
/** Sample is not affected by the categorical trait. */
UNAFFECTED,
/** Sample has an "other" trait: the actual value is an arbitrary string stored as an associated value in the Sample class. */
OTHER
}

View File

@ -0,0 +1,34 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.samples;
/**
* ENUM of possible human genders: male, female, or unknown
*/
public enum Gender {
/** Male individual. */
MALE,
/** Female individual. */
FEMALE,
/** Gender was not specified or could not be determined. */
UNKNOWN
}

View File

@ -0,0 +1,310 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.samples;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.*;
import java.util.*;
/**
* Reads PED file-formatted tabular text files
*
* See http://www.broadinstitute.org/mpg/tagger/faq.html
* See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
*
* The "ped" file format refers to the widely-used format for linkage pedigree data.
* Each line describes a single (diploid) individual in the following format:
*
* family_ID individual_ID father_ID mother_ID gender phenotype genotype_1 genotype_2 ...
*
* If your data lacks pedigree information (for example, unrelated case/control individuals),
* set the father_ID and mother_ID to 0. sex denotes the individual's gender with 1=male and 2=female.
* phenotype refers to the affected status (for association studies) where 0=unknown, 1=unaffected, 2=affected.
* Finally, each genotype is written as two (=diploid) integer numbers (separated by whitespace),
* where 1=A, 2=C, 3=G, 4=T. No header lines are allowed and all columns must be separated by whitespace.
* Check out the information at the PLINK website on the "ped" file format.
*
* The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:
* Family ID
* Individual ID
* Paternal ID
* Maternal ID
* Sex (1=male; 2=female; other=unknown)
* Phenotype
*
* The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person.
* A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a
* quantitative trait or an affection status column: PLINK will automatically detect which type
* (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).
* Note that the GATK actually supports arbitrary values for quantitative trait -- not just doubles --
* and are actually representing these values as strings instead of doubles
*
* NOTE Quantitative traits with decimal points must be coded with a period/full-stop character and
* not a comma, i.e. 2.394 not 2,394
*
* If an individual's sex is unknown, then any character other than 1 or 2 can be used.
* When new files are created (PED, FAM, or other which contain sex) then the original coding will be
* preserved. However, these individuals will be dropped from any analyses (i.e. phenotype set to missing also)
* and an error message will arise if an analysis that uses family information is requested and an
* individual of 'unknown' sex is specified as a father or mother.
*
*
* HINT You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that
* line will be ignored. Do not start any family IDs with this character therefore.
*
* Affection status, by default, should be coded:
* -9 missing
* 0 missing
* 1 unaffected
* 2 affected
*
* If your file is coded 0/1 to represent unaffected/affected, then use the --1 flag:
* plink --file mydata --1 which will specify a disease phenotype coded:
*
* -9 missing
* 0 unaffected
* 1 affected
*
* The missing phenotype value for quantitative traits is, by default, -9 (this can also be used for
* disease traits as well as 0). It can be reset by including the --missing-phenotype option:
*
* Genotypes (column 7 onwards) should also be white-space delimited; they can be any character
* (e.g. 1,2,3,4 or A,C,G,T or anything else) except 0 which is, by default, the missing genotype
* character. All markers should be biallelic. All SNPs (whether haploid or not) must have two
* alleles specified. Either Both alleles should be missing (i.e. 0) or neither.
*
* No header row should be given. For example, here are two individuals typed for 3 SNPs (one row = one person):
*
* FAM001 1 0 0 1 2 A A G G A C
* FAM001 2 0 0 1 2 A A A G 0 0
* ...
*
* Note that the GATK does not support genotypes in a PED file.
*
* @author Mark DePristo
* @since 2011
*/
public class PedReader {
    private static Logger logger = Logger.getLogger(PedReader.class);

    /** Phenotype values recognized as categorical affection codes; any other value marks the column as quantitative ("other"). */
    final static private Set<String> CATEGORICAL_TRAIT_VALUES = new HashSet<String>(Arrays.asList("-9", "0", "1", "2"));
    /** Lines starting with this marker are comments and are ignored. */
    final static private String commentMarker = "#";

    /**
     * An enum that specifies which, if any, of the standard PED fields are
     * missing from the input records. For example, suppose we have the full record:
     *
     *     "fam1 kid dad mom 1 2"
     *
     * indicating a male affected child. This can be parsed with the -ped x.ped argument
     * to the GATK. Suppose we only have:
     *
     *     "fam1 kid 1"
     *
     * we can parse the reduced version of this record with -ped:NO_PARENTS,NO_PHENOTYPE x.ped
     */
    public enum MissingPedField {
        /**
         * The PED records do not have the first (FAMILY_ID) argument. The family id
         * will be set to null / empty.
         */
        NO_FAMILY_ID,

        /**
         * The PED records do not have either the paternal or maternal IDs, so
         * the corresponding IDs are set to null.
         */
        NO_PARENTS,

        /**
         * The PED records do not have the GENDER field, so the sex of each
         * sample will be set to UNKNOWN.
         */
        NO_SEX,

        /**
         * The PED records do not have the PHENOTYPE field, so the phenotype
         * of each sample will be set to UNKNOWN.
         */
        NO_PHENOTYPE
    }

    /** The six standard PED columns, in their canonical order. */
    protected enum Field {
        FAMILY_ID, INDIVIDUAL_ID, PATERNAL_ID, MATERNAL_ID, GENDER, PHENOTYPE
    }

    // phenotype codes
    private final static String MISSING_VALUE1 = "-9";
    private final static String MISSING_VALUE2 = "0";
    private final static String PHENOTYPE_UNAFFECTED = "1";
    private final static String PHENOTYPE_AFFECTED = "2";

    // sex codes (any other value means unknown)
    private final static String SEX_MALE = "1";
    private final static String SEX_FEMALE = "2";

    public PedReader() { }

    /**
     * Parses samples from a PED file on disk.
     *
     * NOTE(review): the FileReader is never explicitly closed here — presumably
     * XReadLines consumes and releases it; confirm it closes the underlying stream.
     *
     * @param source        the PED file to read
     * @param missingFields which standard PED columns are absent from the records
     * @param sampleDB      the database each parsed sample is registered in
     * @return the samples parsed from the file
     * @throws FileNotFoundException if the file cannot be opened
     */
    public final List<Sample> parse(File source, EnumSet<MissingPedField> missingFields, SampleDB sampleDB) throws FileNotFoundException {
        logger.info("Reading PED file " + source + " with missing fields: " + missingFields);
        return parse(new FileReader(source), missingFields, sampleDB);
    }

    /**
     * Parses samples from a PED-formatted string; ';' characters are treated as record separators.
     *
     * @param source        PED records, ';'-separated
     * @param missingFields which standard PED columns are absent from the records
     * @param sampleDB      the database each parsed sample is registered in
     * @return the samples parsed from the string
     */
    public final List<Sample> parse(final String source, EnumSet<MissingPedField> missingFields, SampleDB sampleDB) {
        logger.warn("Reading PED string: \"" + source + "\" with missing fields: " + missingFields);
        return parse(new StringReader(source.replace(";", String.format("%n"))), missingFields, sampleDB);
    }

    /**
     * Core PED parser. Makes two passes over the records: the first determines whether
     * the phenotype column is quantitative (any non-categorical value anywhere makes it so),
     * the second builds one Sample per record. Parents referenced by ID but not listed as
     * records of their own are added as implicit samples at the end.
     *
     * @param reader        source of PED lines
     * @param missingFields which standard PED columns are absent from the records
     * @param sampleDB      the database each parsed sample is registered in
     * @return the samples parsed (including any implicitly created parents)
     * @throws UserException.MalformedFile if a record has the wrong number of fields
     */
    public final List<Sample> parse(Reader reader, EnumSet<MissingPedField> missingFields, SampleDB sampleDB) {
        final List<String> lines = new XReadLines(reader).readLines();

        // Column offsets implied by the missing-field set; -1 means "column not present".
        final int familyPos = missingFields.contains(MissingPedField.NO_FAMILY_ID) ? -1 : 0;
        final int samplePos = familyPos + 1;
        final int paternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : samplePos + 1;
        final int maternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : paternalPos + 1;
        final int sexPos = missingFields.contains(MissingPedField.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1;
        final int phenotypePos = missingFields.contains(MissingPedField.NO_PHENOTYPE) ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1;
        final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1;

        // Pass 1: split each record and decide whether the phenotype column is quantitative.
        int lineNo = 1;
        boolean isQT = false;
        final List<String[]> splits = new ArrayList<String[]>(lines.size());
        for ( final String line : lines ) {
            if ( line.startsWith(commentMarker)) continue;
            if ( line.trim().equals("") ) continue;

            final String[] parts = line.split("\\s+");

            if ( parts.length != nExpectedFields )
                throw new UserException.MalformedFile(reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields");

            if ( phenotypePos != -1 ) {
                // a single non-categorical value anywhere flips the whole column to quantitative
                isQT = isQT || ! CATEGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]);
            }

            splits.add(parts);
            lineNo++;
        }
        logger.info("Phenotype is other? " + isQT);

        // Pass 2: build one Sample per record.
        lineNo = 1;
        final List<Sample> samples = new ArrayList<Sample>(splits.size());
        for ( final String[] parts : splits ) {
            String familyID = null, individualID, paternalID = null, maternalID = null;
            Gender sex = Gender.UNKNOWN;
            String quantitativePhenotype = Sample.UNSET_QT;
            Affection affection = Affection.UNKNOWN;

            if ( familyPos != -1 ) familyID = maybeMissing(parts[familyPos]);
            individualID = parts[samplePos];
            if ( paternalPos != -1 ) paternalID = maybeMissing(parts[paternalPos]);
            if ( maternalPos != -1 ) maternalID = maybeMissing(parts[maternalPos]);

            if ( sexPos != -1 ) {
                if ( parts[sexPos].equals(SEX_MALE) ) sex = Gender.MALE;
                else if ( parts[sexPos].equals(SEX_FEMALE) ) sex = Gender.FEMALE;
                else sex = Gender.UNKNOWN;
            }

            if ( phenotypePos != -1 ) {
                if ( isQT ) {
                    if ( parts[phenotypePos].equals(MISSING_VALUE1) )
                        affection = Affection.UNKNOWN;
                    else {
                        affection = Affection.OTHER;
                        quantitativePhenotype = parts[phenotypePos];
                    }
                } else {
                    if ( parts[phenotypePos].equals(MISSING_VALUE1) ) affection = Affection.UNKNOWN;
                    else if ( parts[phenotypePos].equals(MISSING_VALUE2) ) affection = Affection.UNKNOWN;
                    else if ( parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED) ) affection = Affection.UNAFFECTED;
                    else if ( parts[phenotypePos].equals(PHENOTYPE_AFFECTED) ) affection = Affection.AFFECTED;
                    else throw new ReviewedStingException("Unexpected phenotype type " + parts[phenotypePos] + " at line " + lineNo);
                }
            }

            final Sample s = new Sample(individualID, sampleDB, familyID, paternalID, maternalID, sex, affection, quantitativePhenotype);
            samples.add(s);
            sampleDB.addSample(s);
            lineNo++;
        }

        // Create implicit samples for parents referenced by pedigree links but not listed themselves.
        for ( final Sample sample : new ArrayList<Sample>(samples) ) {
            Sample dad = maybeAddImplicitSample(sampleDB, sample.getPaternalID(), sample.getFamilyID(), Gender.MALE);
            if ( dad != null ) samples.add(dad);

            Sample mom = maybeAddImplicitSample(sampleDB, sample.getMaternalID(), sample.getFamilyID(), Gender.FEMALE);
            if ( mom != null ) samples.add(mom);
        }

        return samples;
    }

    /** Maps the PED missing-value codes ("-9" and "0") to null; any other value is returned unchanged. */
    private final static String maybeMissing(final String string) {
        if ( string.equals(MISSING_VALUE1) || string.equals(MISSING_VALUE2) )
            return null;
        else
            return string;
    }

    /**
     * Adds a parent sample of the given gender to the DB when it is referenced by ID but not yet present.
     * @return the newly created sample, or null if nothing needed to be added
     */
    private final Sample maybeAddImplicitSample(SampleDB sampleDB, final String id, final String familyID, final Gender gender) {
        if ( id != null && sampleDB.getSample(id) == null ) {
            Sample s = new Sample(id, sampleDB, familyID, null, null, gender, Affection.UNKNOWN, Sample.UNSET_QT);
            sampleDB.addSample(s);
            return s;
        } else
            return null;
    }

    /**
     * Parses a list of tags from the command line, assuming it comes from the GATK Engine
     * tags, and returns the corresponding EnumSet.
     *
     * @param arg  the actual engine arg, used for the UserException if there's an error
     * @param tags a list of string tags that should be converted to MissingPedField values
     * @return the set of missing fields named by the tags
     * @throws UserException.BadArgumentValue if a tag is not a valid MissingPedField name
     */
    public static final EnumSet<MissingPedField> parseMissingFieldTags(final Object arg, final List<String> tags) {
        final EnumSet<MissingPedField> missingFields = EnumSet.noneOf(MissingPedField.class);

        for ( final String tag : tags ) {
            try {
                missingFields.add(MissingPedField.valueOf(tag));
            } catch ( IllegalArgumentException e ) {
                // fix: Arrays.toString renders the enum values; the bare array previously printed as an object reference
                throw new UserException.BadArgumentValue(arg.toString(), "Unknown tag " + tag + " allowed values are " + Arrays.toString(MissingPedField.values()));
            }
        }

        return missingFields;
    }
}

View File

@ -0,0 +1,41 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.samples;
/**
*
*/
public enum PedigreeValidationType {
/**
 * Require that, when a pedigree file is provided, every sample in the VCF or BAM files
 * has a corresponding entry in the pedigree file(s).
 */
STRICT,
/**
 * Do not enforce any overlap between the VCF/BAM samples and the pedigree data.
 */
SILENT
}

View File

@ -0,0 +1,222 @@
package org.broadinstitute.sting.gatk.samples;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.HashMap;
import java.util.Map;
/**
*
*/
public class Sample implements Comparable<Sample> { // implements java.io.Serializable {
final private String familyID, paternalID, maternalID;
final private Gender gender;
final private String otherPhenotype;
final private Affection affection;
final private String ID;
final private SampleDB infoDB;
final private Map<String, Object> properties = new HashMap<String, Object>();
public final static String UNSET_QT = null;
public Sample(final String ID, final SampleDB infoDB,
final String familyID, final String paternalID, final String maternalID,
final Gender gender, final Affection affection, final String otherPhenotype) {
this.familyID = familyID;
this.paternalID = paternalID;
this.maternalID = maternalID;
this.gender = gender;
this.otherPhenotype = otherPhenotype;
this.affection = affection;
this.ID = ID;
this.infoDB = infoDB;
}
protected Sample(final String ID,
final String familyID, final String paternalID, final String maternalID,
final Gender gender, final Affection affection, final String otherPhenotype) {
this(ID, null, familyID, paternalID, maternalID, gender, affection, otherPhenotype);
}
protected Sample(final String ID,
final String familyID, final String paternalID, final String maternalID,
final Gender gender, final Affection affection) {
this(ID, null, familyID, paternalID, maternalID, gender, affection, UNSET_QT);
}
public Sample(final String ID, final SampleDB infoDB,
final String familyID, final String paternalID, final String maternalID, final Gender gender) {
this(ID, infoDB, familyID, paternalID, maternalID, gender, Affection.UNKNOWN, UNSET_QT);
}
public Sample(final String ID, final SampleDB infoDB, final Affection affection, final String otherPhenotype) {
this(ID, infoDB, null, null, null, Gender.UNKNOWN, affection, otherPhenotype);
}
public Sample(String id, SampleDB infoDB) {
this(id, infoDB, null, null, null,
Gender.UNKNOWN, Affection.UNKNOWN, UNSET_QT);
}
// -------------------------------------------------------------------------------------
//
// standard property getters
//
// -------------------------------------------------------------------------------------
public String getID() {
return ID;
}
public String getFamilyID() {
return familyID;
}
public String getPaternalID() {
return paternalID;
}
public String getMaternalID() {
return maternalID;
}
public Affection getAffection() {
return affection;
}
public boolean hasOtherPhenotype() {
return affection == Affection.OTHER;
}
public String getOtherPhenotype() {
return otherPhenotype;
}
/**
* Get the sample's mother
* @return sample object with relationship mother, if exists, or null
*/
public Sample getMother() {
return infoDB.getSample(maternalID);
}
/**
* Get the sample's father
* @return sample object with relationship father, if exists, or null
*/
public Sample getFather() {
return infoDB.getSample(paternalID);
}
/**
* Get gender of the sample
* @return property of key "gender" - must be of type Gender
*/
public Gender getGender() {
return gender;
}
@Override
public int compareTo(final Sample sample) {
return ID.compareTo(sample.getID());
}
@Override
public String toString() {
return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s",
getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(),
getOtherPhenotype(), properties);
}
// // -------------------------------------------------------------------------------------
// //
// // code for working with additional -- none standard -- properites
// //
// // -------------------------------------------------------------------------------------
//
// public Map<String, Object> getExtraProperties() {
// return Collections.unmodifiableMap(properties);
// }
//
// /**
// * Get one property
// * @param key key of property
// * @return value of property as generic object
// */
// public Object getExtraPropertyValue(final String key) {
// return properties.get(key);
// }
//
// /**
// *
// * @param key property key
// * @return true if sample has this property (even if its value is null)
// */
// public boolean hasExtraProperty(String key) {
// return properties.containsKey(key);
// }
// Hash is keyed on the sample ID alone; consistent with equals(), which requires
// ID equality first, so equal samples always share a hash code.
@Override
public int hashCode() {
return ID.hashCode();
}
/**
 * Two samples are equal when the ID and every pedigree/phenotype attribute
 * (family, parents, gender, phenotype, affection, extra properties) match,
 * with null fields comparing equal only to null.
 */
@Override
public boolean equals(final Object o) {
    if (!(o instanceof Sample))
        return false;   // covers null as well: null is never an instance of Sample

    final Sample that = (Sample) o;
    return ID.equals(that.ID)
            && equalOrNull(familyID, that.familyID)
            && equalOrNull(paternalID, that.paternalID)
            && equalOrNull(maternalID, that.maternalID)
            && equalOrNull(gender, that.gender)
            && equalOrNull(otherPhenotype, that.otherPhenotype)
            && equalOrNull(affection, that.affection)
            && equalOrNull(properties, that.properties);
}
/**
 * Null-tolerant equality check: true when both arguments are null, or when
 * both are non-null and o1.equals(o2).
 */
private final static boolean equalOrNull(final Object o1, final Object o2) {
    return o1 == null ? o2 == null : (o2 != null && o1.equals(o2));
}
/**
 * Merge two values of a single pedigree field, treating null and emptyValue
 * as "no information".
 *
 * @param name       sample ID, used only in the error message
 * @param field      field name, used only in the error message
 * @param o1         previously recorded value (may be null)
 * @param o2         newly read value (may be null)
 * @param emptyValue the sentinel meaning "unset" for this field (e.g.
 *                   Gender.UNKNOWN), or null if the field has no sentinel
 * @return the merged value; null/emptyValue only when neither side carries
 *         real information
 * @throws UserException if both sides carry real but different values
 */
private final static <T> T mergeValues(final String name, final String field, final T o1, final T o2, final T emptyValue) {
    if ( o1 == null || o1.equals(emptyValue) ) {
        // o1 carries no information: fall back to o2 (which may itself be
        // null or the empty sentinel)
        return o2;
    } else if ( o2 == null || o2.equals(emptyValue) ) {
        // only o1 carries real information: keep it
        return o1;
    } else if ( o1.equals(o2) ) {
        // both carry the same information.  Content equality (equals) is
        // required here, not reference equality: equal-but-distinct objects,
        // such as identical Strings parsed from two pedigree files, must
        // merge cleanly rather than trigger the inconsistency error below.
        return o1;
    } else {
        throw new UserException("Inconsistent values detected for " + name + " for field " + field + " value1 " + o1 + " value2 " + o2);
    }
}
/**
 * Merge two Sample records describing the same individual, field by field.
 * If the records are already equal the newer one is returned unchanged;
 * otherwise each pedigree field is merged via mergeValues, which throws a
 * UserException on genuinely conflicting values.
 *
 * @param prev the sample already in the DB
 * @param next the newly parsed sample with the same ID
 * @return a merged Sample bound to prev's DB
 */
public final static Sample mergeSamples(final Sample prev, final Sample next) {
    if ( prev.equals(next) )
        return next;

    return new Sample(prev.getID(), prev.infoDB,
            mergeValues(prev.getID(), "Family_ID", prev.getFamilyID(), next.getFamilyID(), null),
            mergeValues(prev.getID(), "Paternal_ID", prev.getPaternalID(), next.getPaternalID(), null),
            // error-label typo fixed: was "Material_ID"
            mergeValues(prev.getID(), "Maternal_ID", prev.getMaternalID(), next.getMaternalID(), null),
            mergeValues(prev.getID(), "Gender", prev.getGender(), next.getGender(), Gender.UNKNOWN),
            mergeValues(prev.getID(), "Affection", prev.getAffection(), next.getAffection(), Affection.UNKNOWN),
            mergeValues(prev.getID(), "OtherPhenotype", prev.getOtherPhenotype(), next.getOtherPhenotype(), UNSET_QT));
    //mergeValues(prev.getID(), "ExtraProperties", prev.getExtraProperties(), next.getExtraProperties(), Collections.emptyMap()));
}
}

View File

@ -0,0 +1,183 @@
package org.broadinstitute.sting.gatk.samples;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import java.util.*;
/**
 * An in-memory database of Sample records keyed by unique sample ID, with
 * convenience lookups by read, read group, genotype, and pedigree (family)
 * structure.
 */
public class SampleDB {
    /**
     * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so
     * this is stored as a HashMap.
     */
    private final HashMap<String, Sample> samples = new HashMap<String, Sample>();

    public SampleDB() {
    }

    /**
     * Add a single sample to the database.  If a sample with the same ID is
     * already present, the two records are merged (Sample.mergeSamples), so
     * repeated additions accumulate pedigree information.
     *
     * @param sample to be added
     * @return this, for chaining
     */
    protected SampleDB addSample(Sample sample) {
        Sample prev = samples.get(sample.getID());
        if ( prev != null )
            sample = Sample.mergeSamples(prev, sample);
        samples.put(sample.getID(), sample);
        return this;
    }

    // --------------------------------------------------------------------------------
    //
    // Functions for getting a sample from the DB
    //
    // --------------------------------------------------------------------------------

    /**
     * Get a sample by its ID.
     *
     * @param id sample ID to look up
     * @return sample with this ID, or null if none exists
     */
    public Sample getSample(String id) {
        return samples.get(id);
    }

    /**
     * Look up the sample associated with a read's read group.
     *
     * @param read the read whose read group names the sample
     * @return sample for the read's read group, or null if the read has no
     *         read group or the sample is not in the DB
     */
    public Sample getSample(final SAMRecord read) {
        // guard against reads without a read group, which previously caused
        // an NPE inside getSample(SAMReadGroupRecord)
        final SAMReadGroupRecord rg = read.getReadGroup();
        return rg == null ? null : getSample(rg);
    }

    /**
     * @param rg read group whose sample name is used as the key
     * @return sample with this ID, or null if none exists
     */
    public Sample getSample(final SAMReadGroupRecord rg) {
        return getSample(rg.getSample());
    }

    /**
     * @param g Genotype whose sample name is used as the key
     * @return sample with this ID, or null if none exists
     */
    public Sample getSample(final Genotype g) {
        return getSample(g.getSampleName());
    }

    // --------------------------------------------------------------------------------
    //
    // Functions for accessing samples in the DB
    //
    // --------------------------------------------------------------------------------

    /**
     * Get number of sample objects.
     * @return size of samples map
     */
    public int sampleCount() {
        return samples.size();
    }

    /**
     * @return a fresh, mutable set containing every sample in the DB
     */
    public Set<Sample> getSamples() {
        return new HashSet<Sample>(samples.values());
    }

    /**
     * @return an unmodifiable view of all sample IDs in the DB
     */
    public Collection<String> getSampleNames() {
        return Collections.unmodifiableCollection(samples.keySet());
    }

    /**
     * Takes a collection of sample names and returns their corresponding sample objects.
     * Note that, since a set is returned, if you pass in a list with duplicate names there
     * will not be any duplicates in the returned set.
     * NOTE(review): names not present in the DB contribute a null element to
     * the returned set (preserving the historical behavior) -- confirm that
     * callers expect this rather than an exception.
     *
     * @param sampleNameList collection of sample names
     * @return corresponding set of samples
     */
    public Set<Sample> getSamples(Collection<String> sampleNameList) {
        final HashSet<Sample> result = new HashSet<Sample>();
        for (final String name : sampleNameList) {
            // plain HashMap lookups cannot throw, so the old defensive
            // try/catch around this call was dead code and has been removed
            result.add(getSample(name));
        }
        return result;
    }

    // --------------------------------------------------------------------------------
    //
    // Higher level pedigree functions
    //
    // --------------------------------------------------------------------------------

    /**
     * Returns a sorted set of the family IDs in all samples (excluding null ids).
     *
     * @return sorted set of family IDs
     */
    public final Set<String> getFamilyIDs() {
        return getFamilies().keySet();
    }

    /**
     * Returns a map from family ID -> set of family members for all samples with
     * non-null family ids.  Both the map and the member sets are sorted
     * (TreeMap / TreeSet, relying on Sample.compareTo ordering by ID).
     *
     * @return sorted map of family ID to family members
     */
    public final Map<String, Set<Sample>> getFamilies() {
        final Map<String, Set<Sample>> families = new TreeMap<String, Set<Sample>>();
        for ( final Sample sample : samples.values() ) {
            final String famID = sample.getFamilyID();
            if ( famID != null ) {
                if ( ! families.containsKey(famID) )
                    families.put(famID, new TreeSet<Sample>());
                families.get(famID).add(sample);
            }
        }
        return families;
    }

    /**
     * Return all samples with a given family ID.
     * Note: recomputes the full family map on every call, so this is O(#samples).
     *
     * @param familyId family to fetch
     * @return members of the family, or null if the family ID is unknown
     */
    public Set<Sample> getFamily(String familyId) {
        return getFamilies().get(familyId);
    }

    /**
     * Returns all children of a given sample.
     * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient.
     *
     * @param sample the putative parent
     * @return family members whose mother or father is this sample
     */
    public Set<Sample> getChildren(Sample sample) {
        final HashSet<Sample> children = new HashSet<Sample>();
        for ( final Sample familyMember : getFamily(sample.getFamilyID())) {
            // identity comparison is safe here: the DB stores one canonical
            // Sample instance per ID, and getMother()/getFather() resolve
            // through this same DB
            if ( familyMember.getMother() == sample || familyMember.getFather() == sample ) {
                children.add(familyMember);
            }
        }
        return children;
    }
}

View File

@ -0,0 +1,153 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.samples;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
 * Builder that assembles a SampleDB from SAM headers, explicit sample name
 * lists, and pedigree (PED) files or inline pedigree strings, then validates
 * the combined result according to the requested strictness.
 */
public class SampleDBBuilder {
    PedigreeValidationType validationStrictness;
    final SampleDB sampleDB = new SampleDB();

    final GenomeAnalysisEngine engine;

    Set<Sample> samplesFromDataSources = new HashSet<Sample>();
    Set<Sample> samplesFromPedigrees = new HashSet<Sample>();

    /** for testing only */
    protected SampleDBBuilder(PedigreeValidationType validationStrictness) {
        engine = null;
        this.validationStrictness = validationStrictness;
    }

    /**
     * Constructor takes both a SAM header and sample files because the two must be integrated.
     */
    public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) {
        this.engine = engine;
        this.validationStrictness = validationStrictness;
    }

    /**
     * Hallucinates sample objects for all the samples in the SAM file and stores them
     */
    public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) {
        addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header));
        return this;
    }

    /**
     * Registers a bare Sample for each name not already present in the DB,
     * remembering which samples came from data sources (for validation).
     */
    public SampleDBBuilder addSamplesFromSampleNames(final Collection<String> sampleNames) {
        for (final String name : sampleNames) {
            if (sampleDB.getSample(name) != null)
                continue; // already known; nothing to hallucinate

            final Sample fresh = new Sample(name, sampleDB);
            sampleDB.addSample(fresh);
            samplesFromDataSources.add(fresh); // keep track of data source samples
        }
        return this;
    }

    /**
     * Parses each pedigree file into the DB, remembering which samples came
     * from pedigrees (for validation).
     */
    public SampleDBBuilder addSamplesFromPedigreeFiles(final List<File> pedigreeFiles) {
        for (final File file : pedigreeFiles) {
            samplesFromPedigrees.addAll(addSamplesFromPedigreeArgument(file));
        }
        return this;
    }

    /**
     * Parses each inline pedigree string into the DB, remembering which
     * samples came from pedigrees (for validation).
     */
    public SampleDBBuilder addSamplesFromPedigreeStrings(final List<String> pedigreeStrings) {
        for (final String ped : pedigreeStrings) {
            samplesFromPedigrees.addAll(addSamplesFromPedigreeArgument(ped));
        }
        return this;
    }

    /**
     * Parse one sample file and integrate it with samples that are already there
     * Fail quickly if we find any errors in the file
     */
    private Collection<Sample> addSamplesFromPedigreeArgument(File sampleFile) {
        try {
            return new PedReader().parse(sampleFile, getMissingFields(sampleFile), sampleDB);
        } catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotReadInputFile(sampleFile, e);
        }
    }

    // string-argument variant of the above; inline pedigrees cannot raise
    // FileNotFoundException so no translation is needed
    private Collection<Sample> addSamplesFromPedigreeArgument(final String string) {
        return new PedReader().parse(string, getMissingFields(string), sampleDB);
    }

    /**
     * Validates and returns the finished database.
     */
    public SampleDB getFinalSampleDB() {
        validate();
        return sampleDB;
    }

    /**
     * Determines which PED columns were declared missing via positional tags
     * on the command-line argument; without an engine, assumes none.
     */
    public EnumSet<PedReader.MissingPedField> getMissingFields(final Object engineArg) {
        if ( engine == null )
            return EnumSet.noneOf(PedReader.MissingPedField.class);

        final List<String> tags = engine.getTags(engineArg).getPositionalTags();
        return PedReader.parseMissingFieldTags(engineArg, tags);
    }

    // --------------------------------------------------------------------------------
    //
    // Validation
    //
    // --------------------------------------------------------------------------------

    protected final void validate() {
        if ( validationStrictness == PedigreeValidationType.SILENT )
            return;

        // cross-checking is only meaningful when both pedigree and
        // data-source samples were provided
        if ( samplesFromPedigrees.isEmpty() || samplesFromDataSources.isEmpty() )
            return;

        final Set<String> pedigreeNames = new HashSet<String>();
        for ( final Sample pedSample : samplesFromPedigrees )
            pedigreeNames.add(pedSample.getID());

        // every data-source sample must be annotated in some pedigree
        for ( final Sample dsSample : samplesFromDataSources ) {
            if ( ! pedigreeNames.contains(dsSample.getID()) )
                throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files with STRICT pedigree validation");
        }
    }
}

View File

@ -364,8 +364,8 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
// count up the number of skipped reads by summing over all filters
long nSkippedReads = 0L;
for ( Map.Entry<Class, Long> countsByFilter: cumulativeMetrics.getCountsByFilter().entrySet())
nSkippedReads += countsByFilter.getValue();
for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values())
nSkippedReads += countsByFilter;
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", elapsed, elapsed / 60, elapsed / 3600));
if ( cumulativeMetrics.getNumReadsSeen() > 0 )
@ -373,10 +373,10 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
nSkippedReads,
cumulativeMetrics.getNumReadsSeen(),
100.0 * MathUtils.ratio(nSkippedReads,cumulativeMetrics.getNumReadsSeen())));
for ( Map.Entry<Class, Long> filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) {
for ( Map.Entry<String, Long> filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) {
long count = filterCounts.getValue();
logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s",
count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), Utils.getClassName(filterCounts.getKey())));
count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey()));
}
if ( performanceLog != null ) performanceLog.close();

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
@ -57,9 +58,9 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
return "dups";
}
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter) {
private List<GATKSAMRecord> readsAtLoc(final GATKSAMRecord read, PushbackIterator<SAMRecord> iter) {
GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read);
ArrayList<SAMRecord> l = new ArrayList<SAMRecord>();
ArrayList<GATKSAMRecord> l = new ArrayList<GATKSAMRecord>();
l.add(read);
for (SAMRecord read2 : iter) {
@ -70,7 +71,7 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
iter.pushback(read2);
break;
} else {
l.add(read2);
l.add((GATKSAMRecord) read2);
}
}
@ -84,15 +85,15 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
* @param reads the list of reads to split into unique molecular samples
* @return
*/
protected Set<List<SAMRecord>> uniqueReadSets(List<SAMRecord> reads) {
Set<List<SAMRecord>> readSets = new LinkedHashSet<List<SAMRecord>>();
protected Set<List<GATKSAMRecord>> uniqueReadSets(List<GATKSAMRecord> reads) {
Set<List<GATKSAMRecord>> readSets = new LinkedHashSet<List<GATKSAMRecord>>();
// for each read, find duplicates, and either add the read to its duplicate list or start a new one
for ( SAMRecord read : reads ) {
List<SAMRecord> readSet = findDuplicateReads(read, readSets);
for ( GATKSAMRecord read : reads ) {
List<GATKSAMRecord> readSet = findDuplicateReads(read, readSets);
if ( readSet == null ) {
readSets.add(new ArrayList<SAMRecord>(Arrays.asList(read))); // copy so I can add to the list
readSets.add(new ArrayList<GATKSAMRecord>(Arrays.asList(read))); // copy so I can add to the list
} else {
readSet.add(read);
}
@ -110,13 +111,13 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
* @param readSets
* @return The list of duplicate reads that read is a member of, or null if it's the only one of its kind
*/
protected List<SAMRecord> findDuplicateReads(SAMRecord read, Set<List<SAMRecord>> readSets ) {
protected List<GATKSAMRecord> findDuplicateReads(GATKSAMRecord read, Set<List<GATKSAMRecord>> readSets ) {
if ( read.getReadPairedFlag() ) {
// paired
final GenomeLoc readMateLoc = engine.getGenomeLocParser().createGenomeLoc(read.getMateReferenceName(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
for (List<SAMRecord> reads : readSets) {
SAMRecord key = reads.get(0);
for (List<GATKSAMRecord> reads : readSets) {
GATKSAMRecord key = reads.get(0);
// read and key start at the same place, and either the this read and the key
// share a mate location or the read is flagged as a duplicate
@ -131,8 +132,8 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
}
}
} else {
for (List<SAMRecord> reads : readSets) {
SAMRecord key = reads.get(0);
for (List<GATKSAMRecord> reads : readSets) {
GATKSAMRecord key = reads.get(0);
boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength();
//System.out.printf("%s %s %b %b %d %d %d %d => %b%n",
// read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(),
@ -179,7 +180,7 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
// get the genome loc from the read
GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read);
Set<List<SAMRecord>> readSets = uniqueReadSets(readsAtLoc(read, iter));
Set<List<GATKSAMRecord>> readSets = uniqueReadSets(readsAtLoc((GATKSAMRecord) read, iter));
if ( DEBUG ) logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size()));
// Jump forward in the reference to this locus location

View File

@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
@ -100,9 +101,9 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
// if the read is mapped, create a metadata tracker
ReadMetaDataTracker tracker = (read.getReferenceIndex() >= 0) ? rodView.getReferenceOrderedDataForRead(read) : null;
final boolean keepMeP = walker.filter(refContext, read);
final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read);
if (keepMeP) {
M x = walker.map(refContext, read, tracker); // the tracker can be null
M x = walker.map(refContext, (GATKSAMRecord) read, tracker); // the tracker can be null
sum = walker.reduce(x, sum);
}

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
@ -43,6 +42,7 @@ import org.broadinstitute.sting.utils.clipreads.ClippingOp;
import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation;
import org.broadinstitute.sting.utils.clipreads.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.File;
@ -292,11 +292,12 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithD
/**
* The reads map function.
*
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a SAMRecord
* @param read the read itself, as a GATKSAMRecord
* @return the ReadClipper object describing what should be done to clip this read
*/
public ReadClipperWithData map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
public ReadClipperWithData map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES ) {
read = ReadUtils.replaceSoftClipsWithMatches(read);
@ -323,7 +324,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithD
*/
private void clipSequences(ReadClipperWithData clipper) {
if (sequencesToClip != null) { // don't bother if we don't have any sequences to clip
SAMRecord read = clipper.getRead();
GATKSAMRecord read = clipper.getRead();
ClippingData data = clipper.getData();
for (SeqToClip stc : sequencesToClip) {
@ -360,7 +361,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithD
* @param stop
* @return
*/
private Pair<Integer, Integer> strandAwarePositions(SAMRecord read, int start, int stop) {
private Pair<Integer, Integer> strandAwarePositions(GATKSAMRecord read, int start, int stop) {
if (read.getReadNegativeStrandFlag())
return new Pair<Integer, Integer>(read.getReadLength() - stop - 1, read.getReadLength() - start - 1);
else
@ -374,7 +375,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithD
*/
private void clipCycles(ReadClipperWithData clipper) {
if (cyclesToClip != null) {
SAMRecord read = clipper.getRead();
GATKSAMRecord read = clipper.getRead();
ClippingData data = clipper.getData();
for (Pair<Integer, Integer> p : cyclesToClip) { // iterate over each cycle range
@ -416,7 +417,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithD
* @param clipper
*/
private void clipBadQualityScores(ReadClipperWithData clipper) {
SAMRecord read = clipper.getRead();
GATKSAMRecord read = clipper.getRead();
ClippingData data = clipper.getData();
int readLen = read.getReadBases().length;
byte[] quals = read.getBaseQualities();
@ -458,7 +459,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithD
if ( clipper == null )
return data;
SAMRecord clippedRead = clipper.clipRead(clippingRepresentation);
GATKSAMRecord clippedRead = clipper.clipRead(clippingRepresentation);
if (outputBam != null) {
outputBam.addAlignment(clippedRead);
} else {
@ -575,7 +576,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithD
public class ReadClipperWithData extends ReadClipper {
private ClippingData data;
public ReadClipperWithData(SAMRecord read, List<SeqToClip> clipSeqs) {
public ReadClipperWithData(GATKSAMRecord read, List<SeqToClip> clipSeqs) {
super(read);
data = new ClippingData(clipSeqs);
}

View File

@ -1,10 +1,10 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.List;
import java.util.Set;
@ -20,11 +20,11 @@ import java.util.Set;
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class})
public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
public boolean filter(GenomeLoc loc, AlignmentContext context, Set<List<GATKSAMRecord>> readSets ) {
return true; // We are keeping all the reads
}
public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets );
public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set<List<GATKSAMRecord>> readSets );
// Given result of map function
public abstract ReduceType reduceInit();

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
@ -71,21 +72,23 @@ public class FindReadsWithNamesWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* The reads filter function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a SAMRecord
* @return true if the read passes the filter, false if it doesn't
*/
public boolean filter(ReferenceContext ref, SAMRecord read) {
public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
return namesToKeep.contains(read.getReadName());
}
/**
* The reads map function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a SAMRecord
* @return the read itself
*/
public SAMRecord map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) {
public SAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
return read;
}

View File

@ -1,9 +1,9 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
import java.text.DecimalFormat;
@ -119,7 +119,7 @@ public class FlagStatWalker extends ReadWalker<Integer, Integer> {
private FlagStat myStat = new FlagStat();
public Integer map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) {
public Integer map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
myStat.readCount++;
if (read.getReadFailsVendorQualityCheckFlag()) {
myStat.QC_failure++;

View File

@ -17,7 +17,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
*/
@By(DataSource.READS)
@Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.INTERVAL)
@PartitionBy(PartitionType.LOCUS)
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
public abstract class LocusWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?

View File

@ -34,6 +34,12 @@ public enum PartitionType {
*/
NONE,
/**
* The walker inputs can be chunked down to individual
* reads.
*/
READ,
/**
* The walker inputs can be chunked down to the
* per-locus level.

View File

@ -40,6 +40,7 @@ import java.util.TreeSet;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
@ -136,11 +137,12 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
/**
* The reads filter function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a SAMRecord
* @return true if the read passes the filter, false if it doesn't
*/
public boolean filter(ReferenceContext ref, SAMRecord read) {
public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
// check the read group
if ( readGroup != null ) {
SAMReadGroupRecord myReadGroup = read.getReadGroup();
@ -180,11 +182,12 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
/**
* The reads map function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a SAMRecord
* @return the read itself
*/
public SAMRecord map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) {
public SAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
return read;
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* Created by IntelliJ IDEA.
@ -12,7 +13,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
* To change this template use File | Settings | File Templates.
*/
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.CONTIG)
@PartitionBy(PartitionType.READ)
public abstract class ReadWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
public boolean requiresOrderedReads() { return false; }
@ -20,11 +21,11 @@ public abstract class ReadWalker<MapType, ReduceType> extends Walker<MapType, Re
/** Must return true for reads that need to be processed. Reads, for which this method return false will
* be skipped by the engine and never passed to the walker.
*/
public boolean filter(ReferenceContext ref, SAMRecord read) {
public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
// We are keeping all the reads
return true;
}
// Map over the org.broadinstitute.sting.gatk.contexts.AlignmentContext
public abstract MapType map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker);
public abstract MapType map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker);
}

View File

@ -33,6 +33,7 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.ArrayList;
@ -60,7 +61,7 @@ public class SplitSamFileWalker extends ReadWalker<SAMRecord, Map<String, SAMFil
logger.info("SplitSamFile version: " + VERSION);
}
public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
public SAMRecord map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
return read;
}

View File

@ -25,15 +25,17 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.filters.MalformedReadFilter;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.GenericDocumentationHandler;
import java.util.List;
@ -77,6 +79,23 @@ public abstract class Walker<MapType, ReduceType> {
return toolkit;
}
/**
* Gets the master sequence dictionary for this walker
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
* @return
*/
protected SAMSequenceDictionary getMasterSequenceDictionary() {
return getToolkit().getMasterSequenceDictionary();
}
protected SampleDB getSampleDB() {
return getToolkit().getSampleDB();
}
protected Sample getSample(final String id) {
return getToolkit().getSampleDB().getSample(id);
}
/**
* (conceptual static) method that states whether you want to see reads piling up at a locus
* that contain a deletion at the locus.

View File

@ -92,7 +92,7 @@ public class AlleleBalance extends InfoFieldAnnotation {
continue;
}
// todo -- actually care about indel length from the pileup (agnostic at the moment)
int refCount = indelPileup.size();
int refCount = indelPileup.getNumberOfElements();
int altCount = vc.isSimpleInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions();
if ( refCount + altCount == 0 ) {

View File

@ -47,7 +47,7 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim
if (!g.isHet())
return null;
Set<Allele> altAlleles = vc.getAlternateAlleles();
Collection<Allele> altAlleles = vc.getAlternateAlleles();
if ( altAlleles.size() == 0 )
return null;

Some files were not shown because too many files have changed in this diff Show More