Created a new playground script for cleaning bams in Firehose.

Some refactoring of Queue extensions for reusability in scripts.
Putting the extensions into the Queue.jar after building them.
More updates to GATK walker arguments specifying @Input and @Output for Queue.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4032 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2010-08-13 23:52:24 +00:00
parent dfe2922b5e
commit 307c8ca027
8 changed files with 211 additions and 55 deletions

View File

@ -12,8 +12,8 @@
<property name="queue.source.dir" value="scala/src" />
<property name="queue.classes" value="${build.dir}/scala/classes" />
<property name="queue-gatk-extensions.source.dir" value="${build.dir}/queue-gatk-extensions/src" />
<property name="queue-gatk-extensions.classes" value="${build.dir}/queue-gatk-extensions/classes" />
<property name="queue-extensions.source.dir" value="${build.dir}/queue-extensions/src" />
<property name="queue-extensions.classes" value="${build.dir}/queue-extensions/classes" />
<!-- where to find the tribble distro -->
<property name="tribble.dir" value="tribble" />
@ -66,8 +66,8 @@
<pathelement location="${java.classes}" />
</path>
<!-- Path to queue-gatk-extendsions dependencies. -->
<path id="queue-gatk-extensions.dependencies">
<!-- Path to queue-extensions dependencies. -->
<path id="queue-extensions.dependencies">
<path refid="runtime.dependencies" />
<pathelement location="${java.classes}" />
<pathelement location="${queue.classes}" />
@ -113,7 +113,7 @@
</target>
<target name="init.buildall">
<!-- Set the properties needed to build Queue and the Queue GATK Extensions -->
<!-- Set the properties needed to build Queue -->
<property name="gatk.target" value="oneoffs"/>
<property name="queue.target" value="core"/>
</target>
@ -143,7 +143,7 @@
<equals arg1="${env.QUEUE_BUILD_TYPE}" arg2="$${env.QUEUE_BUILD_TYPE}" />
</condition>
<!-- If the queue target is set, or if the queue-gatk-extensions needs to be built, then include all queue tasks. -->
<!-- If the queue target is set, or if the queue-extensions needs to be built, then include all queue tasks. -->
<condition property="queue.include">
<or>
<not><equals arg1="${queue.target}" arg2="none" /></not>
@ -211,21 +211,21 @@
</target>
<!-- NOTE: Extracting help first to avoid "Unable to load help text. Help output will be sparse." warning message. -->
<target name="queue-gatk-extensions.generate" depends="gatk.compile, queue.compile, extracthelp" if="queue.include" description="generate GATK modules for Queue">
<mkdir dir="${queue-gatk-extensions.source.dir}"/>
<target name="queue-extensions.generate" depends="gatk.compile, queue.compile, extracthelp" if="queue.include" description="generate GATK modules for Queue">
<mkdir dir="${queue-extensions.source.dir}"/>
<echo>Generating Queue GATK extensions...</echo>
<java fork="true" failonerror="true" classname="org.broadinstitute.sting.queue.extensions.gatk.GATKExtensionsGenerator" classpathref="queue-gatk-extensions.dependencies">
<java fork="true" failonerror="true" classname="org.broadinstitute.sting.queue.extensions.gatk.GATKExtensionsGenerator" classpathref="queue-extensions.dependencies">
<arg value="-outDir" />
<arg path="${queue-gatk-extensions.source.dir}" />
<arg path="${queue-extensions.source.dir}" />
<arg value="-l" />
<arg value="WARN" />
</java>
</target>
<target name="queue-gatk-extensions.compile" depends="queue-gatk-extensions.generate" if="queue.include" description="compile GATK modules for Queue">
<mkdir dir="${queue-gatk-extensions.classes}"/>
<target name="queue-extensions.compile" depends="queue-extensions.generate" if="queue.include" description="compile extensions for Queue">
<mkdir dir="${queue-extensions.classes}"/>
<echo>Building Queue GATK extensions...</echo>
<scalac srcdir="${queue-gatk-extensions.source.dir}" destdir="${queue-gatk-extensions.classes}" classpathref="queue-gatk-extensions.dependencies" deprecation="yes" unchecked="yes">
<scalac srcdir="${queue-extensions.source.dir}" destdir="${queue-extensions.classes}" classpathref="queue-extensions.dependencies" deprecation="yes" unchecked="yes">
<include name="**/*.scala"/>
</scalac>
</target>
@ -245,7 +245,7 @@
</javadoc>
</target>
<target name="sting.compile" depends="gatk.compile, queue.compile, queue-gatk-extensions.compile" />
<target name="sting.compile" depends="gatk.compile, queue.compile, queue-extensions.compile" />
<target name="init.jar" depends="sting.compile,extracthelp">
<mkdir dir="${dist.dir}"/>
@ -295,21 +295,21 @@
</target>
<target name="queue.jar" depends="queue.compile, queue-gatk-extensions.compile, init.jar" if="queue.include">
<target name="queue.jar" depends="queue.compile, queue-extensions.compile, init.jar" if="queue.include">
<jar jarfile="${dist.dir}/Queue.jar">
<fileset dir="${queue.classes}">
<include name="org/broadinstitute/sting/queue/**/*.class"/>
</fileset>
<fileset dir="${java.classes}">
<include name="org/broadinstitute/sting/queue/**/*.class" />
</fileset>
<fileset dir="${queue-extensions.classes}">
<include name="**/*.class" />
</fileset>
<manifest>
<attribute name="Main-Class" value="org.broadinstitute.sting.queue.QCommandLine" />
</manifest>
</jar>
<jar jarfile="${dist.dir}/QueueGATKExtensions.jar">
<fileset dir="${queue-gatk-extensions.classes}">
<include name="**/*.class" />
</fileset>
</jar>
</target>
<target name="sting.jar" depends="sting-utils.jar, gatk.jar, queue.jar" />
@ -352,12 +352,6 @@
<attribute name="Class-Path" value="${jar.classpath}" />
</manifest>
</jar>
<jar jarfile="${dist.dir}/QueueGATKExtensions.jar" update="true" >
<manifest>
<attribute name="Class-Path" value="${jar.classpath}" />
</manifest>
</jar>
</target>
<target name="sting.manifests" depends="sting-utils.manifests, gatk.manifests, queue.manifests" />
@ -556,7 +550,7 @@
<delete dir="javadoc"/>
</target>
<target name="javadoc" depends="init.buildall,resolve,queue-gatk-extensions.generate" description="generates javadoc">
<target name="javadoc" depends="init.buildall,resolve,queue-extensions.generate" description="generates javadoc">
<mkdir dir="javadoc"/>
<javadoc destdir="javadoc"
classpathref="runtime.dependencies">
@ -564,14 +558,11 @@
</javadoc>
<mkdir dir="javadoc/queue"/>
<scaladoc srcdir="${queue.source.dir}" destdir="javadoc/queue" classpathref="queue.dependencies" deprecation="yes" unchecked="yes">
<scaladoc srcdir="" destdir="javadoc/queue" classpathref="queue-extensions.dependencies" deprecation="yes" unchecked="yes">
<src path="${queue.source.dir}"/>
<src path="${queue-extensions.source.dir}"/>
<include name="org/broadinstitute/sting/queue/**/*.scala"/>
</scaladoc>
<mkdir dir="javadoc/queue-gatk-extensions"/>
<scaladoc srcdir="${queue-gatk-extensions.source.dir}" destdir="javadoc/queue-gatk-extensions" classpathref="queue-gatk-extensions.dependencies" deprecation="yes" unchecked="yes">
<include name="org/broadinstitute/sting/queue/extensions/**/*.scala"/>
</scaladoc>
</target>
<!-- Unzip all classes from their current locations and assemble them in a staging directory -->

View File

@ -171,7 +171,7 @@ public class GATKArgumentCollection {
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
@ElementList(required = false)
@Argument(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line.", required = false)
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line.", required = false)
public List<String> readGroupBlackList = null;
/**

View File

@ -28,7 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.*;
import net.sf.samtools.util.StringUtil;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -44,8 +44,6 @@ import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.CommandLineUtils;
import java.io.File;
import java.io.FileWriter;
@ -63,7 +61,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
public static final String ORIGINAL_POSITION_TAG = "OP";
public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner";
@Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
@Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
protected String intervalsFile = null;
@Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
@ -72,7 +70,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
@Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
protected double MISMATCH_THRESHOLD = 0.15;
@Argument(fullName="output", shortName="O", required=false, doc="Output bam")
@Output(fullName="output", shortName="O", required=false, doc="Output bam")
protected String writerFilename = null;
@Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]")
@ -115,15 +113,15 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
// DEBUGGING OPTIONS FOLLOW
@Hidden
@Argument(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY")
@Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY")
protected String OUT_INDELS = null;
@Hidden
@Argument(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false)
@Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false)
protected String OUT_STATS = null;
@Hidden
@Argument(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false)
@Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false)
protected String OUT_SNPS = null;
// the intervals input by the user

View File

@ -11,8 +11,8 @@
<package name="org.broadinstitute.sting.queue.function.*" />
<package name="org.broadinstitute.sting.queue.util" />
<!-- Queue GATK Extensions -->
<package name="org.broadinstitute.sting.queue.extensions.gatk" />
<!-- Queue Extensions -->
<package name="org.broadinstitute.sting.queue.extensions.*" />
</dependencies>
</executable>
</package>

View File

@ -0,0 +1,143 @@
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
/**
 * Queue script that cleans a recalibrated BAM file.
 *
 * The functions added by this script are, in order:
 *  - an optional command line function that converts the supplied read group
 *    blacklist into a GATK blacklist file,
 *  - RealignerTargetCreator, writing <base>.merged.intervals,
 *  - IndelRealigner, using those intervals,
 *  - a Picard fix mates step, run either as the gather function (when the
 *    realigner is scattered) or as an explicit function (when it is not),
 *  - BamIndexFunction on the final <base>.cleaned.bam.
 */
class CleanBamFile extends QScript {
  // Alias for the script so nested traits/functions can reach its arguments.
  qscript =>

  @Argument(doc="gatk jar", shortName="gatk")
  var gatkJar: File = _

  @Argument(doc="fix mates jar", shortName="fixMates")
  var fixMatesJar: File = _

  @Input(doc="Script that can merge text files, for example Sting/shell/mergeText.sh.", shortName="MTS")
  var mergeTextScript: String = _

  @Argument(doc="base name for output files", shortName="base")
  var baseName: String = _

  @Input(doc="reference genome", shortName="R")
  var referenceFile: File = _

  @Input(doc="recalibrated bam", shortName="I")
  var recalibratedBam: File = _

  @Argument(doc="read group blacklist conversion script that can convert firehose outputs to a GATK blacklist file.", shortName="RGBLS")
  var readGroupBlackListScript: String = _

  @Argument(doc="read group blacklist", shortName="RGBL", required=false)
  var readGroupBlackList: String = _

  @Argument(doc="intervals", shortName="L", required=false)
  var intervals: File = _

  @Argument(doc="Script that can split the interval file by contig, for example Sting/python/splitIntervalsByContig.py.", shortName="RTCSS")
  var targetCreatorScatterScript: String = _

  @Argument(doc="RealignerTargetCreator scatter count. " +
    "Best if it is either 1 or the number of contigs in the interval list. " +
    "If used the compute farm must also be used.", shortName="RTCSC")
  var targetCreatorScatterCount = 0

  @Argument(doc="Script that can split the intervals evenly, for example Sting/shell/splitIntervals.sh.", shortName="IRSS")
  var indelRealignerScatterScript: String = _

  @Argument(doc="IndelRealigner scatter count.", shortName="IRSC")
  var indelRealignerScatterCount = 0

  @Input(doc="dbsnp file", shortName="D")
  var dbsnpFile: File = _

  /** Settings shared by every GATK function created by this script. */
  trait GATKCommonArgs extends CommandLineGATK {
    this.jarFile = qscript.gatkJar
    this.reference_sequence = qscript.referenceFile
    this.intervals = qscript.intervals
    this.input_file :+= recalibratedBam
    this.cleanupTempDirectories = true
  }

  /** Returns a file named by appending the suffix to the base name argument. */
  def baseFile(suffix: String) = new File(baseName + suffix)

  /** Builds the functions described in the class comment and adds them to the script. */
  def script = {
    val blacklistConverter = new CommandLineFunction {
      @Output(doc="blacklist file") var blacklistFile: File = _
      def commandLine = readGroupBlackListScript + " " + blacklistFile + " " + readGroupBlackList
    }

    // The converter only runs, and only gets an output file, when a blacklist was supplied.
    if (readGroupBlackList != null) {
      blacklistConverter.blacklistFile = baseFile(".blacklist.txt")
      add(blacklistConverter)
    }

    // None when no blacklist was supplied. Used below so that a null file is
    // never appended to the read_group_black_list of the GATK functions.
    val convertedBlacklist = Option(blacklistConverter.blacklistFile)

    // -T RealignerTargetCreator -I <input.bam> -R <reference.genome> <interval.list> <blacklist.file> -o <base.name>.merged.intervals
    val targetCreator = new RealignerTargetCreator with GATKCommonArgs
    targetCreator.memoryLimit = Some(2)
    convertedBlacklist.foreach(file => targetCreator.read_group_black_list :+= file)
    targetCreator.out = baseFile(".merged.intervals")
    targetCreator.scatterCount = targetCreatorScatterCount
    targetCreator.setupScatterFunction = {
      case (scatter: IntervalScatterFunction, _) =>
        scatter.splitIntervalsScript = targetCreatorScatterScript
    }
    targetCreator.setupGatherFunction = {
      case (gather: SimpleTextGatherFunction, _) =>
        gather.mergeTextScript = mergeTextScript
    }

    // -T IndelRealigner -I <input.bam> -R <reference.genome> <blacklist.file> -stats <base.name>.indel.stats
    // -O <base.name>.unfixed.cleaned.bam -maxInRam 200000 -targetIntervals <merged.intervals> -D <dbsnp.file>
    val realigner = new IndelRealigner with GATKCommonArgs
    realigner.memoryLimit = Some(4)
    convertedBlacklist.foreach(file => realigner.read_group_black_list :+= file)
    realigner.statisticsFileForDebugging = baseFile(".indel.stats")
    realigner.maxReadsInRam = Some(200000)
    realigner.targetIntervals = targetCreator.out
    realigner.DBSNP = dbsnpFile
    realigner.scatterCount = indelRealignerScatterCount

    val bamIndex = new BamIndexFunction

    if (realigner.scatterCount > 1) {
      realigner.output = baseFile(".cleaned.bam")

      // While gathering run fix mates.
      realigner.setupScatterFunction = {
        case (scatter: IntervalScatterFunction, _) =>
          scatter.splitIntervalsScript = indelRealignerScatterScript
      }
      realigner.setupGatherFunction = {
        case (gather: PicardBamJarFunction, _) =>
          gather.memoryLimit = Some(4)
          gather.jarFile = fixMatesJar
          // Don't pass this AS=true to fix mates!
          gather.assumeSorted = None
        case (gather: SimpleTextGatherFunction, _) =>
          gather.mergeTextScript = mergeTextScript
      }

      bamIndex.bamFile = realigner.output
    } else {
      realigner.output = baseFile(".unfixed.cleaned.bam")

      // Explicitly run fix mates if the function won't be scattered.
      val fixMates = new PicardBamJarFunction {
        // Declare inputs/outputs for dependency tracking.
        @Input(doc="unfixed bam") var unfixed: File = _
        @Output(doc="fixed bam") var fixed: File = _
        def inputBams = List(unfixed)
        def outputBam = fixed
      }
      fixMates.memoryLimit = Some(4)
      fixMates.jarFile = fixMatesJar
      fixMates.unfixed = realigner.output
      fixMates.fixed = baseFile(".cleaned.bam")

      bamIndex.bamFile = fixMates.fixed

      // Add the fix mates explicitly
      add(fixMates)
    }

    add(targetCreator, realigner, bamIndex)
  }
}

View File

@ -12,7 +12,7 @@ class QSettings {
var jobNamePrefix: String = QSettings.processNamePrefix
@Argument(fullName="job_queue", shortName="jobQueue", doc="Default queue for compute farm jobs.", required=false)
var jobQueue: String = "broad"
var jobQueue: String = _
@Argument(fullName="job_project", shortName="jobProject", doc="Default project for compute farm jobs.", required=false)
var jobProject: String = "Queue"

View File

@ -1,17 +1,14 @@
package org.broadinstitute.sting.queue.extensions.gatk
import org.broadinstitute.sting.queue.function.JarCommandLineFunction
import org.broadinstitute.sting.commandline.Argument
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
/**
* Merges BAM files using Picards MergeSampFiles.jar.
* Merges BAM files using Picard's MergeSamFiles.jar.
* At the Broad the jar can be found at /seq/software/picard/current/bin/MergeSamFiles.jar. Outside the Broad see http://picard.sourceforge.net/
*/
class BamGatherFunction extends GatherFunction with JarCommandLineFunction {
@Argument(doc="Compression level 1-9", required=false)
var compressionLevel: Option[Int] = None
override def commandLine = super.commandLine + "%s%s%s".format(
optional(" COMPRESSION_LEVEL=", compressionLevel), " AS=true VALIDATION_STRINGENCY=SILENT SO=coordinate OUTPUT=" + originalOutput, repeat(" INPUT=", gatherParts))
class BamGatherFunction extends GatherFunction with PicardBamJarFunction {
this.assumeSorted = Some(true)
protected def inputBams = gatherParts
protected def outputBam = originalOutput
}

View File

@ -0,0 +1,27 @@
package org.broadinstitute.sting.queue.extensions.picard
import org.broadinstitute.sting.queue.function.JarCommandLineFunction
import java.io.File
/**
 * Wraps a Picard jar that operates on BAM files.
 * See http://picard.sourceforge.net/ for more info.
 *
 * Since the jar files take slightly different arguments
 * some values are optional.
 *
 * Implementors supply the input and output BAM files; this trait appends the
 * corresponding Picard arguments to the command line of the jar.
 */
trait PicardBamJarFunction extends JarCommandLineFunction {
  /** Value passed as VALIDATION_STRINGENCY. */
  var validationStringency = "SILENT"

  /** Value passed as SO. */
  var sortOrder = "coordinate"

  /** Value passed as COMPRESSION_LEVEL when set. */
  var compressionLevel: Option[Int] = None

  /** Value passed as MAX_RECORDS_IN_RAM when set. */
  var maxRecordsInRam: Option[Int] = None

  /** Value passed as ASSUME_SORTED when set. */
  var assumeSorted: Option[Boolean] = None

  /** BAM files passed to the jar, one INPUT= argument per file. */
  protected def inputBams: List[File]

  /** BAM file passed to the jar as OUTPUT=. */
  protected def outputBam: File

  /**
   * The jar command line followed by the Picard arguments.
   *
   * The fragments are concatenated rather than pushed through a format string:
   * a "%s%s%s" pattern only consumes its first three arguments and silently
   * ignores the rest, which would drop MAX_RECORDS_IN_RAM, ASSUME_SORTED,
   * OUTPUT, INPUT and TMP_DIR from the command.
   */
  override def commandLine = super.commandLine + List(
    optional(" COMPRESSION_LEVEL=", compressionLevel),
    optional(" VALIDATION_STRINGENCY=", validationStringency),
    optional(" SO=", sortOrder),
    optional(" MAX_RECORDS_IN_RAM=", maxRecordsInRam),
    optional(" ASSUME_SORTED=", assumeSorted),
    " OUTPUT=" + outputBam,
    repeat(" INPUT=", inputBams),
    " TMP_DIR=" + jobTempDir
  ).mkString
}