From 307c8ca02782c1f82ac5920664ec9fdf85723928 Mon Sep 17 00:00:00 2001 From: kshakir Date: Fri, 13 Aug 2010 23:52:24 +0000 Subject: [PATCH] Created a new playground script for cleaning bams in Firehose. Some refactoring of Queue extensions for reusability in scripts. Putting the extensions into the Queue.jar after building them. More updates to GATK walker arguments specifying @Input and @Output for Queue. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4032 348d0f76-0448-11de-a6fe-93d51630548a --- build.xml | 59 +++----- .../arguments/GATKArgumentCollection.java | 2 +- .../gatk/walkers/indels/IndelRealigner.java | 14 +- packages/Queue.xml | 4 +- scala/qscript/kshakir/CleanBamFile.scala | 143 ++++++++++++++++++ .../sting/queue/QSettings.scala | 2 +- .../extensions/gatk/BamGatherFunction.scala | 15 +- .../picard/PicardBamJarFunction.scala | 27 ++++ 8 files changed, 211 insertions(+), 55 deletions(-) create mode 100644 scala/qscript/kshakir/CleanBamFile.scala create mode 100644 scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamJarFunction.scala diff --git a/build.xml b/build.xml index d640e6339..224bb6417 100644 --- a/build.xml +++ b/build.xml @@ -12,8 +12,8 @@ - - + + @@ -66,8 +66,8 @@ - - + + @@ -113,7 +113,7 @@ - + @@ -143,7 +143,7 @@ - + @@ -211,21 +211,21 @@ - - + + Generating Queue GATK extensions... - + - + - - + + Building Queue GATK extensions... - + @@ -245,7 +245,7 @@ - + @@ -295,21 +295,21 @@ - + + + + + + + - - - - - - @@ -352,12 +352,6 @@ - - - - - - @@ -556,7 +550,7 @@ - + @@ -564,14 +558,11 @@ - + + + - - - - - diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index fe8821a61..33e8d3a22 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -171,7 +171,7 @@ public class GATKArgumentCollection { public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; @ElementList(required = false) - @Argument(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 153138768..ac8a1a384 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -28,7 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.*; import net.sf.samtools.util.StringUtil; import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -44,8 +44,6 @@ import org.broadinstitute.sting.utils.text.TextFormattingUtils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.CommandLineUtils; import java.io.File; import java.io.FileWriter; @@ -63,7 +61,7 @@ public class IndelRealigner extends ReadWalker { public static final String ORIGINAL_POSITION_TAG = "OP"; public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner"; - @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) + @Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) protected String intervalsFile = null; @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) @@ -72,7 +70,7 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) protected double MISMATCH_THRESHOLD = 0.15; - @Argument(fullName="output", shortName="O", required=false, doc="Output bam") + @Output(fullName="output", shortName="O", required=false, doc="Output bam") protected String writerFilename = null; @Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]") @@ -115,15 +113,15 @@ public class IndelRealigner extends ReadWalker { // DEBUGGING OPTIONS FOLLOW @Hidden - @Argument(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") + @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") protected String OUT_INDELS = null; @Hidden - @Argument(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false) + @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false) protected String OUT_STATS = null; @Hidden - @Argument(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false) + @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false) protected String OUT_SNPS = null; // the intervals input by the user diff --git a/packages/Queue.xml b/packages/Queue.xml index 951064070..c7a6dda72 100644 --- a/packages/Queue.xml +++ b/packages/Queue.xml @@ -11,8 +11,8 @@ - - + + diff --git a/scala/qscript/kshakir/CleanBamFile.scala b/scala/qscript/kshakir/CleanBamFile.scala new file mode 100644 index 000000000..729cea827 --- /dev/null +++ b/scala/qscript/kshakir/CleanBamFile.scala @@ -0,0 +1,143 @@ +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ + +class CleanBamFile extends QScript { + qscript => + + @Argument(doc="gatk jar", shortName="gatk") + var gatkJar: File = _ + + @Argument(doc="fix mates jar", shortName="fixMates") + var fixMatesJar: File = _ + + @Input(doc="Script that can merge text files, for example Sting/shell/mergeText.sh.", shortName="MTS") + var mergeTextScript: String = _ + + @Argument(doc="base name for output files", shortName="base") + var baseName: String = _ + + @Input(doc="reference genome", shortName="R") + var referenceFile: File = _ + + @Input(doc="recalibrated bam", shortName="I") + var recalibratedBam: File = _ + + @Argument(doc="read group blacklist conversion script that can convert firehose outputs to a GATK blacklist file.", shortName="RGBLS") + var readGroupBlackListScript: String = _ + + @Argument(doc="read group blacklist", shortName="RGBL", required=false) + var readGroupBlackList: String = _ + + @Argument(doc="intervals", shortName="L", required=false) + var intervals: File = _ + + @Argument(doc="Script that can split the interval file by contig, for example Sting/python/splitIntervalsByContig.py.", shortName="RTCSS") + var targetCreatorScatterScript: String = _ + + @Argument(doc="RealignerTargetCreator scatter count. " + + "Best if it is either 1 or the number of contigs in the interval list. " + + "If used the compute farm must also be used.", shortName="RTCSC") + var targetCreatorScatterCount = 0 + + @Argument(doc="Script that can split the intervals evenly, for example Sting/shell/splitIntervals.sh.", shortName="IRSS") + var indelRealignerScatterScript: String = _ + + @Argument(doc="IndelRealigner scatter count.", shortName="IRSC") + var indelRealignerScatterCount = 0 + + @Input(doc="dbsnp file", shortName="D") + var dbsnpFile: File = _ + + trait GATKCommonArgs extends CommandLineGATK { + this.jarFile = qscript.gatkJar + this.reference_sequence = qscript.referenceFile + this.intervals = qscript.intervals + this.input_file :+= recalibratedBam + this.cleanupTempDirectories = true + } + + def baseFile(suffix: String) = new File(baseName + suffix) + + def script = { + val blacklistConverter = new CommandLineFunction { + @Output(doc="blacklist file") var blacklistFile: File = _ + def commandLine = readGroupBlackListScript + " " + blacklistFile + " " + readGroupBlackList + } + + if (readGroupBlackList != null) { + blacklistConverter.blacklistFile = baseFile(".blacklist.txt") + add(blacklistConverter) + } + + // -T RealignerTargetCreator -I -R -o .merged.intervals + val targetCreator = new RealignerTargetCreator with GATKCommonArgs + targetCreator.memoryLimit = Some(2) + targetCreator.read_group_black_list :+= blacklistConverter.blacklistFile + targetCreator.out = baseFile(".merged.intervals") + targetCreator.scatterCount = targetCreatorScatterCount + targetCreator.setupScatterFunction = { + case (scatter: IntervalScatterFunction, _) => + scatter.splitIntervalsScript = targetCreatorScatterScript + } + targetCreator.setupGatherFunction = { + case (gather: SimpleTextGatherFunction, _) => + gather.mergeTextScript = mergeTextScript + } + + // -T IndelRealigner -I -R -stats .indel.stats + // -O .unfixed.cleaned.bam -maxInRam 200000 -targetIntervals -D + val realigner = new IndelRealigner with GATKCommonArgs + realigner.memoryLimit = Some(4) + realigner.read_group_black_list :+= blacklistConverter.blacklistFile + realigner.statisticsFileForDebugging = baseFile(".indel.stats") + realigner.maxReadsInRam = Some(200000) + realigner.targetIntervals = targetCreator.out + realigner.DBSNP = dbsnpFile + realigner.scatterCount = indelRealignerScatterCount + + val bamIndex = new BamIndexFunction + + if (realigner.scatterCount > 1) { + realigner.output = baseFile(".cleaned.bam") + // While gathering run fix mates. + realigner.setupScatterFunction = { + case (scatter: IntervalScatterFunction, _) => + scatter.splitIntervalsScript = indelRealignerScatterScript + } + realigner.setupGatherFunction = { + case (gather: PicardBamJarFunction, _) => + gather.memoryLimit = Some(4) + gather.jarFile = fixMatesJar + // Don't pass this AS=true to fix mates! + gather.assumeSorted = None + case (gather: SimpleTextGatherFunction, _) => + gather.mergeTextScript = mergeTextScript + } + + bamIndex.bamFile = realigner.output + } else { + realigner.output = baseFile(".unfixed.cleaned.bam") + + // Explicitly run fix mates if the function won't be scattered. + var fixMates = new PicardBamJarFunction { + // Declare inputs/outputs for dependency tracking. + @Input(doc="unfixed bam") var unfixed: File = _ + @Output(doc="fixed bam") var fixed: File = _ + def inputBams = List(unfixed) + def outputBam = fixed + } + fixMates.memoryLimit = Some(4) + fixMates.jarFile = fixMatesJar + fixMates.unfixed = realigner.output + fixMates.fixed = baseFile(".cleaned.bam") + + bamIndex.bamFile = fixMates.fixed + + // Add the fix mates explicitly + add(fixMates) + } + + add(targetCreator, realigner, bamIndex) + } +} diff --git a/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 1ed32f1c7..b97bf176f 100644 --- a/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -12,7 +12,7 @@ class QSettings { var jobNamePrefix: String = QSettings.processNamePrefix @Argument(fullName="job_queue", shortName="jobQueue", doc="Default queue for compute farm jobs.", required=false) - var jobQueue: String = "broad" + var jobQueue: String = _ @Argument(fullName="job_project", shortName="jobProject", doc="Default project for compute farm jobs.", required=false) var jobProject: String = "Queue" diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala index 13ce477c4..8a442f6f5 100644 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala @@ -1,17 +1,14 @@ package org.broadinstitute.sting.queue.extensions.gatk -import org.broadinstitute.sting.queue.function.JarCommandLineFunction -import org.broadinstitute.sting.commandline.Argument import org.broadinstitute.sting.queue.function.scattergather.GatherFunction +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction /** - * Merges BAM files using Picards MergeSampFiles.jar. + * Merges BAM files using Picards MergeSamFiles.jar. * At the Broad the jar can be found at /seq/software/picard/current/bin/MergeSamFiles.jar. Outside the broad see http://picard.sourceforge.net/") */ -class BamGatherFunction extends GatherFunction with JarCommandLineFunction { - @Argument(doc="Compression level 1-9", required=false) - var compressionLevel: Option[Int] = None - - override def commandLine = super.commandLine + "%s%s%s".format( - optional(" COMPRESSION_LEVEL=", compressionLevel), " AS=true VALIDATION_STRINGENCY=SILENT SO=coordinate OUTPUT=" + originalOutput, repeat(" INPUT=", gatherParts)) +class BamGatherFunction extends GatherFunction with PicardBamJarFunction { + this.assumeSorted = Some(true) + protected def inputBams = gatherParts + protected def outputBam = originalOutput } diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamJarFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamJarFunction.scala new file mode 100644 index 000000000..efd861f9b --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamJarFunction.scala @@ -0,0 +1,27 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.queue.function.JarCommandLineFunction +import java.io.File + +/** + * Wraps a Picard jar that operates on BAM files. + * See http://picard.sourceforge.net/ for more info. + * + * Since the jar files take slightly different arguments + * some values are optional. + */ +trait PicardBamJarFunction extends JarCommandLineFunction { + var validationStringency = "SILENT" + var sortOrder = "coordinate" + var compressionLevel: Option[Int] = None + var maxRecordsInRam: Option[Int] = None + var assumeSorted: Option[Boolean] = None + + protected def inputBams: List[File] + protected def outputBam: File + + override def commandLine = super.commandLine + "%s%s%s".format( + optional(" COMPRESSION_LEVEL=", compressionLevel), optional(" VALIDATION_STRINGENCY=", validationStringency), + optional(" SO=", sortOrder), optional( " MAX_RECORDS_IN_RAM=", maxRecordsInRam), optional(" ASSUME_SORTED=", assumeSorted), + " OUTPUT=" + outputBam, repeat(" INPUT=", inputBams), " TMP_DIR=" + jobTempDir) +}