diff --git a/build.xml b/build.xml index d640e6339..224bb6417 100644 --- a/build.xml +++ b/build.xml @@ -12,8 +12,8 @@ - - + + @@ -66,8 +66,8 @@ - - + + @@ -113,7 +113,7 @@ - + @@ -143,7 +143,7 @@ - + @@ -211,21 +211,21 @@ - - + + Generating Queue GATK extensions... - + - + - - + + Building Queue GATK extensions... - + @@ -245,7 +245,7 @@ - + @@ -295,21 +295,21 @@ - + + + + + + + - - - - - - @@ -352,12 +352,6 @@ - - - - - - @@ -556,7 +550,7 @@ - + @@ -564,14 +558,11 @@ - + + + - - - - - diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index fe8821a61..33e8d3a22 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -171,7 +171,7 @@ public class GATKArgumentCollection { public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; @ElementList(required = false) - @Argument(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 153138768..ac8a1a384 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -28,7 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.*; import net.sf.samtools.util.StringUtil; import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -44,8 +44,6 @@ import org.broadinstitute.sting.utils.text.TextFormattingUtils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.CommandLineUtils; import java.io.File; import java.io.FileWriter; @@ -63,7 +61,7 @@ public class IndelRealigner extends ReadWalker { public static final String ORIGINAL_POSITION_TAG = "OP"; public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner"; - @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) + @Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) protected String intervalsFile = null; @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) @@ -72,7 +70,7 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) protected double MISMATCH_THRESHOLD = 0.15; - @Argument(fullName="output", shortName="O", required=false, doc="Output bam") + @Output(fullName="output", shortName="O", required=false, doc="Output bam") protected String writerFilename = null; @Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]") @@ -115,15 +113,15 @@ public class IndelRealigner extends ReadWalker { // DEBUGGING OPTIONS FOLLOW @Hidden - @Argument(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") + @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") protected String OUT_INDELS = null; @Hidden - @Argument(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false) + @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false) protected String OUT_STATS = null; @Hidden - @Argument(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false) + @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false) protected String OUT_SNPS = null; // the intervals input by the user diff --git a/packages/Queue.xml b/packages/Queue.xml index 951064070..c7a6dda72 100644 --- a/packages/Queue.xml +++ b/packages/Queue.xml @@ -11,8 +11,8 @@ - - + + diff --git a/scala/qscript/kshakir/CleanBamFile.scala b/scala/qscript/kshakir/CleanBamFile.scala new file mode 100644 index 000000000..729cea827 --- /dev/null +++ b/scala/qscript/kshakir/CleanBamFile.scala @@ -0,0 +1,143 @@ +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ + +class CleanBamFile extends QScript { + qscript => + + @Argument(doc="gatk jar", shortName="gatk") + var gatkJar: File = _ + + @Argument(doc="fix mates jar", shortName="fixMates") + var fixMatesJar: File = _ + + @Input(doc="Script that can merge text files, for example Sting/shell/mergeText.sh.", shortName="MTS") + var mergeTextScript: String = _ + + @Argument(doc="base name for output files", shortName="base") + var baseName: String = _ + + @Input(doc="reference genome", shortName="R") + var referenceFile: File = _ + + @Input(doc="recalibrated bam", shortName="I") + var recalibratedBam: File = _ + + @Argument(doc="read group blacklist conversion script that can convert firehose outputs to a GATK blacklist file.", shortName="RGBLS") + var readGroupBlackListScript: String = _ + + @Argument(doc="read group blacklist", shortName="RGBL", required=false) + var readGroupBlackList: String = _ + + @Argument(doc="intervals", shortName="L", required=false) + var intervals: File = _ + + @Argument(doc="Script that can split the interval file by contig, for example Sting/python/splitIntervalsByContig.py.", shortName="RTCSS") + var targetCreatorScatterScript: String = _ + + @Argument(doc="RealignerTargetCreator scatter count. " + + "Best if it is either 1 or the number of contigs in the interval list. " + + "If used the compute farm must also be used.", shortName="RTCSC") + var targetCreatorScatterCount = 0 + + @Argument(doc="Script that can split the intervals evenly, for example Sting/shell/splitIntervals.sh.", shortName="IRSS") + var indelRealignerScatterScript: String = _ + + @Argument(doc="IndelRealigner scatter count.", shortName="IRSC") + var indelRealignerScatterCount = 0 + + @Input(doc="dbsnp file", shortName="D") + var dbsnpFile: File = _ + + trait GATKCommonArgs extends CommandLineGATK { + this.jarFile = qscript.gatkJar + this.reference_sequence = qscript.referenceFile + this.intervals = qscript.intervals + this.input_file :+= recalibratedBam + this.cleanupTempDirectories = true + } + + def baseFile(suffix: String) = new File(baseName + suffix) + + def script = { + val blacklistConverter = new CommandLineFunction { + @Output(doc="blacklist file") var blacklistFile: File = _ + def commandLine = readGroupBlackListScript + " " + blacklistFile + " " + readGroupBlackList + } + + if (readGroupBlackList != null) { + blacklistConverter.blacklistFile = baseFile(".blacklist.txt") + add(blacklistConverter) + } + + // -T RealignerTargetCreator -I -R -o .merged.intervals + val targetCreator = new RealignerTargetCreator with GATKCommonArgs + targetCreator.memoryLimit = Some(2) + targetCreator.read_group_black_list :+= blacklistConverter.blacklistFile + targetCreator.out = baseFile(".merged.intervals") + targetCreator.scatterCount = targetCreatorScatterCount + targetCreator.setupScatterFunction = { + case (scatter: IntervalScatterFunction, _) => + scatter.splitIntervalsScript = targetCreatorScatterScript + } + targetCreator.setupGatherFunction = { + case (gather: SimpleTextGatherFunction, _) => + gather.mergeTextScript = mergeTextScript + } + + // -T IndelRealigner -I -R -stats .indel.stats + // -O .unfixed.cleaned.bam -maxInRam 200000 -targetIntervals -D + val realigner = new IndelRealigner with GATKCommonArgs + realigner.memoryLimit = Some(4) + realigner.read_group_black_list :+= blacklistConverter.blacklistFile + realigner.statisticsFileForDebugging = baseFile(".indel.stats") + realigner.maxReadsInRam = Some(200000) + realigner.targetIntervals = targetCreator.out + realigner.DBSNP = dbsnpFile + realigner.scatterCount = indelRealignerScatterCount + + val bamIndex = new BamIndexFunction + + if (realigner.scatterCount > 1) { + realigner.output = baseFile(".cleaned.bam") + // While gathering run fix mates. + realigner.setupScatterFunction = { + case (scatter: IntervalScatterFunction, _) => + scatter.splitIntervalsScript = indelRealignerScatterScript + } + realigner.setupGatherFunction = { + case (gather: PicardBamJarFunction, _) => + gather.memoryLimit = Some(4) + gather.jarFile = fixMatesJar + // Don't pass this AS=true to fix mates! + gather.assumeSorted = None + case (gather: SimpleTextGatherFunction, _) => + gather.mergeTextScript = mergeTextScript + } + + bamIndex.bamFile = realigner.output + } else { + realigner.output = baseFile(".unfixed.cleaned.bam") + + // Explicitly run fix mates if the function won't be scattered. + var fixMates = new PicardBamJarFunction { + // Declare inputs/outputs for dependency tracking. + @Input(doc="unfixed bam") var unfixed: File = _ + @Output(doc="fixed bam") var fixed: File = _ + def inputBams = List(unfixed) + def outputBam = fixed + } + fixMates.memoryLimit = Some(4) + fixMates.jarFile = fixMatesJar + fixMates.unfixed = realigner.output + fixMates.fixed = baseFile(".cleaned.bam") + + bamIndex.bamFile = fixMates.fixed + + // Add the fix mates explicitly + add(fixMates) + } + + add(targetCreator, realigner, bamIndex) + } +} diff --git a/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 1ed32f1c7..b97bf176f 100644 --- a/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -12,7 +12,7 @@ class QSettings { var jobNamePrefix: String = QSettings.processNamePrefix @Argument(fullName="job_queue", shortName="jobQueue", doc="Default queue for compute farm jobs.", required=false) - var jobQueue: String = "broad" + var jobQueue: String = _ @Argument(fullName="job_project", shortName="jobProject", doc="Default project for compute farm jobs.", required=false) var jobProject: String = "Queue" diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala index 13ce477c4..8a442f6f5 100644 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala @@ -1,17 +1,14 @@ package org.broadinstitute.sting.queue.extensions.gatk -import org.broadinstitute.sting.queue.function.JarCommandLineFunction -import org.broadinstitute.sting.commandline.Argument import org.broadinstitute.sting.queue.function.scattergather.GatherFunction +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction /** - * Merges BAM files using Picards MergeSampFiles.jar. + * Merges BAM files using Picards MergeSamFiles.jar. * At the Broad the jar can be found at /seq/software/picard/current/bin/MergeSamFiles.jar. Outside the broad see http://picard.sourceforge.net/") */ -class BamGatherFunction extends GatherFunction with JarCommandLineFunction { - @Argument(doc="Compression level 1-9", required=false) - var compressionLevel: Option[Int] = None - - override def commandLine = super.commandLine + "%s%s%s".format( - optional(" COMPRESSION_LEVEL=", compressionLevel), " AS=true VALIDATION_STRINGENCY=SILENT SO=coordinate OUTPUT=" + originalOutput, repeat(" INPUT=", gatherParts)) +class BamGatherFunction extends GatherFunction with PicardBamJarFunction { + this.assumeSorted = Some(true) + protected def inputBams = gatherParts + protected def outputBam = originalOutput } diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamJarFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamJarFunction.scala new file mode 100644 index 000000000..efd861f9b --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamJarFunction.scala @@ -0,0 +1,27 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.queue.function.JarCommandLineFunction +import java.io.File + +/** + * Wraps a Picard jar that operates on BAM files. + * See http://picard.sourceforge.net/ for more info. + * + * Since the jar files take slightly different arguments + * some values are optional. + */ +trait PicardBamJarFunction extends JarCommandLineFunction { + var validationStringency = "SILENT" + var sortOrder = "coordinate" + var compressionLevel: Option[Int] = None + var maxRecordsInRam: Option[Int] = None + var assumeSorted: Option[Boolean] = None + + protected def inputBams: List[File] + protected def outputBam: File + + override def commandLine = super.commandLine + "%s%s%s".format( + optional(" COMPRESSION_LEVEL=", compressionLevel), optional(" VALIDATION_STRINGENCY=", validationStringency), + optional(" SO=", sortOrder), optional( " MAX_RECORDS_IN_RAM=", maxRecordsInRam), optional(" ASSUME_SORTED=", assumeSorted), + " OUTPUT=" + outputBam, repeat(" INPUT=", inputBams), " TMP_DIR=" + jobTempDir) +}