diff --git a/scala/qscript/core/DataProcessingPipeline.scala b/scala/qscript/core/DataProcessingPipeline.scala index 5b652436d..74b18118c 100755 --- a/scala/qscript/core/DataProcessingPipeline.scala +++ b/scala/qscript/core/DataProcessingPipeline.scala @@ -1,7 +1,6 @@ package core import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.function.ListWriterFunction @@ -10,6 +9,7 @@ import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord} import scala.io.Source._ import collection.JavaConversions._ import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel +import org.broadinstitute.sting.queue.extensions.picard._ class DataProcessingPipeline extends QScript { @@ -23,26 +23,27 @@ class DataProcessingPipeline extends QScript { @Input(doc="input BAM file - or list of BAM files", fullName="input", shortName="i", required=true) var input: File = _ - @Input(doc="path to GenomeAnalysisTK.jar", fullName="path_to_gatk_jar", shortName="gatk", required=true) - var GATKjar: File = _ - - @Input(doc="path to AnalyzeCovariates.jar", fullName="path_to_ac_jar", shortName="ac", required=true) - var ACJar: File = _ - @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true) var R: String = _ - @Input(doc="path to Picard's MarkDuplicates.jar", fullName="path_to_dedup_jar", shortName="dedup", required=false) - var dedupJar: File = new File("/seq/software/picard/current/bin/MarkDuplicates.jar") + @Input(doc="Reference fasta file", fullName="reference", shortName="R", required=true) + var reference: File = _ - @Input(doc="path to Picard's MergeSamFiles.jar", fullName="path_to_merge_jar", shortName="merge", required=false) - var mergeBamJar: File = new File("/seq/software/picard/current/bin/MergeSamFiles.jar") +// @Input(doc="path to GenomeAnalysisTK.jar", fullName="path_to_gatk_jar", shortName="gatk", required=false) +// var GATKjar: File = _ +// +// @Input(doc="path to AnalyzeCovariates.jar", fullName="path_to_ac_jar", shortName="ac", required=false) +// var ACJar: File = _ +// +// @Input(doc="path to Picard's MarkDuplicates.jar", fullName="path_to_dedup_jar", shortName="dedup", required=false) +// var dedupJar: File = _ +// +// @Input(doc="path to Picard's MergeSamFiles.jar", fullName="path_to_merge_jar", shortName="merge", required=false) +// var mergeBamJar: File = _ +// +// @Input(doc="path to Picard's ValidateSamFile.jar", fullName="path_to_validate_jar", shortName="validate", required=false) +// var validateSamJar: File = _ - @Input(doc="path to Picard's ValidateSamFile.jar", fullName="path_to_validate_jar", shortName="validate", required=false) - var validateSamJar: File = new File("/seq/software/picard/current/bin/ValidateSamFile.jar") - - @Input(doc="Reference fasta file", fullName="reference", shortName="R", required=false) - var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") @@ -51,12 +52,9 @@ class DataProcessingPipeline extends QScript { ****************************************************************************/ - @Input(doc="path to Picard's RevertSam.jar (if re-aligning a previously processed BAM file)", fullName="path_to_revert_jar", shortName="revert", required=false) - var revertSamJar: File = _ - - @Input(doc="path to Picard's SortSam.jar (if re-aligning a previously processed BAM file)", fullName="path_to_sort_jar", shortName="sort", required=false) - var sortSamJar: File = _ - +// @Input(doc="path to Picard's SortSam.jar (if re-aligning a previously processed BAM file)", fullName="path_to_sort_jar", shortName="sort", required=false) +// var sortSamJar: File = _ +// @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ @@ -110,6 +108,14 @@ class DataProcessingPipeline extends QScript { * Helper classes and methods ****************************************************************************/ + class ReadGroup (val id: String, + val lb: String, + val pl: String, + val pu: String, + val sm: String, + val cn: String, + val ds: String) + {} // Utility function to check if there are multiple samples in a BAM file (currently we can't deal with that) def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { @@ -123,14 +129,13 @@ class DataProcessingPipeline extends QScript { return false } - // Utility function to merge all bam files of similar samples. Generates on BAM file per sample. + // Utility function to merge all bam files of similar samples. Generates one BAM file per sample. // It uses the sample information on the header of the input BAM files. // // Because the realignment only happens after these scripts are executed, in case you are using // bwa realignment, this function will operate over the original bam files and output over the // (to be realigned) bam files. - def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File] = null): Map[String, File] = { - assert(bamFiles.length == realignedBamFiles.length, "List of orignal files must have the same number of files than the list of realigned bam files: " + bamFiles.length + " / " + realignedBamFiles.length) + def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = { // Creating a table with SAMPLE information from each input BAM file val sampleTable = scala.collection.mutable.Map.empty[String, List[File]] @@ -173,41 +178,42 @@ class DataProcessingPipeline extends QScript { } // Rebuilds the Read Group string to give BWA - def buildReadGroupString(samReader: SAMFileReader): List[String] = { + def addReadGroups(inBam: File, outBam: File, samReader: SAMFileReader) { val readGroups = samReader.getFileHeader.getReadGroups - var l: List[String] = List() + var index: Int = readGroups.length for (rg <- readGroups) { - l :+= "@RG\t" + - SAMReadGroupRecord.READ_GROUP_ID_TAG + ":" + rg.getReadGroupId + "\t" + - SAMReadGroupRecord.PLATFORM_TAG + ":" + rg.getPlatformUnit + "\t" + - SAMReadGroupRecord.LIBRARY_TAG + ":" + rg.getLibrary + "\t" + - SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG + ":" + rg.getSample + "\t" + - SAMReadGroupRecord.SEQUENCING_CENTER_TAG + ":" + rg.getSequencingCenter + val intermediateInBam: File = if (index == readGroups.length) { inBam } else { swapExt(outBam, ".bam", index+1 + "-rg.bam") } + val intermediateOutBam: File = if (index > 1) {swapExt(outBam, ".bam", index + "-rg.bam") } else { outBam} + val readGroup = new ReadGroup(rg.getReadGroupId, rg.getPlatform, rg.getLibrary, rg.getPlatformUnit, rg.getSample, rg.getSequencingCenter, rg.getDescription) + add(addReadGroup(intermediateInBam, intermediateOutBam, readGroup)) + index = index - 1 } - return l -} + } // Takes a list of processed BAM files and realign them using the BWA option requested (bwase or bwape). // Returns a list of realigned BAM files. def performAlignment(bams: List[File]): List[File] = { var realignedBams: List[File] = List() + var index = 1 for (bam <- bams) { - val saiFile1 = swapExt(bam, ".bam", "1.sai") - val saiFile2 = swapExt(bam, ".bam", "2.sai") - val realignedSamFile = swapExt(bam, ".bam", ".realigned.sam") - val realignedBamFile = swapExt(bam, ".bam", ".realigned.bam") - val readGroupString = buildReadGroupString(new SAMFileReader(bam)) + val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai") + val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai") + val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam") + val realignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.bam") + val rgRealignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.rg.bam") if (useBWAse) { add(bwa_aln_se(bam, saiFile1), - bwa_sam_se(bam, saiFile1, realignedSamFile, readGroupString)) + bwa_sam_se(bam, saiFile1, realignedSamFile)) } else { add(bwa_aln_pe(bam, saiFile1, 1), bwa_aln_pe(bam, saiFile2, 2), - bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile, readGroupString)) + bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile)) } add(sortSam(realignedSamFile, realignedBamFile)) - realignedBams :+= realignedBamFile + addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam)) + realignedBams :+= rgRealignedBamFile + index = index + 1 } return realignedBams } @@ -299,13 +305,12 @@ class DataProcessingPipeline extends QScript { /**************************************************************************** - * Classes (Walkers and non-GATK programs) + * Classes (GATK Walkers) ****************************************************************************/ // General arguments to GATK walkers trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.GATKjar this.reference_sequence = qscript.reference this.memoryLimit = 4 this.isIntermediate = true @@ -364,10 +369,12 @@ class DataProcessingPipeline extends QScript { - // Outside tools (not GATK walkers) + /**************************************************************************** + * Classes (non-GATK programs) + ****************************************************************************/ + case class analyzeCovariates (inRecalFile: File, outPath: File) extends AnalyzeCovariates { - this.jarFile = qscript.ACJar this.resources = qscript.R this.recal_file = inRecalFile this.output_dir = outPath.toString @@ -375,73 +382,56 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" } - case class dedup (inBam: File, outBam: File, metricsFile: File) extends PicardBamFunction { - @Input(doc="fixed bam") var clean = inBam - @Output(doc="deduped bam") var deduped = outBam - @Output(doc="deduped bam index") var dedupedIndex = new File(outBam + "bai") - @Output(doc="metrics file") var metrics = metricsFile - override def inputBams = List(clean) - override def outputBam = deduped - override def commandLine = super.commandLine + " M=" + metricsFile - this.sortOrder = null - this.createIndex = true + case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates { + this.input = List(inBam) + this.output = outBam + this.metrics = metricsFile this.memoryLimit = 6 this.isIntermediate = true - this.jarFile = qscript.dedupJar this.analysisName = queueLogDir + outBam + ".dedup" this.jobName = queueLogDir + outBam + ".dedup" } - case class joinBams (inBams: List[File], outBam: File) extends PicardBamFunction { - @Input(doc="input bam list") var join = inBams - @Output(doc="joined bam") var joined = outBam - @Output(doc="joined bam index") var joinedIndex = new File(outBam + "bai") - override def inputBams = join - override def outputBam = joined - override def commandLine = super.commandLine + " CREATE_INDEX=true" - this.jarFile = qscript.mergeBamJar + case class joinBams (inBams: List[File], outBam: File) extends MergeSamFiles { + this.input = inBams + this.output = outBam this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".joinBams" this.jobName = queueLogDir + outBam + ".joinBams" } - case class sortSam (inSam: File, outBam: File) extends PicardBamFunction { - @Input(doc="input unsorted sam file") var sam = inSam - @Output(doc="sorted bam") var bam = outBam - @Output(doc="sorted bam index") var bamIndex = new File(outBam + "bai") - override def inputBams = List(sam) - override def outputBam = bam - override def commandLine = super.commandLine + " CREATE_INDEX=true" - this.jarFile = qscript.sortSamJar + case class sortSam (inSam: File, outBam: File) extends SortSam { + this.input = List(inSam) + this.output = outBam this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".sortSam" this.jobName = queueLogDir + outBam + ".sortSam" } - case class validate (inBam: File, outLog: File) extends PicardBamFunction { - @Input(doc="input bam list") var toValidate = inBam - @Output(doc="validation log") var validate = outLog - override def inputBams = List(inBam) - override def outputBam = outLog - override def commandLine = super.commandLine + " VALIDATE_INDEX=true MAX_RECORDS_IN_RAM=100000 MODE=SUMMARY REFERENCE_SEQUENCE=" + qscript.reference - sortOrder = null - this.jarFile = qscript.validateSamJar + case class validate (inBam: File, outLog: File) extends ValidateSamFile { + this.input = List(inBam) + this.output = outLog + this.maxRecordsInRam = 100000 + this.REFERENCE_SEQUENCE = qscript.reference this.isIntermediate = false this.analysisName = queueLogDir + outLog + ".validate" this.jobName = queueLogDir + outLog + ".validate" } - case class revert (inBam: File, outBam: File) extends PicardBamFunction { - @Input(doc="old annotated bam") var oldBam = inBam - @Output(doc="reverted bam") var revertedBam = outBam - @Output(doc="reverted bam index") var revertedBamIndex = new File(outBam + ".bai") - override def inputBams = List(oldBam) - override def outputBam = revertedBam - override def commandLine = super.commandLine + " CREATE_INDEX=true" + + case class addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups { + this.input = List(inBam) + this.output = outBam + this.RGID = readGroup.id + this.RGCN = readGroup.cn + this.RGDS = readGroup.ds + this.RGLB = readGroup.lb + this.RGPL = readGroup.pl + this.RGPU = readGroup.pu + this.RGSM = readGroup.sm this.isIntermediate = true - this.jarFile = qscript.dedupJar - this.analysisName = queueLogDir + outBam + ".dedup" - this.jobName = queueLogDir + outBam + ".dedup" + this.analysisName = queueLogDir + outBam + ".rg" + this.jobName = queueLogDir + outBam + ".rg" } case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction { @@ -462,30 +452,22 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1" } - case class bwa_sam_se (inBam: File, inSai: File, outBam: File, readGroup: List[String]) extends CommandLineFunction { + case class bwa_sam_se (inBam: File, inSai: File, outBam: File) extends CommandLineFunction { @Input(doc="bam file to be aligned") var bam = inBam @Input(doc="bwa alignment index file") var sai = inSai @Output(doc="output aligned bam file") var alignedBam = outBam - var readGroupParameters = "" - for (rg <- readGroup) { - readGroupParameters += " -r \'" + rg + "\'" - } - def commandLine = bwaPath + " samse " + readGroupParameters + " " + reference + " " + sai + " " + bam + " > " + alignedBam + def commandLine = bwaPath + " samse " + reference + " " + sai + " " + bam + " > " + alignedBam this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".bwa_sam_se" this.jobName = queueLogDir + outBam + ".bwa_sam_se" } - case class bwa_sam_pe (inBam: File, inSai1: File, inSai2:File, outBam: File, readGroup: List[String]) extends CommandLineFunction { + case class bwa_sam_pe (inBam: File, inSai1: File, inSai2:File, outBam: File) extends CommandLineFunction { @Input(doc="bam file to be aligned") var bam = inBam @Input(doc="bwa alignment index file for 1st mating pair") var sai1 = inSai1 @Input(doc="bwa alignment index file for 2nd mating pair") var sai2 = inSai2 @Output(doc="output aligned bam file") var alignedBam = outBam - var readGroupParameters = "" - for (rg <- readGroup) { - readGroupParameters += " -r \'" + rg + "\'" - } - def commandLine = bwaPath + " sampe " + readGroupParameters + " " + reference + " " + sai1 + " " + sai2 + " " + bam + " " + bam + " > " + alignedBam + def commandLine = bwaPath + " sampe " + reference + " " + sai1 + " " + sai2 + " " + bam + " " + bam + " > " + alignedBam this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".bwa_sam_pe" this.jobName = queueLogDir + outBam + ".bwa_sam_pe" diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala b/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala new file mode 100644 index 000000000..2508f5776 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala @@ -0,0 +1,59 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline._ + +import java.io.File + +/* + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 6/22/11 + * Time: 10:35 AM + */ +class AddOrReplaceReadGroups extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "AddOrReplaceReadGroups" + javaMainClass = "net.sf.picard.sam.AddOrReplaceReadGroups" + + @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) + var input: List[File] = _ + + @Output(doc="The output BAM file with the modified/added read groups", shortName = "output", fullName = "output_bam_file", required = true) + var output: File = _ + + @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) + var outputIndex: File = new File(output + ".bai") + + @Argument(doc="Read group ID", shortName = "id", fullName = "read_group_id", required = true) + var RGID: String = _ + + @Argument(doc = "Read group library", shortName = "lb", fullName = "read_group_library", required = true) + var RGLB: String = _ + + @Argument(doc = "Read group platform (e.g. illumina, solid)", shortName = "pl", fullName ="read_group_platform", required=true) + var RGPL: String = _ + + @Argument(doc = "Read group platform unit (e.g. run barcode)", shortName = "pu", fullName = "read_group_platform_unit", required = true) + var RGPU: String = _ + + @Argument(doc = "Read group sample name", shortName = "sm", fullName = "read_group_sample_name", required = true) + var RGSM: String = _ + + @Argument(doc = "Read group center name", shortName = "cn", fullName = "read_group_center_name", required = false) + var RGCN: String = "" + + @Argument(doc = "Read group description", shortName = "ds", fullName = "read_group_description", required = false) + var RGDS: String = "" + + + override def inputBams = input + override def outputBam = output + this.createIndex = Some(true) + override def commandLine = super.commandLine + + " RGID=" + RGID + + " RGLB=" + RGLB + + " RGPL=" + RGPL + + " RGPU=" + RGPU + + " RGSM=" + RGSM + + conditionalParameter(RGCN != null && !RGCN.isEmpty, " RGCN=" + RGCN) + + conditionalParameter(RGDS != null && !RGDS.isEmpty, " RGDS=" + RGDS) +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala b/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala new file mode 100644 index 000000000..6f006ffad --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala @@ -0,0 +1,49 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline._ + +import java.io.File + +/* + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 6/22/11 + * Time: 10:35 AM + */ +class MarkDuplicates extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "MarkDuplicates" + javaMainClass = "net.sf.picard.sam.MarkDuplicates" + + @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) + var input: List[File] = _ + + @Output(doc="The output file to write marked records to", shortName = "output", fullName = "output_bam_file", required = true) + var output: File = _ + + @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) + var outputIndex: File = new File(output + ".bai") + + @Output(doc="File to write duplication metrics to", shortName = "out_metrics", fullName = "output_metrics_file", required = false) + var metrics: File = new File(output + ".metrics") + + @Argument(doc="If true do not write duplicates to the output file instead of writing them with appropriate flags set.", shortName = "remdup", fullName = "remove_duplicates", required = false) + var REMOVE_DUPLICATES: Boolean = false + + @Argument(doc = "Maximum number of file handles to keep open when spilling read ends to disk. Set this number a little lower than the per-process maximum number of file that may be open. This number can be found by executing the 'ulimit -n' command on a Unix system.", shortName = "max_file_handles", fullName ="max_file_handles_for_read_ends_maps", required=false) + var MAX_FILE_HANDLES_FOR_READ_ENDS_MAP: Int = -1; + + @Argument(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by some of the sorting collections. If you are running out of memory, try reducing this number.", shortName = "sorting_ratio", fullName = "sorting_collection_size_ratio", required = false) + var SORTING_COLLECTION_SIZE_RATIO: Double = -1 + + override def inputBams = input + override def outputBam = output + this.sortOrder = null + this.createIndex = Some(true) + override def commandLine = super.commandLine + + " M=" + metrics + + conditionalParameter(REMOVE_DUPLICATES, " REMOVE_DUPLICATES=true") + + conditionalParameter(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP > 0, " MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=" + MAX_FILE_HANDLES_FOR_READ_ENDS_MAP.toString) + + conditionalParameter(SORTING_COLLECTION_SIZE_RATIO > 0, " SORTING_COLLECTION_SIZE_RATIO=" + SORTING_COLLECTION_SIZE_RATIO.toString) + + +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala b/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala new file mode 100644 index 000000000..a7e74e1b5 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala @@ -0,0 +1,44 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline._ + +import java.io.File + +/* + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 6/22/11 + * Time: 10:35 AM + */ +class MergeSamFiles extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "MergeSamFiles" + javaMainClass = "net.sf.picard.sam.MergeSamFiles" + + @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) + var input: List[File] = _ + + @Output(doc="The output merged BAM file", shortName = "output", fullName = "output_bam_file", required = true) + var output: File = _ + + @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) + var outputIndex: File = new File(output + ".bai") + + @Argument(doc="Merge the seqeunce dictionaries Default value: false. This option can be set to 'null' to clear the default value.", shortName = "merge_dict", fullName = "merge_sequence_dictionaries", required = false) + var MERGE_SEQUENCE_DICTIONARIES: Boolean = false + + @Argument(doc = "Option to enable a simple two-thread producer consumer version of the merge algorithm that uses one thread to read and merge the records from the input files and another thread to encode, compress and write to disk the output file. The threaded version uses about 20% more CPU and decreases runtime by ~20% when writing out a compressed BAM file. ", shortName = "thread", fullName ="use_threading", required=false) + var USE_THREADING: Boolean = false; + + @Argument(doc = "Comments to include in the merged output file's header.", shortName = "com", fullName = "comments", required = false) + var COMMENT: String = "" + + override def inputBams = input + override def outputBam = output + this.createIndex = Some(true) + override def commandLine = super.commandLine + + conditionalParameter(MERGE_SEQUENCE_DICTIONARIES, " MERGE_SEQUENCE_DICTIONARIES=true") + + conditionalParameter(USE_THREADING, " USE_THREADING=true") + + conditionalParameter(COMMENT != null && !COMMENT.isEmpty, " COMMENT=" + COMMENT) + + +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala new file mode 100644 index 000000000..cc26f7471 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline._ + +import java.io.File + +/* + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 6/22/11 + * Time: 10:35 AM + */ +class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "SortSam" + javaMainClass = "net.sf.picard.sam.SortSam" + + @Input(doc="The input SAM or BAM files to sort.", shortName = "input", fullName = "input_bam_files", required = true) + var input: List[File] = _ + + @Output(doc="The sorted BAM or SAM output file.", shortName = "output", fullName = "output_bam_file", required = true) + var output: File = _ + + @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) + var outputIndex: File = new File(output + ".bai") + + override def inputBams = input + override def outputBam = output + this.createIndex = Some(true) + override def commandLine = super.commandLine +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala b/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala new file mode 100644 index 000000000..726682b89 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline._ + +import net.sf.picard.sam.ValidateSamFile.Mode + +import java.io.File + +/* + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 6/22/11 + * Time: 10:35 AM + */ +class ValidateSamFile extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "ValidateSamFile" + javaMainClass = "net.sf.picard.sam.ValidateSamFile" + + @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) + var input: List[File] = _ + + @Output(doc="Send output to a file instead of stdout", shortName = "output", fullName = "output_file", required = false) + var output: File = _ + + @Argument(doc="Mode of output", shortName = "mode", fullName = "mode_of_output", required = false) + var MODE: Mode = Mode.VERBOSE + + @Argument(doc="List of validation error types to ignore.", shortName = "ignore", fullName = "ignore_error_types", required = false) + var IGNORE: List[String] = _ + + @Argument(doc = "The maximum number of lines output in verbose mode.", shortName = "max", fullName = "max_output", required = false) + var MAX_OUTPUT: Int = 100 + + @Argument(doc = "Reference sequence file, the NM tag check will be skipped if this is missing.", shortName = "ref", fullName ="reference_sequence", required=false) + var REFERENCE_SEQUENCE: File = _ + + @Argument(doc = "If true, only report errors, and ignore warnings.", shortName = "iw", fullName = "ignore_warnings", required = false) + var IGNORE_WARNINGS: Boolean = false + + @Argument(doc = "If true and input is a BAM file with an index file, also validates the index.", shortName = "vi", fullName = "validate_index", required = false) + var VALIDATE_INDEX: Boolean = true + + @Argument(doc = "Whether the SAM or BAM file consists of bisulfite sequenced reads. If so, C->T is not counted as an error in computing the value of the NM tag.", shortName = "bs", fullName = "is_bisulfite_sequenced", required = false) + var IS_BISULFITE_SEQUENCED: Boolean = false + + @Argument(doc = "Relevant for a coordinate-sorted file containing read pairs only. Maximum number of file handles to keep open when spilling mate info to disk. Set this number a little lower than the per-process maximum number of file that may be open. This number can be found by executing the 'ulimit -n' command on a Unix system.", shortName = "max_files", fullName = "max_open_temp_files", required = false) + var MAX_OPEN_TEMP_FILES: Int = 8000 + + this.sortOrder = null + override def inputBams = input + override def outputBam = output + override def commandLine = super.commandLine + + " MODE=" + MODE + + " MAX_OUTPUT=" + MAX_OUTPUT + + " MAX_OPEN_TEMP_FILES=" + MAX_OPEN_TEMP_FILES + + conditionalParameter(!VALIDATE_INDEX, " VALIDATE_INDEX=false") + + conditionalParameter(IGNORE_WARNINGS, " IGNORE_WARNINGS=true") + + conditionalParameter(IS_BISULFITE_SEQUENCED, " IS_BISULFITE_SEQUENCED=true") + + conditionalParameter(IGNORE != null && !IGNORE.isEmpty, repeat(" IGNORE=", IGNORE)) +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index 5ea4f4818..2b1abb2d0 100644 --- a/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -71,7 +71,10 @@ trait CommandLineFunction extends QFunction with Logging { */ protected def repeat(prefix: String, params: Traversable[_], suffix: String = "", separator: String = "", format: (String, Any, String) => String = formatValue("%s")) = - params.filter(param => hasValue(param)).map(param => format(prefix, param, suffix)).mkString(separator) + if (params == null) + "" + else + params.filter(param => hasValue(param)).map(param => format(prefix, param, suffix)).mkString(separator) /** * Returns parameter with a prefix/suffix if it is set otherwise returns "". @@ -103,4 +106,15 @@ trait CommandLineFunction extends QFunction with Logging { case x => format.format(x) }) + suffix + /** + * Returns the parameter if the condition is true. Useful for long string of parameters + * @param condition the condition to validate + * @param param the string to be returned in case condition is true + * @return param if condition is true, "" otherwise + */ + protected def conditionalParameter(condition: Boolean, param: String): String = + if (condition == true) + param + else + "" }