From 09ffe277ae7d3418f75510cabe65e6c9a8fa8f25 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 14 Jul 2011 17:09:35 -0400 Subject: [PATCH 1/2] Added a qscripts util package with some utility functions commonly shared across queue scripts. Refactored some of my public scripts to use it in an effort to make queue scripts more reusable and "supportable". --- .../qscripts/DataProcessingPipeline.scala | 38 ++---------- .../qscripts/RecalibrateBaseQualities.scala | 20 +----- .../sting/queue/qscripts/utils/Utils.scala | 62 +++++++++++++++++++ 3 files changed, 71 insertions(+), 49 deletions(-) create mode 100644 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index f9369ee3f..e62b1b926 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -4,13 +4,13 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.function.ListWriterFunction -import scala.io.Source._ import collection.JavaConversions._ import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel import org.broadinstitute.sting.queue.extensions.picard._ -import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord} +import net.sf.samtools.{SAMFileReader} import net.sf.samtools.SAMFileHeader.SortOrder +import org.broadinstitute.sting.queue.qscripts.utils.Utils class DataProcessingPipeline extends QScript { qscript => @@ -103,18 +103,6 @@ class DataProcessingPipeline extends QScript { val ds: String) {} - // Utility function to check if there are multiple samples in a BAM file (currently we can't deal with that) - def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { - var sample: String = "" - for (r <- readGroups) { - if (sample.isEmpty) - sample = r.getSample - else if (sample != r.getSample) - return true; - } - return false - } - // Utility function to merge all bam files of similar samples. Generates one BAM file per sample. // It uses the sample information on the header of the input BAM files. // @@ -135,7 +123,7 @@ class DataProcessingPipeline extends QScript { // only allow one sample per file. Bam files with multiple samples would require pre-processing of the file // with PrintReads to separate the samples. Tell user to do it himself! - assert(!hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam) + assert(!Utils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam) // Fill out the sample table with the readgroups in this file for (rg <- readGroups) { @@ -157,12 +145,6 @@ class DataProcessingPipeline extends QScript { return sampleBamFiles.toMap } - // Checks how many contigs are in the dataset. Uses the BAM file header information. - def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(new File(bamFile)) - return samReader.getFileHeader.getSequenceDictionary.getSequences.size() - } - // Rebuilds the Read Group string to give BWA def addReadGroups(inBam: File, outBam: File, samReader: SAMFileReader) { val readGroups = samReader.getFileHeader.getReadGroups @@ -206,15 +188,7 @@ class DataProcessingPipeline extends QScript { return realignedBams } - // Reads a BAM LIST file and creates a scala list with all the files - def createListFromFile(in: File):List[File] = { - if (in.toString.endsWith("bam")) - return List(in) - var l: List[File] = List() - for (bam <- fromFile(in).getLines) - l :+= new File(bam) - return l - } + @@ -226,8 +200,8 @@ class DataProcessingPipeline extends QScript { def script = { // keep a record of the number of contigs in the first bam file in the list - val bams = createListFromFile(input) - nContigs = getNumberOfContigs(bams(0)) + val bams = Utils.createListFromFile(input) + nContigs = Utils.getNumberOfContigs(bams(0)) val realignedBams = if (useBWApe || useBWAse) {performAlignment(bams)} else {bams} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index 88c7f5ff7..b2960f150 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -4,6 +4,7 @@ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ import net.sf.samtools.SAMFileReader import io.Source._ +import org.broadinstitute.sting.queue.qscripts.utils.Utils /** * Created by IntelliJ IDEA. @@ -33,25 +34,10 @@ class RecalibrateBaseQualities extends QScript { val queueLogDir: String = ".qlog/" var nContigs: Int = 0 - def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(new File(bamFile)) - return samReader.getFileHeader.getSequenceDictionary.getSequences.size() - } - - // Reads a BAM LIST file and creates a scala list with all the files - def createListFromFile(in: File):List[File] = { - if (in.toString.endsWith("bam")) - return List(in) - var l: List[File] = List() - for (bam <- fromFile(in).getLines) - l :+= new File(bam) - return l - } - def script = { - val bamList = createListFromFile(input) - nContigs = getNumberOfContigs(bamList(0)) + val bamList = Utils.createListFromFile(input) + nContigs = Utils.getNumberOfContigs(bamList(0)) for (bam <- bamList) { diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala new file mode 100644 index 000000000..ff94744ee --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.queue.qscripts.utils + +import java.io.File +import io.Source._ +import net.sf.samtools.{SAMReadGroupRecord, SAMFileReader} + +import collection.JavaConversions._ + + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 7/14/11 + * Time: 4:57 PM + * To change this template use File | Settings | File Templates. + */ + +object Utils { + + /** + * Takes a bam list file and produces a scala list with each file allowing the bam list + * to have empty lines and comment lines (lines starting with #). + */ + def createListFromFile(in: File):List[File] = { + // If the file provided ends with .bam, it is not a bam list, we treat it as a single file. + // and return a list with only this file. + if (in.toString.endsWith(".bam")) + return List(in) + + var list: List[File] = List() + for (bam <- fromFile(in).getLines) + if (!bam.startsWith("#") && !bam.isEmpty ) + list :+= new File(bam.trim()) + list + } + + /** + * Returns the number of contigs in the BAM file header. + */ + + def getNumberOfContigs(bamFile: File): Int = { + val samReader = new SAMFileReader(new File(bamFile)) + samReader.getFileHeader.getSequenceDictionary.getSequences.size() + } + + /** + * Check if there are multiple samples in a BAM file + */ + + def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { + var sample: String = "" + for (r <- readGroups) { + if (sample.isEmpty) + sample = r.getSample + else if (sample != r.getSample) + return true; + } + false + } + + +} \ No newline at end of file From 43c6a8565bfe77e4c50317a3e8858b754d4afcba Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 14 Jul 2011 17:10:44 -0400 Subject: [PATCH 2/2] looks better now. --- .../org/broadinstitute/sting/queue/qscripts/utils/Utils.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala index ff94744ee..1189b376c 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala @@ -37,7 +37,6 @@ object Utils { /** * Returns the number of contigs in the BAM file header. */ - def getNumberOfContigs(bamFile: File): Int = { val samReader = new SAMFileReader(new File(bamFile)) samReader.getFileHeader.getSequenceDictionary.getSequences.size() @@ -46,7 +45,6 @@ object Utils { /** * Check if there are multiple samples in a BAM file */ - def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { var sample: String = "" for (r <- readGroups) {