Added a qscripts util package with some utility functions commonly shared across queue scripts. Refactored some of my public scripts to use it in an effort to make queue scripts more reusable and "supportable".

This commit is contained in:
Mauricio Carneiro 2011-07-14 17:09:35 -04:00
parent 9f5180ab05
commit 09ffe277ae
3 changed files with 71 additions and 49 deletions

View File

@ -4,13 +4,13 @@ import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.function.ListWriterFunction
import scala.io.Source._
import collection.JavaConversions._
import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel
import org.broadinstitute.sting.queue.extensions.picard._
import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord}
import net.sf.samtools.{SAMFileReader}
import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.queue.qscripts.utils.Utils
class DataProcessingPipeline extends QScript {
qscript =>
@ -103,18 +103,6 @@ class DataProcessingPipeline extends QScript {
val ds: String)
{}
// Utility function to check if there are multiple samples in a BAM file (currently we can't deal with that)
def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = {
var sample: String = ""
for (r <- readGroups) {
if (sample.isEmpty)
sample = r.getSample
else if (sample != r.getSample)
return true;
}
return false
}
// Utility function to merge all bam files of similar samples. Generates one BAM file per sample.
// It uses the sample information on the header of the input BAM files.
//
@ -135,7 +123,7 @@ class DataProcessingPipeline extends QScript {
// only allow one sample per file. Bam files with multiple samples would require pre-processing of the file
// with PrintReads to separate the samples. Tell user to do it himself!
assert(!hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam)
assert(!Utils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam)
// Fill out the sample table with the readgroups in this file
for (rg <- readGroups) {
@ -157,12 +145,6 @@ class DataProcessingPipeline extends QScript {
return sampleBamFiles.toMap
}
// Checks how many contigs are in the dataset. Uses the BAM file header information.
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
return samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
// Rebuilds the Read Group string to give BWA
def addReadGroups(inBam: File, outBam: File, samReader: SAMFileReader) {
val readGroups = samReader.getFileHeader.getReadGroups
@ -206,15 +188,7 @@ class DataProcessingPipeline extends QScript {
return realignedBams
}
// Reads a BAM LIST file and creates a scala list with all the files
def createListFromFile(in: File):List[File] = {
if (in.toString.endsWith("bam"))
return List(in)
var l: List[File] = List()
for (bam <- fromFile(in).getLines)
l :+= new File(bam)
return l
}
@ -226,8 +200,8 @@ class DataProcessingPipeline extends QScript {
def script = {
// keep a record of the number of contigs in the first bam file in the list
val bams = createListFromFile(input)
nContigs = getNumberOfContigs(bams(0))
val bams = Utils.createListFromFile(input)
nContigs = Utils.getNumberOfContigs(bams(0))
val realignedBams = if (useBWApe || useBWAse) {performAlignment(bams)} else {bams}

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import net.sf.samtools.SAMFileReader
import io.Source._
import org.broadinstitute.sting.queue.qscripts.utils.Utils
/**
* Created by IntelliJ IDEA.
@ -33,25 +34,10 @@ class RecalibrateBaseQualities extends QScript {
val queueLogDir: String = ".qlog/"
var nContigs: Int = 0
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
return samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
// Reads a BAM LIST file and creates a scala list with all the files
def createListFromFile(in: File):List[File] = {
if (in.toString.endsWith("bam"))
return List(in)
var l: List[File] = List()
for (bam <- fromFile(in).getLines)
l :+= new File(bam)
return l
}
def script = {
val bamList = createListFromFile(input)
nContigs = getNumberOfContigs(bamList(0))
val bamList = Utils.createListFromFile(input)
nContigs = Utils.getNumberOfContigs(bamList(0))
for (bam <- bamList) {

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.queue.qscripts.utils
import java.io.File
import io.Source._
import net.sf.samtools.{SAMReadGroupRecord, SAMFileReader}
import collection.JavaConversions._
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 7/14/11
* Time: 4:57 PM
* To change this template use File | Settings | File Templates.
*/
object Utils {
/**
* Takes a bam list file and produces a scala list with each file allowing the bam list
* to have empty lines and comment lines (lines starting with #).
*/
def createListFromFile(in: File):List[File] = {
// If the file provided ends with .bam, it is not a bam list, we treat it as a single file.
// and return a list with only this file.
if (in.toString.endsWith(".bam"))
return List(in)
var list: List[File] = List()
for (bam <- fromFile(in).getLines)
if (!bam.startsWith("#") && !bam.isEmpty )
list :+= new File(bam.trim())
list
}
/**
* Returns the number of contigs in the BAM file header.
*/
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
/**
* Check if there are multiple samples in a BAM file
*/
def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = {
var sample: String = ""
for (r <- readGroups) {
if (sample.isEmpty)
sample = r.getSample
else if (sample != r.getSample)
return true;
}
false
}
}