more powerful management routines for my pipeline

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3351 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-05-12 13:37:39 +00:00
parent 3f07611187
commit d3c33d4b3f
3 changed files with 34 additions and 16 deletions

View File

@ -109,7 +109,7 @@ def executeJob(job, farm_queue = None, just_print_commands = False, debug = True
job.jobID = justPrintJobIDCounter
justPrintJobIDCounter += 1
elif farm_queue:
print 'job.executionString', job.executionString
#print 'job.executionString', job.executionString
result = subprocess.Popen([job.executionString, ""], shell=True, stdout=subprocess.PIPE).communicate()[0]
p = re.compile('Job <(\d+)> is submitted to queue')
job.jobID = p.match(result).group(1)

View File

@ -17,11 +17,11 @@ GATK_JAR = GATK_STABLE_JAR
# add to GATK to enable dbSNP aware cleaning
# -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod
#hg18 = ['chrM'] + ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
#b36 = [str(i) for i in range(1,23)] + ['X', 'Y', 'MT']
hg18 = ['chrM'] + ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
b36 = [str(i) for i in range(1,23)] + ['X', 'Y', 'MT']
hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
b36 = [str(i) for i in range(1,23)] + ['X', 'Y']
#hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
#b36 = [str(i) for i in range(1,23)] + ['X', 'Y']
HG18_TO_B36 = {
'hg18' : 'b36',
@ -45,11 +45,12 @@ def appendExtension(path, newExt, addExtension = True):
# return os.path.join(OPTIONS.dir, s)
class PipelineArgs:
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g' ):
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g', excludeChrs = [] ):
self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
self.ref = ref
self.name = name
self.memory = memory
self.excludeChrs = excludeChrs
def convertToB36(self):
return self.ref == 'b36'
@ -71,6 +72,9 @@ class PipelineArgs:
cmd = hg18args_to_b36(cmd)
return cmd
def chrsToSplitBy(self, chrs):
    """Return the chromosomes in chrs minus any named in self.excludeChrs."""
    excluded = self.excludeChrs
    return filter(lambda chromosome: chromosome not in excluded, chrs)
#
# General features
#
@ -97,7 +101,7 @@ def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeComma
chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
return chrCmd, chrOutputMap
splits = map( makeChrCmd, hg18 )
splits = map( makeChrCmd, myPipelineArgs.chrsToSplitBy(hg18) )
splitCommands = map(lambda x: x[0], splits)
def mergeCommand1(i):

View File

@ -9,6 +9,7 @@ import faiReader
import math
import shutil
import string
import picard_utils
from madPipelineUtils import *
def main():
@ -42,7 +43,7 @@ def main():
inputBam, outputRoot = args[1:]
outputBamList = outputRoot + '.bams.list'
STAGES = ['targets', 'realign', 'index']
STAGES = ['targets', 'realign', 'index', 'merge']
for stage in stages:
if stage not in STAGES:
sys.exit('unknown stage ' + stage)
@ -57,6 +58,7 @@ def main():
return name in stages
out = open(outputBamList, 'w')
realignInfo = []
for chr in hg18:
lastJobs = None
@ -66,23 +68,30 @@ def main():
allJobs.append(newjobs)
if newjobs != []:
lastJobs = newjobs
return [], lastJobs
newJobs = []
return lastJobs
def execStage(name, func, args = [], lastJobs = []):
if OPTIONS.verbose: print 'Name is', name
newJobs, results = func(myPipelineArgs, chr, inputBam, outputRoot + '.' + chr, args, lastJobs)
if includeStage(name): newJobs, lastJobs = updateNewJobs(newJobs, lastJobs)
return newJobs, lastJobs, results
if includeStage(name): lastJobs = updateNewJobs(newJobs, lastJobs)
return lastJobs, results
newJobs, lastJobs, intervals = execStage('targets', createTargets)
newJobs, lastJobs, realignedBam = execStage('realign', realign, intervals, lastJobs)
lastJobs, intervals = execStage('targets', createTargets)
realignJobs, realignedBam = execStage('realign', realign, intervals, lastJobs)
realignInfo.append([realignJobs, realignedBam])
# need to merge and then index
newJobs, lastJobs, ignore = execStage('index', index, realignedBam, lastJobs)
indexJobs, ignore = execStage('index', index, realignedBam, realignJobs)
print >> out, os.path.abspath(realignedBam)
out.close()
if 'merge' in stages:
realignerJobs = []
if realignInfo[0][0] != []:
realignerJobs = map(lambda x: x[0][0], realignInfo)
mergerJob = mergeBams(myPipelineArgs, outputRoot + ".bam", map(lambda x: x[1], realignInfo), realignerJobs)
allJobs.append(mergerJob)
print 'EXECUTING JOBS'
executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
@ -99,5 +108,10 @@ def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
    """Pipeline stage: index the realigned BAM.

    Thin wrapper that delegates to indexBAMFile; inputBam/outputRoot/chr are
    accepted only to match the common stage signature used by execStage.
    """
    indexJob = indexBAMFile( myPipelineArgs.name, realignedBam, lastJobs )
    return indexJob
def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ):
    """Pipeline stage: merge per-chromosome realigned BAMs into one file.

    Builds a Picard merge command over bamsToMerge writing outputFilename,
    and returns a FarmJob for it whose dependencies are lastJobs (the
    per-chromosome realignment jobs), so the merge only runs after they finish.
    """
    # NOTE(review): removed leftover debug statement `print lastJobs`.
    cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 )
    return FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
# Script entry point: run the pipeline driver only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
main()