more powerful management routines for my pipeline

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3351 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-05-12 13:37:39 +00:00
parent 3f07611187
commit d3c33d4b3f
3 changed files with 34 additions and 16 deletions

View File

@ -109,7 +109,7 @@ def executeJob(job, farm_queue = None, just_print_commands = False, debug = True
job.jobID = justPrintJobIDCounter
justPrintJobIDCounter += 1
elif farm_queue:
print 'job.executionString', job.executionString
#print 'job.executionString', job.executionString
result = subprocess.Popen([job.executionString, ""], shell=True, stdout=subprocess.PIPE).communicate()[0]
p = re.compile('Job <(\d+)> is submitted to queue')
job.jobID = p.match(result).group(1)

View File

@ -17,11 +17,11 @@ GATK_JAR = GATK_STABLE_JAR
# add to GATK to enable dbSNP aware cleaning
# -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod
#hg18 = ['chrM'] + ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
#b36 = [str(i) for i in range(1,23)] + ['X', 'Y', 'MT']
hg18 = ['chrM'] + ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
b36 = [str(i) for i in range(1,23)] + ['X', 'Y', 'MT']
hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
b36 = [str(i) for i in range(1,23)] + ['X', 'Y']
#hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
#b36 = [str(i) for i in range(1,23)] + ['X', 'Y']
HG18_TO_B36 = {
'hg18' : 'b36',
@ -45,11 +45,12 @@ def appendExtension(path, newExt, addExtension = True):
# return os.path.join(OPTIONS.dir, s)
class PipelineArgs:
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g' ):
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g', excludeChrs = [] ):
self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
self.ref = ref
self.name = name
self.memory = memory
self.excludeChrs = excludeChrs
def convertToB36(self):
return self.ref == 'b36'
@ -71,6 +72,9 @@ class PipelineArgs:
cmd = hg18args_to_b36(cmd)
return cmd
def chrsToSplitBy(self, chrs):
    """Return the chromosomes in chrs minus any named in self.excludeChrs."""
    excluded = self.excludeChrs
    return filter(lambda chromosome: chromosome not in excluded, chrs)
#
# General features
#
@ -97,7 +101,7 @@ def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeComma
chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
return chrCmd, chrOutputMap
splits = map( makeChrCmd, hg18 )
splits = map( makeChrCmd, myPipelineArgs.chrsToSplitBy(hg18) )
splitCommands = map(lambda x: x[0], splits)
def mergeCommand1(i):

View File

@ -9,6 +9,7 @@ import faiReader
import math
import shutil
import string
import picard_utils
from madPipelineUtils import *
def main():
@ -42,7 +43,7 @@ def main():
inputBam, outputRoot = args[1:]
outputBamList = outputRoot + '.bams.list'
STAGES = ['targets', 'realign', 'index']
STAGES = ['targets', 'realign', 'index', 'merge']
for stage in stages:
if stage not in STAGES:
sys.exit('unknown stage ' + stage)
@ -57,6 +58,7 @@ def main():
return name in stages
out = open(outputBamList, 'w')
realignInfo = []
for chr in hg18:
lastJobs = None
@ -66,23 +68,30 @@ def main():
allJobs.append(newjobs)
if newjobs != []:
lastJobs = newjobs
return [], lastJobs
newJobs = []
return lastJobs
def execStage(name, func, args = [], lastJobs = []):
if OPTIONS.verbose: print 'Name is', name
newJobs, results = func(myPipelineArgs, chr, inputBam, outputRoot + '.' + chr, args, lastJobs)
if includeStage(name): newJobs, lastJobs = updateNewJobs(newJobs, lastJobs)
return newJobs, lastJobs, results
if includeStage(name): lastJobs = updateNewJobs(newJobs, lastJobs)
return lastJobs, results
newJobs, lastJobs, intervals = execStage('targets', createTargets)
newJobs, lastJobs, realignedBam = execStage('realign', realign, intervals, lastJobs)
lastJobs, intervals = execStage('targets', createTargets)
realignJobs, realignedBam = execStage('realign', realign, intervals, lastJobs)
realignInfo.append([realignJobs, realignedBam])
# need to merge and then index
newJobs, lastJobs, ignore = execStage('index', index, realignedBam, lastJobs)
indexJobs, ignore = execStage('index', index, realignedBam, realignJobs)
print >> out, os.path.abspath(realignedBam)
out.close()
if 'merge' in stages:
realignerJobs = []
if realignInfo[0][0] != []:
realignerJobs = map(lambda x: x[0][0], realignInfo)
mergerJob = mergeBams(myPipelineArgs, outputRoot + ".bam", map(lambda x: x[1], realignInfo), realignerJobs)
allJobs.append(mergerJob)
print 'EXECUTING JOBS'
executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
@ -99,5 +108,10 @@ def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
    """Pipeline stage: index the realigned BAM.

    Thin wrapper that delegates to indexBAMFile; inputBam/outputRoot/chr are
    accepted only to match the common stage signature used by execStage.
    """
    indexJob = indexBAMFile( myPipelineArgs.name, realignedBam, lastJobs )
    return indexJob
def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ):
    """Pipeline stage: merge per-chromosome realigned BAMs into one file.

    Builds a Picard merge command over bamsToMerge writing outputFilename,
    and returns a FarmJob for it whose dependencies are lastJobs (the
    per-chromosome realignment jobs), so the merge only runs after they finish.
    """
    # NOTE(review): removed leftover debug statement `print lastJobs`.
    cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 )
    return FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
# Script entry point: run the pipeline driver only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
main()