2010-04-26 20:34:04 +08:00
|
|
|
from farm_commands2 import *
|
|
|
|
|
import os.path
|
|
|
|
|
import sys
|
|
|
|
|
from optparse import OptionParser
|
|
|
|
|
from datetime import date
|
|
|
|
|
import glob
|
|
|
|
|
import operator
|
|
|
|
|
import faiReader
|
|
|
|
|
import math
|
|
|
|
|
import shutil
|
|
|
|
|
import string
|
2010-05-12 21:37:39 +08:00
|
|
|
import picard_utils
|
2010-04-26 20:34:04 +08:00
|
|
|
from madPipelineUtils import *
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
|
global OPTIONS
|
|
|
|
|
usage = "usage: %prog [options] stages input.bam outputRoot"
|
|
|
|
|
parser = OptionParser(usage=usage)
|
|
|
|
|
parser.add_option("", "--dry", dest="dry",
|
|
|
|
|
action='store_true', default=False,
|
|
|
|
|
help="If provided, nothing actually gets run, just a dry run")
|
|
|
|
|
parser.add_option("", "--b36", dest="useB36",
|
|
|
|
|
action='store_true', default=False,
|
|
|
|
|
help="If provided, BAM is assumed to aligned to b36 named chromosomes")
|
|
|
|
|
parser.add_option("-v", "--verbose", dest="verbose",
|
|
|
|
|
action='store_true', default=False,
|
|
|
|
|
help="")
|
|
|
|
|
parser.add_option("-n", "--name", dest="name",
|
|
|
|
|
type="string", default="realignBamByChr",
|
|
|
|
|
help="Farm queue to send processing jobs to")
|
|
|
|
|
parser.add_option("-q", "--farm", dest="farmQueue",
|
|
|
|
|
type="string", default=None,
|
|
|
|
|
help="Farm queue to send processing jobs to")
|
|
|
|
|
parser.add_option("-e", "--extraArgs", dest="extraArgs",
|
|
|
|
|
type="string", default=None,
|
|
|
|
|
help="")
|
|
|
|
|
|
|
|
|
|
(OPTIONS, args) = parser.parse_args()
|
|
|
|
|
if len(args) != 3:
|
|
|
|
|
parser.error("incorrect number of arguments")
|
|
|
|
|
|
|
|
|
|
stages = map(string.lower, args[0].split(","))
|
|
|
|
|
inputBam, outputRoot = args[1:]
|
|
|
|
|
outputBamList = outputRoot + '.bams.list'
|
|
|
|
|
|
2010-05-12 21:37:39 +08:00
|
|
|
STAGES = ['targets', 'realign', 'index', 'merge']
|
2010-04-26 20:34:04 +08:00
|
|
|
for stage in stages:
|
|
|
|
|
if stage not in STAGES:
|
|
|
|
|
sys.exit('unknown stage ' + stage)
|
|
|
|
|
|
|
|
|
|
myPipelineArgs = PipelineArgs(name = OPTIONS.name)
|
|
|
|
|
if ( OPTIONS.useB36 ):
|
|
|
|
|
myPipelineArgs.ref = 'b36'
|
|
|
|
|
|
|
|
|
|
allJobs = []
|
|
|
|
|
|
|
|
|
|
def includeStage(name):
|
|
|
|
|
return name in stages
|
|
|
|
|
|
|
|
|
|
out = open(outputBamList, 'w')
|
2010-05-12 21:37:39 +08:00
|
|
|
realignInfo = []
|
2010-04-26 20:34:04 +08:00
|
|
|
for chr in hg18:
|
|
|
|
|
lastJobs = None
|
|
|
|
|
|
|
|
|
|
def updateNewJobs(newjobs, lastJobs):
|
|
|
|
|
if OPTIONS.verbose:
|
|
|
|
|
print 'New jobs', newjobs
|
|
|
|
|
allJobs.append(newjobs)
|
|
|
|
|
if newjobs != []:
|
|
|
|
|
lastJobs = newjobs
|
2010-05-12 21:37:39 +08:00
|
|
|
return lastJobs
|
2010-04-26 20:34:04 +08:00
|
|
|
|
|
|
|
|
def execStage(name, func, args = [], lastJobs = []):
|
|
|
|
|
if OPTIONS.verbose: print 'Name is', name
|
|
|
|
|
newJobs, results = func(myPipelineArgs, chr, inputBam, outputRoot + '.' + chr, args, lastJobs)
|
2010-05-12 21:37:39 +08:00
|
|
|
if includeStage(name): lastJobs = updateNewJobs(newJobs, lastJobs)
|
|
|
|
|
return lastJobs, results
|
2010-04-26 20:34:04 +08:00
|
|
|
|
2010-05-12 21:37:39 +08:00
|
|
|
lastJobs, intervals = execStage('targets', createTargets)
|
|
|
|
|
realignJobs, realignedBam = execStage('realign', realign, intervals, lastJobs)
|
|
|
|
|
realignInfo.append([realignJobs, realignedBam])
|
2010-04-26 20:34:04 +08:00
|
|
|
# need to merge and then index
|
2010-05-12 21:37:39 +08:00
|
|
|
indexJobs, ignore = execStage('index', index, realignedBam, realignJobs)
|
2010-04-26 20:34:04 +08:00
|
|
|
print >> out, os.path.abspath(realignedBam)
|
|
|
|
|
|
|
|
|
|
out.close()
|
2010-05-12 21:37:39 +08:00
|
|
|
|
|
|
|
|
if 'merge' in stages:
|
|
|
|
|
realignerJobs = []
|
|
|
|
|
if realignInfo[0][0] != []:
|
|
|
|
|
realignerJobs = map(lambda x: x[0][0], realignInfo)
|
|
|
|
|
mergerJob = mergeBams(myPipelineArgs, outputRoot + ".bam", map(lambda x: x[1], realignInfo), realignerJobs)
|
|
|
|
|
allJobs.append(mergerJob)
|
|
|
|
|
|
2010-04-26 20:34:04 +08:00
|
|
|
print 'EXECUTING JOBS'
|
|
|
|
|
executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
|
|
|
|
|
|
|
|
|
|
def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
|
|
|
|
|
outputIntervals = outputRoot + ".intervals"
|
2010-05-08 05:34:46 +08:00
|
|
|
GATKArgs = '-T RealignerTargetCreator -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr)
|
2010-04-26 20:34:04 +08:00
|
|
|
return simpleGATKCommand( myPipelineArgs, 'CreateInterval' + chr, GATKArgs, lastJobs ), outputIntervals
|
|
|
|
|
|
|
|
|
|
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
|
|
|
|
|
outputBAM = outputRoot + ".bam"
|
2010-05-08 05:34:46 +08:00
|
|
|
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr)
|
2010-04-26 20:34:04 +08:00
|
|
|
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
|
|
|
|
|
|
|
|
|
|
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
|
|
|
|
|
return indexBAMFile( myPipelineArgs.name, realignedBam, lastJobs )
|
|
|
|
|
|
2010-05-12 21:37:39 +08:00
|
|
|
def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ):
|
|
|
|
|
print lastJobs
|
|
|
|
|
cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 )
|
|
|
|
|
return FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
|
|
|
|
|
|
2010-04-26 20:34:04 +08:00
|
|
|
if __name__ == "__main__":
|
|
|
|
|
main()
|