#!/usr/bin/env python
import getopt, sys, os, string
# Absolute path of the FastaQuals2Fastq.py script.  run_MAQ invokes it with a
# FASTA file, a quals file and the FASTQ path to produce (presumably it merges
# bases and qualities into FASTQ -- confirm against the script itself).
FastaQuals2Fastq_exe = "/wga/dev/andrewk/Arachne/AlignerEvaluation/FastaQuals2Fastq.py"
def cmd(cmd_str, farm_queue=False, output_head=""):
    """Echo a shell command and, unless in dry-run mode, execute it.

    Args:
        cmd_str: Shell command line to run.
        farm_queue: Name of the LSF queue to submit the command to via
            ``bsub``.  Any false value (the default) runs the command
            directly instead of submitting it.
        output_head: Path prefix for the farm log; ``bsub`` is told to write
            its output to ``output_head + ".stdout"``.  Only used when
            farm_queue is set.

    Returns:
        The ``os.system`` status of the executed command, or None when the
        module-level ``justPrintCommands`` flag is set and nothing was run.
    """
    if farm_queue:
        farm_stdout = output_head + ".stdout"
        cmd_str = "bsub -q " + farm_queue + " -o " + farm_stdout + " " + cmd_str
        print("### Farming via " + cmd_str)
    else:
        print("### Executing " + cmd_str)

    if justPrintCommands:
        # Debugging output mode: the command has been echoed, do not run it.
        return None

    status = os.system(cmd_str)
    if status != 0:
        # Surface failures instead of silently dropping the exit status;
        # callers are free to keep ignoring the return value.
        sys.stderr.write("### WARNING: command exited with status %s: %s\n"
                         % (status, cmd_str))
    return status
def isFastaB(filename):
    """Report whether filename already carries the '.fastb' extension."""
    _, extension = os.path.splitext(filename)
    return extension == '.fastb'
def readListOfLanes(listFile):
    """Read (head, file) pairs from a whitespace-delimited list file.

    Every non-blank line must hold at least two whitespace-separated fields.
    The first field is returned as the head and the second as the file to
    process; any further fields are ignored.  Blank lines are skipped.

    Args:
        listFile: Path of the list file to read.

    Returns:
        A ``(heads, files)`` tuple of two equal-length lists, in file order.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    heads = []
    files = []
    # The context manager closes the handle even if a line is malformed.
    with open(listFile) as handle:
        for line_number, line in enumerate(handle, 1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 2:
                raise ValueError(
                    "%s:%d: expected '<head> <file>', got %r"
                    % (listFile, line_number, line.strip()))
            heads.append(fields[0])
            files.append(fields[1])
    return heads, files
def run_swmerlin(input_file, input_head, farm=""):
    """Run Merlin with its Smith-Waterman option enabled.

    Thin wrapper: forwards its arguments to run_merlin with sw=True.
    """
    run_merlin(input_file, input_head, farm=farm, sw=True)
def run_merlin(input_file, input_head, farm="", sw=False):
    """Launch a Merlin run for input_file, converting it to fastb first if needed.

    sw = Merlin Smith-Waterman option

    Args:
        input_file: Input reads; used as-is when it already ends in '.fastb'.
        input_head: Path prefix used for the derived '.fastb' file and for
            the Merlin output head.
        farm: Queue name handed to cmd(); a false value runs locally.
        sw: When true, append ``SW=True`` to the command and use the
            '.swmerlin' output suffix instead of '.merlin'.
    """
    input_fastb = input_file
    if not isFastaB(input_file):
        input_fastb = input_head + ".fastb"
        if not os.path.exists(input_fastb):
            # NOTE(review): assumes Fasta2Fastb writes input_head + ".fastb" -- confirm.
            cmd("Fasta2Fastb IN= " + input_file)

    output_head = input_head + (".swmerlin" if sw else ".merlin")

    reference_dir = "/seq/references/Homo_sapiens_assembly18/v0/"
    tokens = [
        "Merlin",
        "REF_FASTB=",
        reference_dir + "Homo_sapiens_assembly18.fasta.lookuptable.fastb",
        "REF_MERLIN=",
        reference_dir + "Homo_sapiens_assembly18.fasta.merlinref.bin",
        "FASTB=",
        input_fastb,
        "OUT_HEAD=" + output_head,
    ]
    if sw:
        tokens.append("SW=True")

    cmd(" ".join(tokens), farm, output_head)
# When True, run_ilt hands the lookup to the batchShortQueryLookup2.pl wrapper
# (which is given its own --BATCHQUEUE) instead of issuing a single
# ImperfectLookupTable command through cmd().  Set to False for the latter.
USE_BATCH = True


def run_ilt(input_file, input_head, farm=""):
    """Launch an ImperfectLookupTable run for input_file.

    This is the resolution of a merge conflict between a local copy (named
    ``run_ILT``, with the USE_BATCH path) and repository revision 1.5 (named
    ``run_ilt``).  The revision-1.5 name and '.ilt' output suffix are kept
    because the mapper dispatch table refers to ``run_ilt``; the batch path
    from the local copy is preserved behind USE_BATCH.

    Args:
        input_file: Input reads; used as-is when it already ends in '.fastb'.
        input_head: Path prefix used for the derived '.fastb' file and for
            the output head (``input_head + ".ilt"``).
        farm: Queue name handed to cmd() in the non-batch path; a false
            value runs locally.  Ignored when USE_BATCH is true, because the
            batch command line hard-codes ``--BATCHQUEUE=long``.
    """
    if isFastaB(input_file):
        input_fastb = input_file
    else:
        input_fastb = input_head + ".fastb"
        if not os.path.exists(input_fastb):
            # NOTE(review): assumes Fasta2Fastb writes input_head + ".fastb" -- confirm.
            cmd("Fasta2Fastb IN= " + input_file)

    output_head = input_head + ".ilt"
    lookup = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup"

    if USE_BATCH:
        cmd_str = ("~depristo/bin/batchShortQueryLookup2.pl --NUMPROCS=10 --BATCHQUEUE=long --SEQS="
                   + input_fastb + " --L=" + lookup + " --MAX_FREQ=1000 --O= " + output_head)
        # Never farmed through cmd(): the wrapper is passed its own queue.
        cmd(cmd_str, False, input_head)
    else:
        cmd_str = ("ImperfectLookupTable SEQS= " + input_fastb + " L= " + lookup
                   + " MAX_FREQ=1000 OUT_PREFIX= " + output_head)
        cmd(cmd_str, farm, output_head)


# Backward-compatible alias for the spelling used by the local side of the merge.
run_ILT = run_ilt
def run_MAQ(input_fasta, head, farm=""):
    """Prepare MAQ inputs for one read set and launch ``maq map``.

    Steps, each skipped when its output file already exists:
      1. build ``head.fastq`` from input_fasta and ``head.quals.txt`` with
         the FastaQuals2Fastq script;
      2. build ``head.bfq`` from the FASTQ with ``maq fastq2bfq``.
    The mapping command is then issued unconditionally.

    Args:
        input_fasta: FASTA file holding the reads.
        head: Path prefix for the quals, FASTQ, BFQ and MAQ output files.
        farm: Queue name handed to cmd() for the mapping step only; a false
            value runs it locally.
    """
    maq_binary = "/seq/dirseq/maq-0.7.1/maq"
    reference_bfa = "/seq/dirseq/ktibbett/maq-0.7.1-test/Homo_sapiens_assembly18.bfa"

    quals_path = head + ".quals.txt"
    fastq_path = head + ".fastq"
    if not os.path.exists(fastq_path):
        cmd(" ".join([FastaQuals2Fastq_exe, input_fasta, quals_path, fastq_path]))

    bfq_path = head + ".bfq"
    if not os.path.exists(bfq_path):
        cmd(" ".join([maq_binary, "fastq2bfq", fastq_path, bfq_path]))

    map_head = head + ".maq"
    map_output = map_head + ".out.aln.map"
    map_command = " ".join(
        [maq_binary, "map -e 100 -a 600 -s 0", map_output, reference_bfa, bfq_path])
    cmd(map_command, farm, map_head)
def usage():
    """Print the command-line help text to standard output."""
    help_lines = [
        "Required arguments:",
        " -i Input FASTA head (*.fasta, *.qualb)",
        " OR",
        " -d Directory to grab all FASTA files from",
        " OR",
        " -l List of FASTA/FASTB files to process",
        "",
        "Optional arguments:",
        " -f QUEUE Farm jobs to QUEUE on LSF",
        "",
        " -m MAPPER Compare output from MAPPER which can be: ilt, merlin, swmerlin, maq, all (default: all)",
        "",
        " -x Don't execute commands, just print them",
        "",
        # The closing parenthesis was missing in the original help text.
        " -w Output files to current directory (strip path from input file/dir/list)",
        "",
    ]
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("\n".join(help_lines))
def get_all_fasta_files(fasta_dir):
    """Return the non-empty ``*.fasta`` files found directly in fasta_dir.

    Args:
        fasta_dir: Directory to scan (not recursive); a trailing slash is
            optional.

    Returns:
        A list of paths (fasta_dir joined with each file name), sorted by
        file name so that jobs are spawned in a reproducible order.
    """
    fasta_files = []
    # os.listdir returns names in arbitrary order, hence the sort.
    for name in sorted(os.listdir(fasta_dir)):
        if not name.endswith(".fasta"):
            continue
        path = os.path.join(fasta_dir, name)
        # isfile() skips directories and broken symlinks that happen to be
        # named *.fasta; getsize() would raise on the latter.
        if os.path.isfile(path) and os.path.getsize(path) > 0:
            fasta_files.append(path)
    return fasta_files
# Dry-run switch read by cmd(): when True, commands are printed but not run.
# Set by the -x / --dontexe command-line option.
justPrintCommands = False


def main(argv=None):
    """Parse the command line and run the selected mapper(s) on every input.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Exits with status 2 (after printing the usage text) when the options
    cannot be parsed or no input was specified, and with an error message
    when the requested mapper is unknown.
    """
    global justPrintCommands

    if argv is None:
        argv = sys.argv[1:]

    try:
        # Long options that take a value need the trailing "="; without it
        # getopt treats e.g. --input as a bare flag and drops its argument.
        opts, _ = getopt.getopt(
            argv,
            "i:d:f:m:l:xw",
            ["input=", "fasta_dir=", "farm=", "mapper=", "listOfLanes=",
             "dontexe", "outputInWorkingDirectory"])
    except getopt.GetoptError:
        print(sys.argv)
        usage()
        sys.exit(2)

    input_head = ""
    fasta_dir = ""
    mapper_str = "all"
    farm_sub = False
    listOfLanes = None
    outputInWorkingDirectory = False

    for opt, arg in opts:
        # Echo each parsed option, as the original script did.
        print("%s %s" % (opt, arg))
        if opt in ("-i", "--input"):
            input_head = arg
        if opt in ("-l", "--listOfLanes"):
            listOfLanes = arg
        if opt in ("-d", "--fasta_dir"):
            fasta_dir = arg
        if opt in ("-f", "--farm"):
            farm_sub = arg
        if opt in ("-m", "--mapper"):
            mapper_str = arg
        if opt in ("-x", "--dontexe"):
            justPrintCommands = True
        if opt in ("-w", "--outputInWorkingDirectory"):
            outputInWorkingDirectory = True

    if input_head == "" and fasta_dir == "" and listOfLanes is None:
        print("%s %s %s" % (input_head, fasta_dir, listOfLanes))
        usage()
        sys.exit(2)

    # Select function(s) for mapper
    mapper_func_list = {"ilt": run_ilt, "merlin": run_merlin,
                        "swmerlin": run_swmerlin, "maq": run_MAQ}
    mapper_key = mapper_str.lower()
    if mapper_key == "all":
        mapper_list = list(mapper_func_list.values())
    elif mapper_key in mapper_func_list:
        mapper_list = [mapper_func_list[mapper_key]]
    else:
        sys.exit("Don't know of mapper argument: " + mapper_str)

    if input_head:
        # The original appended 'fasta' with no dot, which only works for a
        # head ending in "." (e.g. "lane1.").  Keep that form working and
        # also accept "lane1" and "lane1.fasta".
        if input_head.endswith(".fasta"):
            single_input = input_head
        elif input_head.endswith("."):
            single_input = input_head + "fasta"
        else:
            single_input = input_head + ".fasta"
        input_heads = [None]
        input_files = [single_input]
    elif listOfLanes is not None:
        input_heads, input_files = readListOfLanes(listOfLanes)
    else:
        input_files = get_all_fasta_files(fasta_dir)
        input_heads = [None] * len(input_files)

    for input_file, head in zip(input_files, input_heads):
        if head is None:
            # No explicit head: derive it from the file name, optionally
            # dropping the directory so outputs land in the working directory.
            file_head = os.path.splitext(input_file)[0]
            if outputInWorkingDirectory:
                file_head = os.path.split(file_head)[1]
        else:
            file_head = head
        for mapper in mapper_list:
            mapper(input_file, file_head, farm=farm_sub)
            # Blank line between the output of successive mapper launches.
            print("")


if __name__ == "__main__":
    main()