#!/usr/bin/env python
"""Spawn one ValidateSAM job per SAM/BAM file found under a directory.

Usage: SpawnValidationJobs.py -d DIR [-f QUEUE] [-m MAXERRORS] [-r]

The print calls below always pass a single pre-formatted string so that the
output is identical whether the interpreter treats print as a statement
(Python 2) or as a function (Python 3).
"""

import getopt
import sys
import os
import string
from farm_commands import *


def picardCMD(name, **keywords):
    """Return a command line of the form 'NAME KEY=VALUE KEY=VALUE ...'.

    name     -- the executable / tool name that starts the command line.
    keywords -- each keyword argument is appended as KEY=VALUE, with the
                value converted via str().
    """
    pieces = [name]
    for key, value in keywords.items():
        pieces.append(key + "=" + str(value))
    return ' '.join(pieces)


def spawnValidationJob(input_file, output_head, farm, maxErrors):
    """Run (or submit) a ValidateSAM job for input_file.

    input_file  -- path of the SAM/BAM file to validate.
    output_head -- prefix of the result file; results go to
                   output_head + '.stdout'.
    farm        -- queue name to submit to, or a false value ('' / False)
                   to run without a queue.
    maxErrors   -- passed to the validator as M=<maxErrors>.

    Nothing is run when the result file already exists, unless the
    module-level flag regenExistingFiles is set.
    """
    validate_exe = "ValidateSAM"
    output_file = output_head + '.stdout'

    if not regenExistingFiles and os.path.exists(output_file):
        return

    cmd_str = picardCMD(validate_exe, I=input_file, M=maxErrors)
    # The caller's default for "no queue" is False, not "", so test for
    # falsiness; comparing against "" alone never added the redirect.
    # NOTE(review): assumes cmd() itself sends a farmed job's stdout to
    # output_head + '.stdout' -- confirm in farm_commands.
    if not farm:
        cmd_str += " > " + output_file
    # cmd() is not defined in this file; presumably provided by the
    # star import from farm_commands.
    cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)


def usage():
    """Print the command-line help text to stdout."""
    help_lines = (
        "Required arguments:",
        " -d Directory to grab all sam/bam files from",
        "",
        "Optional arguments:",
        " -f QUEUE Farm jobs to QUEUE on LSF",
        "",
        " -m MAXERRORS Maximum number of errors to detect before aborting",
        "",
        " -r Re-run validation even if the output file already exists",
        "",
    )
    for line in help_lines:
        print(line)


def get_all_sam_files(dir):
    """Return paths of all .sam/.bam files (case-insensitive) under dir.

    The directory tree is walked recursively with os.walk; each returned
    path is os.path.join(dirpath, filename).
    """
    wanted_exts = ('.sam', '.bam')
    files = []

    for dirpath, dirnames, filenames in os.walk(dir):
        for filename in filenames:
            ext = os.path.splitext(filename)[1]
            if ext.lower() in wanted_exts:
                files.append(os.path.join(dirpath, filename))

    return files


def output_filename(input_file):
    """Flatten a '/'-separated path into a single dotted output prefix.

    Empty / whitespace-only path components are dropped, the rest are
    joined with '.', and '.validation' is appended, e.g.
    '/a/b/c.bam' -> 'a.b.c.bam.validation'.
    """
    parts = [part for part in input_file.split("/") if part.strip() != '']
    print(parts)  # trace of the path components, kept from the original
    return ".".join(parts) + ".validation"


# Module-level switches read by spawnValidationJob(). The driver below runs
# at module scope, so its assignment to regenExistingFiles updates this flag.
justPrintCommands = False
regenExistingFiles = False

if __name__ == "__main__":
    try:
        # A trailing "=" marks a long option that takes a value; without it
        # getopt rejects "--dir=PATH" and treats the PATH in "--dir PATH"
        # as a positional argument.
        opts, args = getopt.getopt(
            sys.argv[1:], "d:f:m:r",
            ["dir=", "farm=", "maxErrors=", "regenExistingFiles"])
    except getopt.GetoptError:
        print(sys.argv)
        usage()
        sys.exit(2)

    sam_dir = ""
    farm_sub = False
    maxErrors = 1000

    for opt, arg in opts:
        print("%s %s" % (opt, arg))
        if opt in ("-d", "--dir"):
            sam_dir = arg
        elif opt in ("-f", "--farm"):
            farm_sub = arg
        elif opt in ("-m", "--maxErrors"):
            maxErrors = arg
        elif opt in ("-r", "--regenExistingFiles"):
            regenExistingFiles = True

    # The input directory is the only mandatory argument.
    if sam_dir == "":
        usage()
        sys.exit(2)

    input_files = get_all_sam_files(sam_dir)
    print("Processing files: N= %d" % len(input_files))
    for input_file in input_files:
        print(" -> %s" % input_file)

    for input_file in input_files:
        output_file = output_filename(input_file)
        print("%s => %s" % (input_file, output_file))
        spawnValidationJob(input_file, output_file, farm_sub, maxErrors)