Validating walker for lots of bam files

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@10 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-02-28 17:05:08 +00:00
parent e892c3fd98
commit bd1fadd9fe
1 changed files with 92 additions and 0 deletions

View File

@ -0,0 +1,92 @@
#!/usr/bin/env python
import getopt, sys, os, string
from farm_commands import *
def picardCMD(name, **keywords ):
    """Build a command string of the form ``name KEY=VALUE KEY=VALUE ...``.

    name     -- the leading token of the command (a string).
    keywords -- each keyword argument becomes one ``KEY=VALUE`` token, with
                the value converted through ``str()``.

    Returns the assembled command as a single space-separated string.
    Values are not quoted or escaped; the token order follows the iteration
    order of the keyword dictionary.
    """
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    tokens = [key + "=" + str(value) for key, value in keywords.items()]
    return ' '.join([name] + tokens)
def spawnValidationJob( input_file, output_head, farm, maxErrors):
    """Launch one ValidateSAM job for input_file unless its output exists.

    input_file  -- path handed to the validator as ``I=``.
    output_head -- prefix for the job output; ``output_head + '.stdout'`` is
                   the file whose existence suppresses a re-run.
    farm        -- queue identifier forwarded to cmd(); any false value
                   ("", False, None) means "no farm".
    maxErrors   -- value handed to the validator as ``M=``.

    The job is skipped when the ``.stdout`` file already exists, unless the
    module-level flag regenExistingFiles is set.

    NOTE(review): cmd() is not defined in this file; presumably it comes
    from ``from farm_commands import *`` -- confirm how it treats a false
    ``farm`` value.
    """
    validate_exe = "ValidateSAM"
    output_file = output_head + '.stdout'

    # Guard clause: existing output is kept unless regeneration was requested.
    if os.path.exists(output_file) and not regenExistingFiles:
        return

    cmd_str = picardCMD( validate_exe, I=input_file, M=maxErrors )
    # The driver's default for "no farm" is False, not "", so test truthiness;
    # comparing against "" alone silently dropped the redirect below.
    if not farm:
        cmd_str += " > " + output_file
    cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
def usage():
    """Print the command-line help text to standard output."""
    help_lines = (
        "Required arguments:",
        " -d Directory to grab all sam/bam files from",
        "",
        "Optional arguments:",
        " -f QUEUE Farm jobs to QUEUE on LSF",
        "",
        " -m MAXERRORS Maximum number of errors to detect before aborting",
        "",
        # -r is accepted by the option parser but was missing from the help.
        " -r Re-run validation even when the output file already exists",
        "",
    )
    # A single-argument print(...) emits the same text on Python 2 and 3.
    for line in help_lines:
        print(line)
def get_all_sam_files(dir):
    """Return the paths of all .sam/.bam files found anywhere under `dir`.

    The tree is traversed with os.walk; a file matches when its extension,
    compared case-insensitively, is '.sam' or '.bam'.  Each returned path is
    the walked directory joined with the file name, in traversal order.
    """
    wanted_suffixes = ['.sam', '.bam']
    matches = []
    for root, _subdirs, names in os.walk(dir):
        matches.extend(
            os.path.join(root, name)
            for name in names
            if os.path.splitext(name)[1].lower() in wanted_suffixes
        )
    return matches
def output_filename( input_file ):
    """Derive a flat output name from an input path.

    The path is split on "/", components that are empty or whitespace-only
    are dropped, and the rest are joined with "." and suffixed with
    ".validation" (e.g. "/a/b/c.bam" -> "a.b.c.bam.validation").

    The list of kept components is also printed to standard output.
    """
    # A list comprehension yields a real list on both Python 2 and 3, so the
    # printed value stays a list (filter() is lazy on Python 3); `!=` replaces
    # the Python 2-only `<>` operator.
    parts = [part for part in input_file.split("/") if part.strip() != '']
    print(parts)
    return ".".join(parts) + ".validation"
# Forwarded to cmd() as just_print_commands by spawnValidationJob; nothing in
# this file sets it to True.  Presumably makes cmd() print instead of run --
# confirm against farm_commands.
justPrintCommands = False
# When True (set by the -r / --regenExistingFiles option), spawnValidationJob
# launches a job even if the corresponding .stdout file already exists.
regenExistingFiles = False
if __name__ == "__main__":
    # Command-line driver: find every sam/bam file under the -d directory and
    # spawn one validation job per file.
    try:
        # Long options that take a value need a trailing '=' in getopt;
        # without it "--dir=X" is rejected and "--dir X" yields an empty value.
        opts, args = getopt.getopt(
            sys.argv[1:],
            "d:f:m:r",
            ["dir=", "farm=", "maxErrors=", "regenExistingFiles"])
    except getopt.GetoptError:
        print(sys.argv)
        usage()
        sys.exit(2)

    search_dir = ""     # required; root directory to scan
    farm_sub = False    # queue identifier, or False when not farming
    maxErrors = 1000    # forwarded to the validator as M=

    for opt, arg in opts:
        # Echo each parsed option (same text as the old `print opt, arg`).
        print("%s %s" % (opt, arg))
        if opt in ("-d", "--dir"):
            search_dir = arg
        elif opt in ("-f", "--farm"):
            farm_sub = arg
        elif opt in ("-m", "--maxErrors"):
            maxErrors = arg
        elif opt in ("-r", "--regenExistingFiles"):
            # Rebinds the module-level flag read by spawnValidationJob.
            regenExistingFiles = True

    if search_dir == "":
        usage()
        sys.exit(2)

    input_files = get_all_sam_files(search_dir)
    print('Processing files: N= %d' % len(input_files))
    for input_file in input_files:
        print(' -> %s' % input_file)

    for input_file in input_files:
        output_file = output_filename( input_file )
        print("%s => %s" % (input_file, output_file))
        spawnValidationJob( input_file, output_file, farm_sub, maxErrors )