2009-04-10 04:10:41 +08:00
|
|
|
import farm_commands
|
|
|
|
|
import os.path
|
|
|
|
|
import sys
|
|
|
|
|
from optparse import OptionParser
|
|
|
|
|
from datetime import date
|
|
|
|
|
import glob
|
|
|
|
|
import operator
|
2009-04-23 03:39:07 +08:00
|
|
|
import ValidateGATK
|
2009-05-31 23:28:44 +08:00
|
|
|
import picard_utils
|
2009-04-10 04:10:41 +08:00
|
|
|
|
|
|
|
|
# Default path of the Picard MergeSamFiles jar; this is the default value of
# the -m/--mergeBin command-line option.
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
|
|
|
|
|
# Extension appended to each merged output filename.
bam_ext = '.bam'
|
|
|
|
|
|
2009-05-31 23:28:44 +08:00
|
|
|
def readNAIdMap(NAIdFile):
    """Read a whitespace-delimited NAID -> population table from a file.

    Every non-blank line must hold at least two whitespace-separated
    fields: the NAID followed by its population.  Any further fields on
    the line are ignored.  Each mapping is echoed to stdout as it is read,
    followed by a short summary of the whole table.

    Args:
        NAIdFile: path of the mapping file to read.

    Returns:
        A dict mapping each NAID string to its population string.

    Raises:
        AssertionError: if the same NAID appears on more than one line.
        ValueError: if a non-blank line has fewer than two fields.
    """
    m = dict()
    # try/finally (rather than leaving the handle to the garbage
    # collector) guarantees the file is closed even if a line is malformed.
    handle = open(NAIdFile)
    try:
        for line in handle:
            data = line.split()
            if not data:
                # Blank lines carry no mapping; skip them instead of
                # failing on the tuple unpack below.
                continue
            naid, pop = data[0:2]
            print('%s  =>  %s' % (naid, pop))
            assert naid not in m, 'Duplicate NAID in map file: ' + naid
            m[naid] = pop
    finally:
        handle.close()
    print('Read NAID->population map')
    print('Contains %d id -> population mappings' % len(m))
    print('Distinct populations: ' + str(picard_utils.unique(m.values())))
    return m
|
|
|
|
|
|
|
|
|
|
class MergeFilesSpec:
    """Description of one merge job: its input files and its output name.

    Attributes (set directly from the constructor arguments):
        sourceFiles: list of input file paths to be merged.
        pop: population label for this merge; '' when the inputs are not
            being split by population.
        merged_filename_base: base name of the merged output file.
    """

    def __init__(self, sources, pop, merged_filename_base):
        """Store the source list, population label and output base name."""
        self.sourceFiles = sources
        self.pop = pop
        self.merged_filename_base = merged_filename_base

    def sources(self):
        """Return the list of source files (the stored list, not a copy)."""
        return self.sourceFiles

    def filename(self):
        """Return the merged output name, without directory or extension.

        The population label is appended as a '.<pop>' suffix.  When the
        label is empty the bare base name is returned, so that output
        names never end in a stray '.' (which would otherwise produce
        names such as 'base..bam' once an extension is added).
        """
        if not self.pop:
            return self.merged_filename_base
        return self.merged_filename_base + '.' + self.pop
|
|
|
|
|
|
|
|
|
|
def splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
    """Group source files into one MergeFilesSpec per population.

    A source is assigned to a population when its path contains, as a
    substring, an NAID that maps to that population.

    Args:
        allSources: list of source file paths.
        merged_filename_base: base output name passed to every spec.
        NAID2Pop: dict mapping NAID -> population, or None to disable
            splitting by population.

    Returns:
        A list of MergeFilesSpec objects.  When NAID2Pop is None the list
        holds a single spec with all sources and an empty population.
        Otherwise it holds one spec per population that matched at least
        one source, in dict iteration order.

    Exits the program (via sys.exit) if a source path contains none of
    the NAIDs in NAID2Pop.
    """
    if NAID2Pop is None:
        return [MergeFilesSpec(allSources, '', merged_filename_base)]

    specs = dict()
    for source in allSources:
        # Populations this source has already been added to.  Without
        # this, a path matching several NAIDs of the same population
        # would be listed more than once as an input of the same merge.
        matched_pops = set()
        for naid, pop in NAID2Pop.items():
            if naid not in source or pop in matched_pops:
                continue
            matched_pops.add(pop)
            if pop not in specs:
                specs[pop] = MergeFilesSpec([], pop, merged_filename_base)
            specs[pop].sourceFiles.append(source)
        if not matched_pops:
            sys.exit('File contains an unknown NAID: ' + source)
    return list(specs.values())
|
|
|
|
|
|
2009-04-10 04:10:41 +08:00
|
|
|
if __name__ == "__main__":
    # Command-line driver.  The single positional argument is a merge-list
    # file: on each line the first whitespace-separated field is the base
    # name of the merged output and the remaining fields are glob patterns
    # selecting the files to merge.  Blank lines and lines starting with
    # '#' are skipped.
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-d", "--dir", dest="output_dir",
                      type="string", default="./",
                      help="Output directory")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores already written files, if present")
    parser.add_option("-m", "--mergeBin", dest="mergeBin",
                      type="string", default=MERGE_BIN,
                      help="Path to merge binary")
    parser.add_option("-n", "--naIDPops", dest="NAIDS2POP",
                      type="string", default=None,
                      help="Path to file contains NAID POP names. If provided, input files will be merged by population")

    (OPTIONS, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    directory = OPTIONS.output_dir
    if not os.path.exists(directory):
        # makedirs (not mkdir) so a nested output path can be created too.
        os.makedirs(directory)

    NAID2Pop = None
    if OPTIONS.NAIDS2POP is not None:
        NAID2Pop = readNAIdMap(OPTIONS.NAIDS2POP)

    # NOTE(review): time_stamp is not referenced anywhere in this block;
    # kept in case code outside this view relies on it.
    today = date.today()
    time_stamp = today.isoformat()

    merge_list = open(args[0])
    try:
        for line in merge_list:
            s = line.split()
            # Skip blank lines and comments.  startswith() also catches
            # comments written without a space after the '#'.
            if not s or s[0].startswith('#'):
                continue

            merged_filename_base = s[0]
            # Expand every glob pattern on the line into one flat list.
            allSources = [path for pattern in s[1:] for path in glob.glob(pattern)]

            print('Merging info:')
            for mergeFilesSpec in splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
                merged_name = mergeFilesSpec.filename()
                print('-----')
                print(' Population %s' % (mergeFilesSpec.pop,))
                print(' Filename %s' % (merged_name,))
                print(' N sources %d' % len(mergeFilesSpec.sources()))
                print(' sources %s' % (mergeFilesSpec.sources(),))

                output = os.path.join(directory, merged_name + '.stdout')
                output_filename = os.path.join(directory, merged_name + bam_ext)
                output_index = output_filename + ".bai"

                jobid = None
                merge_submitted = False
                if OPTIONS.ignoreExistingFiles or not os.path.exists(output_filename):
                    cmd = picard_utils.mergeBAMCmd(output_filename, mergeFilesSpec.sources(), OPTIONS.mergeBin)
                    print(cmd)
                    jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output)
                    merge_submitted = True

                # A freshly merged BAM makes any index already on disk
                # stale, so always re-index after submitting a merge.
                if merge_submitted or OPTIONS.ignoreExistingFiles or not os.path.exists(output_index):
                    cmd = "samtools index " + output_filename
                    # waitID is passed so the index job can be ordered
                    # after the merge job, when one was submitted.
                    jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, waitID = jobid)
    finally:
        merge_list.close()
|
2009-04-10 04:10:41 +08:00
|
|
|
|