Improving, yet again, the merging of bam files
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@874 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c4cb867d74
commit
ae2eddec2d
|
|
@ -46,6 +46,7 @@ def main():
|
||||||
for geli in gelis:
|
for geli in gelis:
|
||||||
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
||||||
dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||||
|
if os.path.exists(dbsnp_matches):
|
||||||
TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
|
TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
|
||||||
nTotalSnps += int(TOTAL_SNPS)
|
nTotalSnps += int(TOTAL_SNPS)
|
||||||
nNovelSnps += int(NOVEL_SNPS)
|
nNovelSnps += int(NOVEL_SNPS)
|
||||||
|
|
@ -57,8 +58,8 @@ def main():
|
||||||
jobid = None
|
jobid = None
|
||||||
for geli, variantOut in zip(gelis, variantsOut):
|
for geli, variantOut in zip(gelis, variantsOut):
|
||||||
if not os.path.exists(variantOut):
|
if not os.path.exists(variantOut):
|
||||||
cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantsOut) )
|
cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) )
|
||||||
#jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands=False)
|
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands=False)
|
||||||
|
|
||||||
cmd = ("cat %s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
|
cmd = ("cat %s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
|
||||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands=False, waitID = jobid)
|
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands=False, waitID = jobid)
|
||||||
|
|
|
||||||
|
|
@ -7,56 +7,14 @@ import glob
|
||||||
import operator
|
import operator
|
||||||
import ValidateGATK
|
import ValidateGATK
|
||||||
import picard_utils
|
import picard_utils
|
||||||
|
from MergeBAMsUtils import *
|
||||||
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
|
|
||||||
bam_ext = '.bam'
|
|
||||||
|
|
||||||
def readNAIdMap(NAIdFile):
|
|
||||||
m = dict()
|
|
||||||
for data in [line.split() for line in open(NAIdFile)]:
|
|
||||||
naid, pop = data[0:2]
|
|
||||||
print naid, ' => ', pop
|
|
||||||
assert naid not in m
|
|
||||||
m[naid] = pop
|
|
||||||
print 'Read NAID->population map'
|
|
||||||
print 'Contains', len(m), 'id -> population mappings'
|
|
||||||
print 'Distinct populations:', picard_utils.unique(m.values())
|
|
||||||
return m
|
|
||||||
|
|
||||||
class MergeFilesSpec:
|
|
||||||
def __init__(self, sources, pop, merged_filename_base ):
|
|
||||||
self.sourceFiles = sources
|
|
||||||
self.pop = pop
|
|
||||||
self.merged_filename_base = merged_filename_base
|
|
||||||
|
|
||||||
def sources(self):
|
|
||||||
return self.sourceFiles
|
|
||||||
|
|
||||||
def filename(self):
|
|
||||||
return self.merged_filename_base + '.' + self.pop
|
|
||||||
|
|
||||||
def splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
|
def splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
|
||||||
if NAID2Pop == None:
|
sourcePairs = [[source, source] for source in allSources]
|
||||||
return [MergeFilesSpec(allSources, '', merged_filename_base)]
|
return groupSources(sourcePairs, NAID2Pop, merged_filename_base)
|
||||||
else:
|
|
||||||
specs = dict()
|
|
||||||
for source in allSources:
|
|
||||||
spec = None
|
|
||||||
for naid, pop in NAID2Pop.iteritems():
|
|
||||||
if source.find(naid) <> -1:
|
|
||||||
if pop in specs:
|
|
||||||
spec = specs[pop]
|
|
||||||
else:
|
|
||||||
spec = MergeFilesSpec([], pop, merged_filename_base)
|
|
||||||
specs[pop] = spec
|
|
||||||
#print 'Mapping', source, naid, pop
|
|
||||||
spec.sourceFiles.append(source)
|
|
||||||
if spec == None:
|
|
||||||
sys.exit('File contains an unknown NAID: ' + source)
|
|
||||||
return specs.values()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
usage = "usage: %prog [options]"
|
usage = "usage: %prog files.list [options]"
|
||||||
parser = OptionParser(usage=usage)
|
parser = OptionParser(usage=usage)
|
||||||
parser.add_option("-q", "--farm", dest="farmQueue",
|
parser.add_option("-q", "--farm", dest="farmQueue",
|
||||||
type="string", default=None,
|
type="string", default=None,
|
||||||
|
|
@ -64,11 +22,14 @@ if __name__ == "__main__":
|
||||||
parser.add_option("-d", "--dir", dest="output_dir",
|
parser.add_option("-d", "--dir", dest="output_dir",
|
||||||
type="string", default="./",
|
type="string", default="./",
|
||||||
help="Output directory")
|
help="Output directory")
|
||||||
|
parser.add_option("", "--dry", dest="dry",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="If provided, nothing actually gets run, just a dry run")
|
||||||
parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
|
parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
|
||||||
action='store_true', default=False,
|
action='store_true', default=False,
|
||||||
help="Ignores already written files, if present")
|
help="Ignores already written files, if present")
|
||||||
parser.add_option("-m", "--mergeBin", dest="mergeBin",
|
parser.add_option("-m", "--mergeBin", dest="mergeBin",
|
||||||
type="string", default=MERGE_BIN,
|
type="string", default=None,
|
||||||
help="Path to merge binary")
|
help="Path to merge binary")
|
||||||
parser.add_option("-n", "--naIDPops", dest="NAIDS2POP",
|
parser.add_option("-n", "--naIDPops", dest="NAIDS2POP",
|
||||||
type="string", default=None,
|
type="string", default=None,
|
||||||
|
|
@ -87,34 +48,23 @@ if __name__ == "__main__":
|
||||||
if OPTIONS.NAIDS2POP <> None:
|
if OPTIONS.NAIDS2POP <> None:
|
||||||
NAID2Pop = readNAIdMap(OPTIONS.NAIDS2POP)
|
NAID2Pop = readNAIdMap(OPTIONS.NAIDS2POP)
|
||||||
|
|
||||||
today = date.today()
|
|
||||||
time_stamp = today.isoformat()
|
|
||||||
|
|
||||||
for line in open(args[0]):
|
for line in open(args[0]):
|
||||||
s = line.split()
|
s = line.split()
|
||||||
if ( s <> [] and s[0] <> '#' ):
|
if ( s <> [] and s[0] <> '#' ):
|
||||||
merged_filename_base = s[0]
|
merged_filename_base = s[0]
|
||||||
allSources = reduce( operator.__add__, map( glob.glob, s[1:] ), [] )
|
allSources = reduce( operator.__add__, map( glob.glob, s[1:] ), [] )
|
||||||
print 'Merging info:'
|
print 'Merging info:'
|
||||||
for mergeFilesSpec in splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
|
for spec in splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
|
||||||
print '-----'
|
spec.setPath(directory)
|
||||||
print ' Population', mergeFilesSpec.pop
|
spec.pprint()
|
||||||
print ' Filename', mergeFilesSpec.filename()
|
|
||||||
print ' N sources', len(mergeFilesSpec.sources())
|
|
||||||
print ' sources', mergeFilesSpec.sources()
|
|
||||||
|
|
||||||
output = os.path.join(directory, mergeFilesSpec.filename() + '.stdout')
|
|
||||||
output_filename = os.path.join(directory, mergeFilesSpec.filename() + bam_ext)
|
|
||||||
output_index = output_filename + ".bai"
|
|
||||||
|
|
||||||
jobid = None
|
jobid = None
|
||||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(output_filename):
|
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
|
||||||
#cmd = 'java -Xmx4096m -jar ' + OPTIONS.mergeBin + ' MSD=true AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(sources))
|
output = spec.getMergedBase() + '.stdout'
|
||||||
cmd = picard_utils.mergeBAMCmd(output_filename, mergeFilesSpec.sources(), OPTIONS.mergeBin)
|
cmd = spec.mergeCmd(OPTIONS.mergeBin)
|
||||||
print cmd
|
print cmd
|
||||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output)
|
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
|
||||||
|
|
||||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(output_index):
|
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
|
||||||
cmd = "samtools index " + output_filename
|
jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, '', waitID = jobid, just_print_commands = OPTIONS.dry)
|
||||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, waitID = jobid)
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,209 @@
|
||||||
|
import farm_commands
|
||||||
|
import os.path
|
||||||
|
import sys
|
||||||
|
from optparse import OptionParser
|
||||||
|
from datetime import date
|
||||||
|
import glob
|
||||||
|
import operator
|
||||||
|
import ValidateGATK
|
||||||
|
import picard_utils
|
||||||
|
import operator
|
||||||
|
|
||||||
|
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
|
||||||
|
bam_ext = 'bam'
|
||||||
|
bam_index_ext = 'bai'
|
||||||
|
|
||||||
|
def readNAIdMap(NAIdFile):
|
||||||
|
m = dict()
|
||||||
|
for data in [line.split() for line in open(NAIdFile)]:
|
||||||
|
naid, pop = data[0:2]
|
||||||
|
print naid, ' => ', pop
|
||||||
|
assert naid not in m
|
||||||
|
m[naid] = pop
|
||||||
|
print 'Read NAID->population map'
|
||||||
|
print 'Contains', len(m), 'id -> population mappings'
|
||||||
|
print 'Distinct populations:', picard_utils.unique(m.values())
|
||||||
|
return m
|
||||||
|
|
||||||
|
_abbrevs = [
|
||||||
|
(1<<50L, 'P'),
|
||||||
|
(1<<40L, 'T'),
|
||||||
|
(1<<30L, 'G'),
|
||||||
|
(1<<20L, 'M'),
|
||||||
|
(1<<10L, 'k'),
|
||||||
|
(1, '')
|
||||||
|
]
|
||||||
|
|
||||||
|
def greek(size):
|
||||||
|
"""Return a string representing the greek/metric suffix of a size"""
|
||||||
|
for factor, suffix in _abbrevs:
|
||||||
|
if size > factor:
|
||||||
|
break
|
||||||
|
return '%.1f%s' % (float(size)/factor, suffix)
|
||||||
|
|
||||||
|
class MergeFilesSpec:
|
||||||
|
def __init__(self, sources, group, merged_filename_base, path = '' ):
|
||||||
|
self.sourceFiles = sources
|
||||||
|
self.groupName = group
|
||||||
|
self.merged_filename_base = merged_filename_base
|
||||||
|
self.path = ''
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return 'MergeFilesSpec: ' + self.group()
|
||||||
|
|
||||||
|
def group(self):
|
||||||
|
return self.groupName
|
||||||
|
|
||||||
|
def sources(self):
|
||||||
|
return self.sourceFiles
|
||||||
|
|
||||||
|
def filename(self):
|
||||||
|
if self.merged_filename_base <> None:
|
||||||
|
return self.merged_filename_base + '.' + self.group()
|
||||||
|
else:
|
||||||
|
return self.group()
|
||||||
|
|
||||||
|
def pprint(self):
|
||||||
|
print '--------------------------------------------------------------------------------'
|
||||||
|
print ' Population: ', self.group()
|
||||||
|
print ' Merged filename: ', self.getMergedBAM()
|
||||||
|
print ' N sources: ', len(self.sources())
|
||||||
|
print ' Sources: ', self.sources()
|
||||||
|
print ' Sizes: ', self.sourceSizes(humanReadable=True)
|
||||||
|
print ' Est. merged size: ', greek(reduce(operator.__add__, self.sourceSizes(), 0))
|
||||||
|
|
||||||
|
def setPath(self, path):
|
||||||
|
self.path = path
|
||||||
|
|
||||||
|
def getMergedBase(self):
|
||||||
|
return os.path.join(self.path, self.filename())
|
||||||
|
|
||||||
|
def getMergedBAM(self):
|
||||||
|
return self.getMergedBase() + '.' + bam_ext
|
||||||
|
|
||||||
|
def getMergedBAMIndex(self):
|
||||||
|
return self.getMergedBase() + '.' + bam_ext + '.' + bam_index_ext
|
||||||
|
|
||||||
|
def sourceSizes(self, humanReadable=False):
|
||||||
|
sizes = map( os.path.getsize, self.sources() )
|
||||||
|
if humanReadable:
|
||||||
|
sizes = map(greek, sizes)
|
||||||
|
return sizes
|
||||||
|
|
||||||
|
def mergeCmd(self, mergeBin = None):
|
||||||
|
if mergeBin == None:
|
||||||
|
mergeBin = MERGE_BIN
|
||||||
|
|
||||||
|
return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin)
|
||||||
|
|
||||||
|
def getIndexCmd(self):
|
||||||
|
return "samtools index " + self.getMergedBAM()
|
||||||
|
|
||||||
|
# Very general-purpose operation that returns merge specs given two lists of pairs:
|
||||||
|
# The first list, sources, contains superkey / sourceFile pairs.
|
||||||
|
# The second list, groups, contains key / group pairs.
|
||||||
|
#
|
||||||
|
# This function walks over all source pairs, and for each superkey, it
|
||||||
|
# looks for any key within groups contained within superkey. If it finds it,
|
||||||
|
# it associates sourceFile with the merge group in groups.
|
||||||
|
#
|
||||||
|
# The system requires that superkey match one (and only) one key in groups. It also
|
||||||
|
# requires that each group string be unique. The system can handle groups provided
|
||||||
|
# as a list of pairs of a dictionary.
|
||||||
|
#
|
||||||
|
# The function returns a list of MergeFileSpecs, one for each group with
|
||||||
|
# at least one associated sourceFile.
|
||||||
|
#
|
||||||
|
def groupSources(sources, groups, merged_filename_base):
|
||||||
|
if groups == None:
|
||||||
|
return [MergeFilesSpec(map( lambda x: x[1], sources), '', merged_filename_base)]
|
||||||
|
else:
|
||||||
|
specs = dict()
|
||||||
|
if type(groups) == list: groups = dict(groups)
|
||||||
|
|
||||||
|
for superkey, sourceFile in sources:
|
||||||
|
spec = None
|
||||||
|
for key, group in groups.iteritems():
|
||||||
|
#print 'Examining', superkey, key, group
|
||||||
|
if superkey.find(key) <> -1:
|
||||||
|
if group in specs:
|
||||||
|
spec = specs[group]
|
||||||
|
else:
|
||||||
|
spec = MergeFilesSpec([], group, merged_filename_base)
|
||||||
|
specs[group] = spec
|
||||||
|
print 'Mapping', group, key, superkey, sourceFile
|
||||||
|
spec.sourceFiles.append(sourceFile)
|
||||||
|
if spec == None:
|
||||||
|
sys.exit('File contains an unknown superkey: ' + superkey)
|
||||||
|
v = specs.values()
|
||||||
|
v.sort(key = MergeFilesSpec.group)
|
||||||
|
return v
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
class TestMergeBAMsUtils(unittest.TestCase):
|
||||||
|
def setUp(self):
|
||||||
|
import cStringIO
|
||||||
|
groupsString = """NA10846 ceu CEPH1
|
||||||
|
NA10847 ceu CEPH1
|
||||||
|
NA12144 ceu CEPH1
|
||||||
|
NA12145 ceu CEPH1
|
||||||
|
NA12146 yri CEPH1
|
||||||
|
NA12239 yri CEPH1
|
||||||
|
NA07029 ceu CEPH1
|
||||||
|
NA07019 ceu CEPH1
|
||||||
|
NA06994 ceu CEPH1
|
||||||
|
NA07000 ceu CEPH1
|
||||||
|
NA07022 ceu CEPH1
|
||||||
|
NA07056 ceu CEPH1
|
||||||
|
NA07048 ceu CEPH1
|
||||||
|
NA06991 ceu CEPH1
|
||||||
|
NA07034 ceu CEPH1
|
||||||
|
"""
|
||||||
|
lanesString = """NA10846 30GA9AAXX 1 Paired CEPH 30GA9AAXX.1.observed_genotypes.geli
|
||||||
|
NA10846 30GA9AAXX 6 Paired CEPH 30GA9AAXX.6.observed_genotypes.geli
|
||||||
|
NA10847 30GA9AAXX 7 Paired CEPH 30GA9AAXX.7.observed_genotypes.geli
|
||||||
|
NA12146 30JLTAAXX 2 Paired CEPH 30JLTAAXX.2.observed_genotypes.geli
|
||||||
|
NA12239 30PNVAAXX 1 Paired CEPH 30PNVAAXX.1.observed_genotypes.geli
|
||||||
|
NA12144 30PYMAAXX 1 Paired CEPH 30PYMAAXX.1.observed_genotypes.geli
|
||||||
|
NA12146 30PYMAAXX 7 Paired CEPH 30PYMAAXX.7.observed_genotypes.geli
|
||||||
|
"""
|
||||||
|
|
||||||
|
self.laneIDCounts = dict([["NA10846", 2], ["NA10847", 1], ["NA12146", 2], ["NA12239", 1], ["NA12144", 1]])
|
||||||
|
|
||||||
|
pops = [line.strip() for line in cStringIO.StringIO(groupsString)]
|
||||||
|
lanes = [line.strip() for line in cStringIO.StringIO(lanesString)]
|
||||||
|
|
||||||
|
print pops
|
||||||
|
print lanes
|
||||||
|
|
||||||
|
self.ids2pop = [line.split()[0:2] for line in pops]
|
||||||
|
self.ids2gelis = [[line.split()[0], line.split()[5]] for line in lanes]
|
||||||
|
self.ids2ids = dict([[line.split()[0]] * 2 for line in lanes])
|
||||||
|
|
||||||
|
def testPopGroups(self):
|
||||||
|
specs = groupSources(self.ids2gelis, self.ids2pop, 'foo')
|
||||||
|
print 'Specs', specs
|
||||||
|
self.assertEqual(len(specs), 2)
|
||||||
|
self.assertEqual(specs[0].group(), 'ceu')
|
||||||
|
self.assertEqual(specs[1].group(), 'yri')
|
||||||
|
|
||||||
|
ceu = specs[0]
|
||||||
|
yri = specs[1]
|
||||||
|
#print ceu.sources()
|
||||||
|
self.assertEqual(len(ceu.sources()), 4)
|
||||||
|
self.assertEqual(len(yri.sources()), 3)
|
||||||
|
self.assertEqual(ceu.getMergedBAM(), 'foo.ceu.bam')
|
||||||
|
self.assertEqual(ceu.getMergedBAMIndex(), 'foo.ceu.bam.bai')
|
||||||
|
self.assertEqual(yri.getMergedBAM(), 'foo.yri.bam')
|
||||||
|
self.assertEqual(yri.getMergedBAMIndex(), 'foo.yri.bam.bai')
|
||||||
|
|
||||||
|
def testIDGroups(self):
|
||||||
|
specs = groupSources(self.ids2gelis, self.ids2ids, 'foo')
|
||||||
|
self.assertEqual(len(specs), 5)
|
||||||
|
for spec in specs:
|
||||||
|
print 'Spec', spec
|
||||||
|
self.assertEqual(len(spec.sources()), self.laneIDCounts[spec.group()])
|
||||||
|
self.assertEqual(spec.getMergedBAM(), 'foo.' + spec.group() + '.bam')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
|
|
@ -0,0 +1,79 @@
|
||||||
|
import farm_commands
|
||||||
|
import os.path
|
||||||
|
import sys
|
||||||
|
from optparse import OptionParser
|
||||||
|
import picard_utils
|
||||||
|
from MergeBAMsUtils import *
|
||||||
|
|
||||||
|
def splitSourcesByKeys( bams, keys ):
|
||||||
|
keyPairs = [[key, key] for key in keys]
|
||||||
|
keybamPairs = zip(keys, bams)
|
||||||
|
return groupSources(keybamPairs, keyPairs, None)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
usage = """usage: %prog bams.list [options]
|
||||||
|
Merges BAM files by keys from a file of a list of bams.
|
||||||
|
|
||||||
|
bams.list is a whitespace separated file. One column (--keyCol arg) is the key, and another
|
||||||
|
column (--bamCol) is a path to a bam file. This program will group the bam files
|
||||||
|
by key and spawn merge and index jobs to merge all of the files sharing the same key together"""
|
||||||
|
|
||||||
|
parser = OptionParser(usage=usage)
|
||||||
|
parser.add_option("-q", "--farm", dest="farmQueue",
|
||||||
|
type="string", default=None,
|
||||||
|
help="Farm queue to send processing jobs to")
|
||||||
|
parser.add_option("-d", "--dir", dest="output_dir",
|
||||||
|
type="string", default="./",
|
||||||
|
help="Output directory")
|
||||||
|
parser.add_option("", "--dry", dest="dry",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="If provided, nothing actually gets run, just a dry run")
|
||||||
|
parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="Ignores already written files, if present")
|
||||||
|
parser.add_option("-m", "--mergeBin", dest="mergeBin",
|
||||||
|
type="string", default=None,
|
||||||
|
help="Path to merge binary")
|
||||||
|
parser.add_option("", "--keyCol", dest="keyCol",
|
||||||
|
type=int, default=1,
|
||||||
|
help="Column in the list file holding the key")
|
||||||
|
parser.add_option("", "--bamCol", dest="bamCol",
|
||||||
|
type=int, default=2,
|
||||||
|
help="Column in the list file holding the bam file path")
|
||||||
|
parser.add_option("-l", "--link", dest="link",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="If true, program will soft link single bam files that don't need merging")
|
||||||
|
|
||||||
|
(OPTIONS, args) = parser.parse_args()
|
||||||
|
if len(args) != 1:
|
||||||
|
parser.error("incorrect number of arguments")
|
||||||
|
|
||||||
|
directory = OPTIONS.output_dir
|
||||||
|
|
||||||
|
if not os.path.exists(directory):
|
||||||
|
os.mkdir(directory)
|
||||||
|
|
||||||
|
bamsList = [line.strip().split() for line in open(args[0])]
|
||||||
|
keys = map( lambda x: x[OPTIONS.keyCol-1], bamsList )
|
||||||
|
bams = map( lambda x: x[OPTIONS.bamCol-1], bamsList )
|
||||||
|
|
||||||
|
print 'Merging info:'
|
||||||
|
for info in bamsList: print info
|
||||||
|
for spec in splitSourcesByKeys(bams, keys):
|
||||||
|
spec.setPath(directory)
|
||||||
|
spec.pprint()
|
||||||
|
|
||||||
|
jobid = None
|
||||||
|
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
|
||||||
|
output = spec.getMergedBase() + '.stdout'
|
||||||
|
if len(spec.sources()) == 1 and OPTIONS.link:
|
||||||
|
cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM()
|
||||||
|
else:
|
||||||
|
cmd = spec.mergeCmd(OPTIONS.mergeBin)
|
||||||
|
print cmd
|
||||||
|
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
|
||||||
|
|
||||||
|
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
|
||||||
|
pass
|
||||||
|
jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, '', waitID = jobid, just_print_commands = OPTIONS.dry)
|
||||||
|
|
||||||
Loading…
Reference in New Issue