diff --git a/python/mergeVCFs.py b/python/mergeVCFs.py
index 93c30fed6..bca5db31c 100755
--- a/python/mergeVCFs.py
+++ b/python/mergeVCFs.py
@@ -12,6 +12,9 @@ def main():
     parser.add_option("-f", "--f", dest="fai",
                         type='string', default=None,
                         help="FAI file defining the sort order of the VCF")
+    parser.add_option("-a", "--assumeSorted", dest="assumeSorted",
+                        action='store_true', default=False,
+                        help="If provided, this assumes the input VCF files are themselves sorted, enabling a simple efficient merge")
 
     (OPTIONS, args) = parser.parse_args()
     if len(args) == 0:
@@ -21,6 +24,67 @@ def main():
     if OPTIONS.fai <> None:
         order = faiReader.readFAIContigOrdering(OPTIONS.fai)
         #print 'Order', order
+    if OPTIONS.assumeSorted:
+        mergeSort(args, order)
+    else:
+        memSort(args, order)
+
+def cmpVCFRecords(order, r1, r2):
+    """cmp-style comparison of two VCF records, for sort()/sorted().
+
+    When order is not None, order[str(record.getChrom())] is compared first;
+    on a tie, or when order is None, getPos() decides.  Returns whatever
+    cmp() returns for the deciding pair of values.
+    """
+    if order <> None:
+        c1 = order[str(r1.getChrom())]
+        c2 = order[str(r2.getChrom())]
+        orderCmp = cmp(c1, c2)
+        if orderCmp <> 0:
+            return orderCmp
+    return cmp(r1.getPos(), r2.getPos())
+
+def mergeSort(args, order):
+    """Print the input VCF files one after another, ordered by first record.
+
+    Only the first record of each file is read to decide the file order
+    (via cmpVCFRecords); every file is then printed in full, record by
+    record.  Records of different files are never interleaved, so the output
+    is only sorted when each file is sorted and the files do not overlap.
+    Files that yield no record are left out.  The header printed is the one
+    yielded by lines2VCF for the last file that had a record.
+    """
+    header = None
+
+    # Pair the first record of every file with that file's name.
+    orderMap = []
+    for file in args:
+        openedFile = open(file)
+        try:
+            for header, record, counter in lines2VCF(openedFile, extendedOutput = True, decodeAll = False):
+                orderMap.append([record, file])
+                break
+        finally:
+            # Close the handle even if reading the first record raises.
+            openedFile.close()
+
+    sortedOrderMap = sorted(orderMap, key=lambda x: x[0], cmp = lambda r1, r2: cmpVCFRecords(order, r1, r2))
+
+    # header is still None when no input file yielded a record.
+    if header <> None:
+        for headerLine in header: print headerLine
+    for file in map( lambda x: x[1], sortedOrderMap):
+        # Keep the handle so it can be closed once the file is printed.
+        openedFile = open(file)
+        try:
+            for record in lines2VCF(openedFile, extendedOutput = False, decodeAll = False):
+                print record.format()
+        finally:
+            openedFile.close()
+
+def memSort(args, order):
+    """Read the records of all input files into memory, sort them with
+    cmpVCFRecords and print the lines produced by formatVCF."""
     header = None
     records = []
     for file in args:
@@ -28,16 +92,7 @@ def main():
         for header, record, counter in lines2VCF(open(file), extendedOutput = True, decodeAll = False):
             records.append(record)
 
-    def cmpVCFRecords(r1, r2):
-        if order <> None:
-            c1 = order[str(r1.getChrom())]
-            c2 = order[str(r2.getChrom())]
-            orderCmp = cmp(c1, c2)
-            if orderCmp <> 0:
-                return orderCmp
-        return cmp(r1.getPos(), r2.getPos())
-
-    records.sort(cmpVCFRecords)
+    records.sort(lambda r1, r2: cmpVCFRecords(order, r1, r2))
     for line in formatVCF(header, records):
         #pass
         print line