gatk-3.8/python/genomicAnnotatorScripts/MergeTwoTables.py

156 lines
4.7 KiB
Python
Raw Normal View History

import sys
import os
def print_help():
sys.stderr.write("\n" + os.path.split(sys.argv[0])[1] + " [file1] [file2] \n" + \
" Takes two tab-delimited tables and merges them, so that the output is sorted by genomic position.\n" + \
" Both input files must be in AnnotatorInputTable format (http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator#Data_Formats),\n" + \
" and must have identical headers.\n")
def read_header(file_obj):
for line in file_obj:
line = line[0:-1] # Remove trailing \n
if line.strip() != "" and line[0] != "#":
return line.split("\t")
else:
raise Exception, "Reached the end of the file without finding the header"
if len(sys.argv) != 3:
print_help()
sys.exit(0)
try:
file1 = open(sys.argv[1])
header1 = read_header(file1)
except Exception, e:
sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e) + "\n")
sys.exit(0)
try:
file2 = open(sys.argv[2])
header2 = read_header(file2)
except Exception, e:
sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e) + "\n")
sys.exit(0)
if len(header1) != len(header2):
sys.stderr.write("ERROR: The two files' headers are of different lengths: \n" + str(header1) + "\n" + str(header2) + "\n")
sys.exit(0)
if header1 != header2:
sys.stderr.write("WARNING: The two files' headers are not the same: \nHeader1: " + str(header1) + "\nHeader2: " + str(header2) + "\nUsing header1.\n")
print("\t".join(header1))
def get_chrom(line):
idx = line.find(":")
if idx == -1:
raise Exception, "Invalid file format. No ':' found in line, so couldn't parse chromosome name: " + line
chrom = line[0:idx]
return chrom
# Computes a sort key for chromosome names (UCSC order)
def compute_chrom_key(chr_value):
a = 0
chr_value = chr_value.lower()
if chr_value.count("_random"):
chr_value = chr_value.replace("_random", "")
a = 30 # Offset so that "random" chromosomes go last
chr_value = chr_value.replace("chrm", "chr0").replace("chrx", "chr23").replace("chry", "chr24")
chr_value = chr_value.replace("chr","")
return a + int(chr_value) + 1
def compute_sort_key(line):
idx = line.find('\t')
if idx == -1:
chrpos = line
else:
chrpos = line[0:idx]
idx = chrpos.find(":")
if idx == -1:
return chrpos
chrom = chrpos[0:idx]
pos = chrpos[idx+1:]
idx = pos.find("-")
if idx == -1:
return int(pos)
else:
start = pos[0:idx]
end = pos[idx+1:]
return int(start)
def read_line(file_obj):
try:
line = file_obj.next()[0:-1] # Remove \n
key = compute_sort_key(line)
return (line, key)
except StopIteration:
return (None, None)
except Exception, e:
sys.stderr.write("ERROR: While reading file \"" + sys.argv[1] + "\": " + str(e) + "\n")
sys.exit(0)
# Read the 1st lines of each file
line1, key1 = read_line(file1)
line2, key2 = read_line(file2)
# Do a merge sort
while line1 != None or line2 != None: # Iterate over each chromosome
# Compute the next chromosome
if line1 != None and line2 != None:
chrom1 = get_chrom(line1)
chrom2 = get_chrom(line2)
if compute_chrom_key(chrom1) < compute_chrom_key(chrom2):
current_chrom = chrom1
else:
current_chrom = chrom2
elif line1 != None:
current_chrom = get_chrom(line1)
elif line2 != None:
current_chrom = get_chrom(line2)
# Iterate over lines for that chromosome
while line1 != None and line2 != None and get_chrom(line1) == current_chrom and get_chrom(line2) == current_chrom:
if key2 > key1:
print(line1)
#print("line1 - key1: " + str(key1) + " key2: " + str(key2))
used_line1 = True
else:
#print("line2 - key1: " + str(key1) + " key2: " + str(key2))
print(line2)
used_line1 = False
if used_line1:
line1, key1 = read_line(file1)
else:
line2, key2 = read_line(file2)
# At this point, either line1 or line2 will == None
while line1 != None and get_chrom(line1) == current_chrom:
print(line1)
line1, key1 = read_line(file1)
while line2 != None and get_chrom(line2) == current_chrom:
print(line2)
line2, key2 = read_line(file2)