import sys
import os


def print_help():
    sys.stderr.write("\n" + os.path.split(sys.argv[0])[1] + " [file1] [file2] \n" + \
        "     Takes two tab-delimited tables and merges them, so that the output is sorted by genomic position.\n" + \
        "     Both input files must be in AnnotatorInputTable format (http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator#Data_Formats),\n" + \
        "     and must have identical headers.\n")

def read_header(file_obj):       
    for line in file_obj:
        line = line[0:-1] # Remove trailing \n
        if line.strip() != "" and line[0] != "#":
            return  line.split("\t")
    else:
        raise Exception, "Reached the end of the file without finding the header"
            

if len(sys.argv) != 3:
    print_help()
    sys.exit(0)

try:
    file1 = open(sys.argv[1])
    header1 = read_header(file1)
except Exception, e:
    sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e) + "\n")
    sys.exit(0)

try:
    file2 = open(sys.argv[2])
    header2 = read_header(file2)
except e:
    sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e) + "\n")
    sys.exit(0)


if len(header1) != len(header2):
    sys.stderr.write("ERROR: The two files' headers are of different lengths: \n" + str(header1) + "\n" + str(header2) + "\n")
    sys.exit(0)

if header1 != header2:
    sys.stderr.write("WARNING: The two files' headers are of different lengths: \nHeader1: " + str(header1) + "\nHeader2: " + str(header2) + "\nUsing header1.\n")
print("\t".join(header1))


def get_chrom(line):
    idx = line.find(":")
    if idx == -1:
        raise Exception, "Invalid file format. No ':' found in line, so couldn't parse chromosome name: " + line
    chrom = line[0:idx]
    return chrom
    
# Computes a sort key for chromosome names (UCSC order)
def compute_chrom_key(chr_value):
    a = 0
    chr_value = chr_value.lower()

    if chr_value.count("_random"):
        chr_value = chr_value.replace("_random", "")
        a = 30 # Offset so that "random" chromosomes go last                                                                                                                                                                               

    chr_value = chr_value.replace("chrm", "chr0").replace("chrx", "chr23").replace("chry", "chr24")
    chr_value = chr_value.replace("chr","")
    return a + int(chr_value) + 1

def compute_sort_key(line):
    idx = line.find('\t')
    if idx == -1:
        chrpos = line
    else:        
        chrpos = line[0:idx]

    idx = chrpos.find(":")
    if idx == -1:
        return chrpos
    chrom = chrpos[0:idx]
    pos = chrpos[idx+1:]

    idx = pos.find("-")
    if idx == -1:
        return int(pos)
    else:
        start = pos[0:idx]
        end = pos[idx+1:]
        return int(start)


def read_line(file_obj):
    try:
        line = file_obj.next()[0:-1] # Remove \n
        key = compute_sort_key(line)
        return (line, key)
    except StopIteration:
        return (None, None)
    except Exception, e:
        sys.stderr.write("ERROR: While reading file \"" + sys.argv[1] + "\": " + str(e) + "\n")
        sys.exit(0)


# Read the 1st lines of each file        
line1, key1 = read_line(file1)
line2, key2 = read_line(file2)


# Do a merge sort
while line1 != None or line2 != None: # Iterate over each chromosome
    # Compute the next chromosome
    if line1 != None and line2 != None:
        chrom1 = get_chrom(line1)
        chrom2 = get_chrom(line2)
        if compute_chrom_key(chrom1) < compute_chrom_key(chrom2):
            current_chrom = chrom1
        else:
            current_chrom = chrom2
    elif line1 != None:
        current_chrom = get_chrom(line1)
    elif line2 != None:
        current_chrom = get_chrom(line2)

    # Iterate over lines for that chromosome
    while line1 != None and line2 != None and get_chrom(line1) == current_chrom and get_chrom(line2) == current_chrom: 
        
        if key2 > key1:
            print(line1)
            #print("line1 -  key1: " + str(key1)  + " key2: " + str(key2)) 
            used_line1 = True            
        else:
            #print("line2 -  key1: " + str(key1)  + " key2: " + str(key2)) 
            print(line2)
            used_line1 = False
            
        if used_line1:
            line1, key1 = read_line(file1)
        else:
            line2, key2 = read_line(file2)

        
    # At this point, either line1 or line2 will == None
            
    while line1 != None and get_chrom(line1) == current_chrom:
        print(line1)
        line1, key1 = read_line(file1)

    while line2 != None and get_chrom(line2) == current_chrom:
        print(line2)
        line2, key2 = read_line(file2)