# gatk-3.8/python/genomicAnnotatorScripts/MergeTwoTables.py

import sys
import os


def print_help():
    """Write the usage message for this script to standard error.

    The script name shown is the final path component of sys.argv[0].
    """
    script_name = os.path.basename(sys.argv[0])
    message_parts = (
        "\n",
        script_name,
        " [file1] [file2] \n",
        "     Takes two tab-delimited tables and merges them, so that the output is sorted by genomic position.\n",
        "     Both input files must be in AnnotatorInputTable format (http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator#Data_Formats),\n",
        "     and must have identical headers.\n",
    )
    # Emit the whole message with a single write, as one string.
    sys.stderr.write("".join(message_parts))

def read_header(file_obj):
    """Consume lines from file_obj up to and including the header row.

    Lines that are blank (whitespace only) or whose first character is '#'
    are skipped. The first remaining line is taken to be the header.

    Args:
        file_obj: an iterable of text lines (e.g. an open file). It is left
            positioned just after the header line.

    Returns:
        The header's column names, i.e. the line split on tab characters.

    Raises:
        Exception: if the input ends before a header line is found.
    """
    for line in file_obj:
        # Only drop a newline that is actually there, so that a final line
        # without a trailing newline does not lose its last character.
        if line.endswith("\n"):
            line = line[:-1]
        if line.strip() != "" and line[0] != "#":
            return line.split("\t")

    # Call-style raise is valid on both Python 2 and Python 3.
    raise Exception("Reached the end of the file without finding the header")
            

# Exactly two input tables are required; otherwise show the usage text.
if len(sys.argv) != 3:
    print_help()
    sys.exit(0)

# Open each input and consume its header row, leaving the file positioned at
# the first data line. Failures are reported on stderr with the name of the
# file that actually failed, and the script exits with a non-zero status so
# that callers can detect the error.
try:
    file1 = open(sys.argv[1])
    header1 = read_header(file1)
except Exception as e:
    sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e) + "\n")
    sys.exit(1)

try:
    file2 = open(sys.argv[2])
    header2 = read_header(file2)
except Exception as e:
    sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[2] + "\": " + str(e) + "\n")
    sys.exit(1)


# A different number of columns is fatal; differing column names only warn.
if len(header1) != len(header2):
    sys.stderr.write("ERROR: The two files' headers are of different lengths: \n" + str(header1) + "\n" + str(header2) + "\n")
    sys.exit(1)

if header1 != header2:
    sys.stderr.write("WARNING: The two files' headers are not the same: \nHeader1: " + str(header1) + "\nHeader2: " + str(header2) + "\nUsing header1.\n")

# The merged output always starts with the first file's header.
print("\t".join(header1))


def get_chrom(line):
    """Return the chromosome name from a data line.

    The chromosome is the text before the first ':' in the line's first
    tab-delimited column (e.g. "chr1" for "chr1:12345\\t...").

    Args:
        line: one data line, without its trailing newline.

    Returns:
        The chromosome name as it appears in the line.

    Raises:
        Exception: if the first column contains no ':'.
    """
    # Only the first column holds the position; a ':' in a later column must
    # not be mistaken for the chromosome separator.
    first_column = line.split("\t", 1)[0]
    idx = first_column.find(":")
    if idx == -1:
        raise Exception("Invalid file format. No ':' found in line, so couldn't parse chromosome name: " + line)
    return first_column[0:idx]
    
def compute_chrom_key(chr_value):
    """Compute an integer sort key for a chromosome name (UCSC order).

    The name is lower-cased, "chrm"/"chrx"/"chry" are mapped to 0/23/24, the
    "chr" prefix is removed, and the remaining number plus one is the key
    (so chrM -> 1, chr1 -> 2, chrX -> 24, chrY -> 25). Names containing
    "_random" get an extra offset of 30 so that they sort last.

    Raises:
        ValueError: if what is left of the name is not an integer.
    """
    name = chr_value.lower()

    offset = 0
    if "_random" in name:
        name = name.replace("_random", "")
        offset = 30  # Offset so that "random" chromosomes go last

    # Apply the same substitutions, in the same order, as a small table.
    for alias, numbered in (("chrm", "chr0"), ("chrx", "chr23"), ("chry", "chr24")):
        name = name.replace(alias, numbered)

    number = int(name.replace("chr", ""))
    return offset + number + 1

def compute_sort_key(line):
    """Compute the within-chromosome sort key for a data line.

    Only the first tab-delimited column is examined. For "chr:pos" the key is
    int(pos); for "chr:start-end" it is int(start). If the column contains no
    ':', the column text itself is returned unchanged.

    Raises:
        ValueError: if the position (or interval start) is not an integer.
    """
    chrpos = line.split("\t", 1)[0]

    _, colon, pos = chrpos.partition(":")
    if not colon:
        # No position present: fall back to the raw column text.
        return chrpos

    # For an interval only the start coordinate matters; for a single
    # position the split leaves it untouched.
    start = pos.split("-", 1)[0]
    return int(start)


def read_line(file_obj):
    """Read the next data line from file_obj together with its sort key.

    Args:
        file_obj: an iterator over text lines (e.g. an open file).

    Returns:
        A (line, key) tuple, where line has its trailing newline removed and
        key is compute_sort_key(line); or (None, None) once the input is
        exhausted.

    Any other error while reading or parsing is reported on stderr, naming
    the file being read, and terminates the script with exit status 1.
    """
    try:
        # The next() builtin works on Python 2.6+ and Python 3, unlike the
        # Python 2-only file_obj.next() method.
        line = next(file_obj)
        # Only strip a newline that is present, so a final line without one
        # keeps its last character.
        if line.endswith("\n"):
            line = line[:-1]
        key = compute_sort_key(line)
        return (line, key)
    except StopIteration:
        return (None, None)
    except Exception as e:
        # Report the file actually being read rather than always the first
        # command-line argument.
        file_name = getattr(file_obj, "name", "<unknown>")
        sys.stderr.write("ERROR: While reading file \"" + str(file_name) + "\": " + str(e) + "\n")
        sys.exit(1)


# Prime the merge with the first data line (and its sort key) of each input.
line1, key1 = read_line(file1)
line2, key2 = read_line(file2)


# Two-way merge, handled one chromosome at a time until both inputs are
# exhausted.
while not (line1 is None and line2 is None):
    # Pick the chromosome to emit next: whichever pending line has the
    # smaller chromosome key (the second file's on a tie), or the only
    # pending line when one input has run out.
    if line1 is None:
        current_chrom = get_chrom(line2)
    elif line2 is None:
        current_chrom = get_chrom(line1)
    else:
        chrom1 = get_chrom(line1)
        chrom2 = get_chrom(line2)
        if compute_chrom_key(chrom1) < compute_chrom_key(chrom2):
            current_chrom = chrom1
        else:
            current_chrom = chrom2

    # While both inputs have a pending line on this chromosome, emit the one
    # with the smaller key; on equal keys the second file's line goes first.
    while (line1 is not None and line2 is not None
           and get_chrom(line1) == current_chrom
           and get_chrom(line2) == current_chrom):
        if key2 > key1:
            print(line1)
            line1, key1 = read_line(file1)
        else:
            print(line2)
            line2, key2 = read_line(file2)

    # Here at most one input still has a pending line on this chromosome
    # (the other is exhausted or has moved on); flush whatever remains.
    while line1 is not None and get_chrom(line1) == current_chrom:
        print(line1)
        line1, key1 = read_line(file1)

    while line2 is not None and get_chrom(line2) == current_chrom:
        print(line2)
        line2, key2 = read_line(file2)
Script for concatenating 2 AnnotatorInputTables, and writing the result to standard out. Merge-sorts the 2 tables while concatenating them git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3505 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-09 06:44:16 +08:00			`import sys`
			`import os`


			`def print_help():`
Write error to stderr git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3514 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-10 01:09:10 +08:00			`sys.stderr.write("\n" + os.path.split(sys.argv[0])[1] + " [file1] [file2] \n" + \`
Script for concatenating 2 AnnotatorInputTables, and writing the result to standard out. Merge-sorts the 2 tables while concatenating them git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3505 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-09 06:44:16 +08:00			`" Takes two tab-delimited tables and merges them, so that the output is sorted by genomic position.\n" + \`
			`" Both input files must be in AnnotatorInputTable format (http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator#Data_Formats),\n" + \`
			`" and must have identical headers.\n")`

			`def read_header(file_obj):`
			`for line in file_obj:`
			`line = line[0:-1] # Remove trailing \n`
			`if line.strip() != "" and line[0] != "#":`
			`return line.split("\t")`
			`else:`
			`raise Exception, "Reached the end of the file without finding the header"`




			`if len(sys.argv) != 3:`
			`print_help()`
			`sys.exit(0)`

			`try:`
			`file1 = open(sys.argv[1])`
			`header1 = read_header(file1)`
			`except Exception, e:`
Write error to stderr git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3514 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-10 01:09:10 +08:00			`sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e) + "\n")`
Script for concatenating 2 AnnotatorInputTables, and writing the result to standard out. Merge-sorts the 2 tables while concatenating them git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3505 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-09 06:44:16 +08:00			`sys.exit(0)`

			`try:`
			`file2 = open(sys.argv[2])`
			`header2 = read_header(file2)`
Fixed typo git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3561 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-16 05:17:14 +08:00			`except Exception, e:`
Write error to stderr git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3514 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-10 01:09:10 +08:00			`sys.stderr.write("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e) + "\n")`
Script for concatenating 2 AnnotatorInputTables, and writing the result to standard out. Merge-sorts the 2 tables while concatenating them git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3505 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-09 06:44:16 +08:00			`sys.exit(0)`


Write error to stderr git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3514 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-10 01:09:10 +08:00			`if len(header1) != len(header2):`
			`sys.stderr.write("ERROR: The two files' headers are of different lengths: \n" + str(header1) + "\n" + str(header2) + "\n")`
Script for concatenating 2 AnnotatorInputTables, and writing the result to standard out. Merge-sorts the 2 tables while concatenating them git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3505 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-09 06:44:16 +08:00			`sys.exit(0)`

Write error to stderr git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3514 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-10 01:09:10 +08:00			`if header1 != header2:`
Fixed error message git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3653 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-28 22:50:28 +08:00			`sys.stderr.write("WARNING: The two files' headers are not the same: \nHeader1: " + str(header1) + "\nHeader2: " + str(header2) + "\nUsing header1.\n")`
Script for concatenating 2 AnnotatorInputTables, and writing the result to standard out. Merge-sorts the 2 tables while concatenating them git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3505 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-09 06:44:16 +08:00			`print("\t".join(header1))`


			`def get_chrom(line):`
			`idx = line.find(":")`
			`if idx == -1:`
			`raise Exception, "Invalid file format. No ':' found in line, so couldn't parse chromosome name: " + line`
			`chrom = line[0:idx]`
			`return chrom`

			`# Computes a sort key for chromosome names (UCSC order)`
			`def compute_chrom_key(chr_value):`
			`a = 0`
			`chr_value = chr_value.lower()`

			`if chr_value.count("_random"):`
			`chr_value = chr_value.replace("_random", "")`
			`a = 30 # Offset so that "random" chromosomes go last`

			`chr_value = chr_value.replace("chrm", "chr0").replace("chrx", "chr23").replace("chry", "chr24")`
			`chr_value = chr_value.replace("chr","")`
			`return a + int(chr_value) + 1`

			`def compute_sort_key(line):`
			`idx = line.find('\t')`
			`if idx == -1:`
			`chrpos = line`
			`else:`
			`chrpos = line[0:idx]`

			`idx = chrpos.find(":")`
			`if idx == -1:`
			`return chrpos`
			`chrom = chrpos[0:idx]`
			`pos = chrpos[idx+1:]`

			`idx = pos.find("-")`
			`if idx == -1:`
			`return int(pos)`
			`else:`
			`start = pos[0:idx]`
			`end = pos[idx+1:]`
			`return int(start)`


			`def read_line(file_obj):`
			`try:`
			`line = file_obj.next()[0:-1] # Remove \n`
			`key = compute_sort_key(line)`
			`return (line, key)`
			`except StopIteration:`
			`return (None, None)`
			`except Exception, e:`
Write error to stderr git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3514 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-10 01:09:10 +08:00			`sys.stderr.write("ERROR: While reading file \"" + sys.argv[1] + "\": " + str(e) + "\n")`
Script for concatenating 2 AnnotatorInputTables, and writing the result to standard out. Merge-sorts the 2 tables while concatenating them git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3505 348d0f76-0448-11de-a6fe-93d51630548a 2010-06-09 06:44:16 +08:00			`sys.exit(0)`


			`# Read the 1st lines of each file`
			`line1, key1 = read_line(file1)`
			`line2, key2 = read_line(file2)`


			`# Do a merge sort`
			`while line1 != None or line2 != None: # Iterate over each chromosome`
			`# Compute the next chromosome`
			`if line1 != None and line2 != None:`
			`chrom1 = get_chrom(line1)`
			`chrom2 = get_chrom(line2)`
			`if compute_chrom_key(chrom1) < compute_chrom_key(chrom2):`
			`current_chrom = chrom1`
			`else:`
			`current_chrom = chrom2`
			`elif line1 != None:`
			`current_chrom = get_chrom(line1)`
			`elif line2 != None:`
			`current_chrom = get_chrom(line2)`

			`# Iterate over lines for that chromosome`
			`while line1 != None and line2 != None and get_chrom(line1) == current_chrom and get_chrom(line2) == current_chrom:`

			`if key2 > key1:`
			`print(line1)`
			`#print("line1 - key1: " + str(key1) + " key2: " + str(key2))`
			`used_line1 = True`
			`else:`
			`#print("line2 - key1: " + str(key1) + " key2: " + str(key2))`
			`print(line2)`
			`used_line1 = False`

			`if used_line1:`
			`line1, key1 = read_line(file1)`
			`else:`
			`line2, key2 = read_line(file2)`



			`# At this point, either line1 or line2 will == None`

			`while line1 != None and get_chrom(line1) == current_chrom:`
			`print(line1)`
			`line1, key1 = read_line(file1)`

			`while line2 != None and get_chrom(line2) == current_chrom:`
			`print(line2)`
			`line2, key2 = read_line(file2)`