Script for concatenating 2 AnnotatorInputTables, and writing the result to standard out. Merge-sorts the 2 tables while concatenating them

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3505 348d0f76-0448-11de-a6fe-93d51630548a
2010-06-08 22:44:16 +00:00 · 2010-06-08 22:44:16 +00:00 · 3c24223d02
parent c2858c8988
commit 3c24223d02
1 changed files with 153 additions and 0 deletions
--- a/python/genomicAnnotatorScripts/MergeTwoTables.py
+++ b/python/genomicAnnotatorScripts/MergeTwoTables.py
@ -0,0 +1,153 @@
+import sys
+import os
+
+
+def print_help():
+    print("\n" + os.path.split(sys.argv[0])[1] + " [file1] [file2] \n" + \
+        "     Takes two tab-delimited tables and merges them, so that the output is sorted by genomic position.\n" + \
+        "     Both input files must be in AnnotatorInputTable format (http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator#Data_Formats),\n" + \
+        "     and must have identical headers.\n")
+
+def read_header(file_obj):       
+    for line in file_obj:
+        line = line[0:-1] # Remove trailing \n
+        if line.strip() != "" and line[0] != "#":
+            return  line.split("\t")
+    else:
+        raise Exception, "Reached the end of the file without finding the header"
+            
+
+
+
+if len(sys.argv) != 3:
+    print_help()
+    sys.exit(0)
+
+try:
+    file1 = open(sys.argv[1])
+    header1 = read_header(file1)
+except Exception, e:
+    print("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e))
+    sys.exit(0)
+
+try:
+    file2 = open(sys.argv[2])
+    header2 = read_header(file2)
+except e:
+    print("ERROR: While reading header from file \"" + sys.argv[1] + "\": " + str(e))
+    sys.exit(0)
+
+
+if header1 != header2:
+    print("ERROR: The two files have different headers: \n" + str(header1) + "\n" + str(header2))
+    sys.exit(0)
+
+print("\t".join(header1))
+
+
+def get_chrom(line):
+    idx = line.find(":")
+    if idx == -1:
+        raise Exception, "Invalid file format. No ':' found in line, so couldn't parse chromosome name: " + line
+    chrom = line[0:idx]
+    return chrom
+    
+# Computes a sort key for chromosome names (UCSC order)
+def compute_chrom_key(chr_value):
+    a = 0
+    chr_value = chr_value.lower()
+
+    if chr_value.count("_random"):
+        chr_value = chr_value.replace("_random", "")
+        a = 30 # Offset so that "random" chromosomes go last                                                                                                                                                                               
+
+    chr_value = chr_value.replace("chrm", "chr0").replace("chrx", "chr23").replace("chry", "chr24")
+    chr_value = chr_value.replace("chr","")
+    return a + int(chr_value) + 1
+
+def compute_sort_key(line):
+    idx = line.find('\t')
+    if idx == -1:
+        chrpos = line
+    else:        
+        chrpos = line[0:idx]
+
+    idx = chrpos.find(":")
+    if idx == -1:
+        return chrpos
+    chrom = chrpos[0:idx]
+    pos = chrpos[idx+1:]
+
+    idx = pos.find("-")
+    if idx == -1:
+        return int(pos)
+    else:
+        start = pos[0:idx]
+        end = pos[idx+1:]
+        return int(start)
+
+
+def read_line(file_obj):
+    try:
+        line = file_obj.next()[0:-1] # Remove \n
+        key = compute_sort_key(line)
+        return (line, key)
+    except StopIteration:
+        return (None, None)
+    except Exception, e:
+        print("ERROR: While reading file \"" + sys.argv[1] + "\": " + str(e))
+        sys.exit(0)
+
+
+# Read the 1st lines of each file        
+line1, key1 = read_line(file1)
+line2, key2 = read_line(file2)
+
+
+# Do a merge sort
+while line1 != None or line2 != None: # Iterate over each chromosome
+    # Compute the next chromosome
+    if line1 != None and line2 != None:
+        chrom1 = get_chrom(line1)
+        chrom2 = get_chrom(line2)
+        if compute_chrom_key(chrom1) < compute_chrom_key(chrom2):
+            current_chrom = chrom1
+        else:
+            current_chrom = chrom2
+    elif line1 != None:
+        current_chrom = get_chrom(line1)
+    elif line2 != None:
+        current_chrom = get_chrom(line2)
+
+    # Iterate over lines for that chromosome
+    while line1 != None and line2 != None and get_chrom(line1) == current_chrom and get_chrom(line2) == current_chrom: 
+        
+        if key2 > key1:
+            print(line1)
+            #print("line1 -  key1: " + str(key1)  + " key2: " + str(key2)) 
+            used_line1 = True            
+        else:
+            #print("line2 -  key1: " + str(key1)  + " key2: " + str(key2)) 
+            print(line2)
+            used_line1 = False
+            
+        if used_line1:
+            line1, key1 = read_line(file1)
+        else:
+            line2, key2 = read_line(file2)
+
+        
+
+    # At this point, either line1 or line2 will == None
+            
+    while line1 != None and get_chrom(line1) == current_chrom:
+        print(line1)
+        line1, key1 = read_line(file1)
+
+    while line2 != None and get_chrom(line2) == current_chrom:
+        print(line2)
+        line2, key2 = read_line(file2)
+        
+
+            
+