Changes from James Pirruccello: the script now handles differences between UCSC and NCBI tables, sorts correctly regardless of whether contig names carry the 'chr' prefix, and converts NCBI format to UCSC format for use by the GenomicAnnotator.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4132 348d0f76-0448-11de-a6fe-93d51630548a
2010-08-26 19:02:29 +00:00 · 2010-08-26 19:02:29 +00:00 · 7671502e1b
parent 8931a63588
commit 7671502e1b
1 changed files with 30 additions and 6 deletions
--- a/python/genomicAnnotatorScripts/ConvertTableToAnnotatorRod.py
+++ b/python/genomicAnnotatorScripts/ConvertTableToAnnotatorRod.py
@ -46,7 +46,6 @@ group.add_option("-v", "--verbose", action="store_true", default=False,
 group.add_option("-d", "--delimiter",                                                                        help="The delimiter that separates values in a line of INPUT-FILE. Set to 'tab' to make it use tab [Default: spaces].")


-
 parser.add_option_group(group)

 (options, args) = parser.parse_args()
@ -84,7 +83,7 @@ def line_key(line):
 def chrpos_to_n(lsplit):
    # Get chr, pos from line

-    chr_value, start_value = None, None # Init in case of error
+    chr_value, start_value, chr_prefix = None, None, '' # Init in case of error
    try:
        split1 = lsplit[0].split(":") # Get chr:start-stop out of the 1st column.
        chr_value = split1[0].lower().strip()
@ -95,6 +94,11 @@ def chrpos_to_n(lsplit):
        if len(split2) > 1:
            stop_value = split2[1].lower().strip()
            stop_n = long(stop_value)
+        # Record whether the contig name contains "chr" (substring check), so the replacements below use the matching prefix
+        if chr_value.count("chr"):
+            chr_prefix = "chr"
+        else:
+            chr_prefix = ""
    except:
        sys.stderr.write("chrom: %s, start: %s. Couldn't parse line: %s \n" % (chr_value, start_value, line))
        raise
@ -106,11 +110,11 @@ def chrpos_to_n(lsplit):
        a = 30 # Offset so that "random" chromosomes go last

    if sequence_build == "UCSC":
-        chr_value = chr_value.replace("chrm", "chr0")
+        chr_value = chr_value.replace(chr_prefix+"m", chr_prefix+"0")
    else:
-        chr_value = chr_value.replace("chrm", "chr25")
+        chr_value = chr_value.replace(chr_prefix+"m", chr_prefix+"25")

-    chr_n = a + int(chr_value.replace("chrx", "chr23").replace("chry", "chr24").replace("chr","")) + 1
+    chr_n = a + int(chr_value.replace(chr_prefix+"x", chr_prefix+"23").replace(chr_prefix+"y", chr_prefix+"24").replace(chr_prefix,"")) + 1

    N = (chr_n * 10L**23) + (start_n * 10L**11) + stop_n # Combine chr, start, stop into a single numeric key for sorting

@ -372,7 +376,24 @@ for line in open(input_filename):
                line_fields[start_column] = str(start_int) # Change the original column in case keep_copy is True

            chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int )
-
+            #@JAMES@
+            #print(chrpos_value)
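+            # Record whether the value contains "chr". NOTE(review): in the NCBI/"chr" branch below, .replace("chr","") binds to the format literal before %, so the prefix is not stripped - confirm intent.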
+            if chrpos_value.count("chr"):
+                chr_prefix = "chr"
+            else:
+                chr_prefix = ""
+                
+            if sequence_build == "UCSC" and chr_prefix == "chr":
+                chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int )
+            elif sequence_build == "UCSC" and chr_prefix != "chr":
+                chrpos_value = "chr%s:%d" % ( line_fields[chr_column], start_int )
+            elif sequence_build == "NCBI" and chr_prefix == "chr":
+                chrpos_value = "%s:%d".replace("chr","") % ( line_fields[chr_column], start_int )
+            elif sequence_build == "NCBI" and chr_prefix != "chr":
+                chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int )
+            
+            #/JAMES
            if stop_column:
                try: stop_int = long(line_fields[stop_column])
                except: error("Line #%d, Column %d: stop coordinate value '%s' is not an integer" % (counter, stop_column, line_fields[stop_column]))
@ -485,6 +506,9 @@ for line in data_lines:
        else:
            output_file.write(line[3:] + "\n")
    else:
+        #if sequence_build == "UCSC":
+        #    output_file.write("chr" + line + "\n")
+        #else:
        output_file.write(line + "\n")

 output_file.close()