Changes from James Pirruccello: can now handle differences between UCSC and NCBI tables, sorts properly despite the contig prefix differences (presence or absence of 'chr'), and converts NCBI format to UCSC format for use by the GenomicAnnotator.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4132 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8931a63588
commit
7671502e1b
|
|
@ -46,7 +46,6 @@ group.add_option("-v", "--verbose", action="store_true", default=False,
|
|||
group.add_option("-d", "--delimiter", help="The delimiter that separates values in a line of INPUT-FILE. Set to 'tab' to make it use tab [Default: spaces].")
|
||||
|
||||
|
||||
|
||||
parser.add_option_group(group)
|
||||
|
||||
(options, args) = parser.parse_args()
|
||||
|
|
@ -84,7 +83,7 @@ def line_key(line):
|
|||
def chrpos_to_n(lsplit):
|
||||
# Get chr, pos from line
|
||||
|
||||
chr_value, start_value = None, None # Init in case of error
|
||||
chr_value, start_value, chr_prefix = None, None, '' # Init in case of error
|
||||
try:
|
||||
split1 = lsplit[0].split(":") # Get chr:start-stop out of the 1st column.
|
||||
chr_value = split1[0].lower().strip()
|
||||
|
|
@ -95,6 +94,11 @@ def chrpos_to_n(lsplit):
|
|||
if len(split2) > 1:
|
||||
stop_value = split2[1].lower().strip()
|
||||
stop_n = long(stop_value)
|
||||
#Become chr_prefix aware
|
||||
if chr_value.count("chr"):
|
||||
chr_prefix = "chr"
|
||||
else:
|
||||
chr_prefix = ""
|
||||
except:
|
||||
sys.stderr.write("chrom: %s, start: %s. Couldn't parse line: %s \n" % (chr_value, start_value, line))
|
||||
raise
|
||||
|
|
@ -106,11 +110,11 @@ def chrpos_to_n(lsplit):
|
|||
a = 30 # Offset so that "random" chromosomes go last
|
||||
|
||||
if sequence_build == "UCSC":
|
||||
chr_value = chr_value.replace("chrm", "chr0")
|
||||
chr_value = chr_value.replace(chr_prefix+"m", chr_prefix+"0")
|
||||
else:
|
||||
chr_value = chr_value.replace("chrm", "chr25")
|
||||
chr_value = chr_value.replace(chr_prefix+"m", chr_prefix+"25")
|
||||
|
||||
chr_n = a + int(chr_value.replace("chrx", "chr23").replace("chry", "chr24").replace("chr","")) + 1
|
||||
chr_n = a + int(chr_value.replace(chr_prefix+"x", chr_prefix+"23").replace(chr_prefix+"y", chr_prefix+"24").replace(chr_prefix,"")) + 1
|
||||
|
||||
N = (chr_n * 10L**23) + (start_n * 10L**11) + stop_n # Combine chr, start, stop into a single numeric key for sorting
|
||||
|
||||
|
|
@ -372,7 +376,24 @@ for line in open(input_filename):
|
|||
line_fields[start_column] = str(start_int) # Change the original column in case keep_copy is True
|
||||
|
||||
chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int )
|
||||
|
||||
#@JAMES@
|
||||
#print(chrpos_value)
|
||||
#Become chr_prefix aware
|
||||
if chrpos_value.count("chr"):
|
||||
chr_prefix = "chr"
|
||||
else:
|
||||
chr_prefix = ""
|
||||
|
||||
if sequence_build == "UCSC" and chr_prefix == "chr":
|
||||
chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int )
|
||||
elif sequence_build == "UCSC" and chr_prefix != "chr":
|
||||
chrpos_value = "chr%s:%d" % ( line_fields[chr_column], start_int )
|
||||
elif sequence_build == "NCBI" and chr_prefix == "chr":
|
||||
chrpos_value = "%s:%d".replace("chr","") % ( line_fields[chr_column], start_int )
|
||||
elif sequence_build == "NCBI" and chr_prefix != "chr":
|
||||
chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int )
|
||||
|
||||
#/JAMES
|
||||
if stop_column:
|
||||
try: stop_int = long(line_fields[stop_column])
|
||||
except: error("Line #%d, Column %d: stop coordinate value '%s' is not an integer" % (counter, stop_column, line_fields[stop_column]))
|
||||
|
|
@ -485,6 +506,9 @@ for line in data_lines:
|
|||
else:
|
||||
output_file.write(line[3:] + "\n")
|
||||
else:
|
||||
#if sequence_build == "UCSC":
|
||||
# output_file.write("chr" + line + "\n")
|
||||
#else:
|
||||
output_file.write(line + "\n")
|
||||
|
||||
output_file.close()
|
||||
|
|
|
|||
Loading…
Reference in New Issue