import sys import os import re import traceback from optparse import OptionParser from IndentedHelpFormatterWithNL import * def error(msg): print("ERROR: %s\n" % msg) parser.print_help() sys.exit(-1) def warn(msg): print("WARNING: %s" % msg) def fatal(msg): print(msg) sys.exit(-1) def join_fields(fields): return OUTPUT_FORMAT_DELIMITER.join(fields) def split_line(line): if delimiter: return line.split(delimiter) else: return line.split() def line_key(line): return chrpos_to_n( split_line(line) ) # Computes an integer key for this line. These keys can be used to sort the lines by reference def chrpos_to_n(lsplit): # Get chr, pos from line chr_value, start_value = None, None # Init in case of error try: split1 = lsplit[0].split(":") # Get chr:start-stop out of the 1st column. chr_value = split1[0].lower().strip() split2 = split1[1].split("-") start_value = split2[0].lower().strip() except: sys.stderr.write("chrom: %s, start: %s. Couldn't parse line: %s \n" % (chr_value, start_value, line)) raise # Covert them to N a = 0 if chr_value.count("_random"): chr_value = chr_value.replace("_random", "") a = 24 # Offset so that "random" chromosomes go last chr_n = a + int(chr_value.replace("chrm", "chr0").replace("chrx", "chr23").replace("chry", "chr24").replace("chr","")) start_n = int(start_value) N = (chr_n * 10**11) + start_n #print("N: " + str(N) + " line: " + line) return N def is_valid_chrpos(line): try: # Compute the line key line_key(line) return True except Exception, e: #print(str(e)) return False # Init cmd-line args description = """ This script takes a text-based tabular INPUT-FILE, validates it, and does whatever is necessary to convert it into the format required by the GenomicAnnotator. The output format has: TODO write this """ parser = OptionParser( description=description, usage="usage: %prog [options] INPUT-FILE", formatter=IndentedHelpFormatterWithNL()) parser.add_option("-v", "--verbose", action="store_true", default=False, help="Verbose.") parser.add_option("-d", "--delimiter", help="The delimiter that separates values in a line of INPUT-FILE. Set to 'tab' to make it use tab [Default: spaces].") parser.add_option("-c", "--location-columns", metavar="COLUMNS", help="""The (1-based) column number(s) of the columns in INPUT-FILE that contain coordinates. \n For example, '-c 2,3' means column #2 and column #3 contain coordinate info. COLUMNS can be set to one, two, or three comma-separated numbers:\n 1 number means column1 is of the form 'choromosome:position' or 'chromosome:start-stop'\n 2 numbers means column1 = choromosome, column2 = position.\n 3 numbers means column1 = choromosome, column2 = start position, column3 = stop position.""") parser.add_option("-t", "--coordinate-type", dest="coordinates", metavar="COORD-TYPE", help="""Specifies the coordinate system of INPUT-FILE's chromosome/position column(s). COORD-TYPE can be:\n * ONE-BASED-HALF-OPEN 1-based half-open.\n * POSITIONAL same as ONE-BASED-HALF-OPEN * ZERO-BASED-HALF-OPEN 0-based half-open.\n * OFFSET same as ZERO-BASED-HALF-OPEN.\n * DONT-CHANGE coordinates will be left as-is.\n Note: This setting is used to convert all coordinates into 1-based half-open for the output.""") #parser.add_option("-i", "--include-columns", dest="include_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of all columns to include in the outptut file. Any columns not in this list will be discarded.") #parser.add_option("-e", "--exclude-columns", dest="exclude_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of the columns to include in the outptut file. Any columns not in this list will be discarded.") parser.add_option("-o", "--output-filename", help="Output file path [Default: %default]", default="stdout") (options, args) = parser.parse_args() #print(args) # List of positional args. #print(options.output_filename) #print(options.coordinates) # required #print(options.location_columns) #print(options.delimiter) #print(options.verbose) # Validate and process cmd-line args verbose = options.verbose if verbose: print("%s v0.9" % sys.argv[0]) delimiter = options.delimiter if delimiter and delimiter.lower() == "tab": delimiter = "\t" if len(args) < 1 or not os.access(args[0], os.R_OK): error("Requires a valid INPUT-FILE") input_filename = args[0] if options.coordinates == "POSITIONAL" or options.coordinates == "ONE-BASED-HALF-OPEN" or options.coordinates == "DONT-CHANGE": coords_type = 1 elif options.coordinates == "OFFSET" or options.coordinates == "ZERO-BASED-HALF-OPEN": coords_type = 0 else: if not options.coordinates: error("-t arg must be specified") else: error("Invalid -t value: %s" % str(options.coordinates)) if not options.location_columns: error("-c arg must be specified") loc_columns = options.location_columns.split(",") if len(loc_columns) < 1 or len(loc_columns) > 3: error("-c COLUMNS must specify a comma-separated list of between 1 and 3 numbers.") #if verbose: # print("Parsed -c: " + str(loc_columns)) try: chr_column = int(loc_columns[0]) - 1 start_column = None stop_column = None pos_columns_sorted = [chr_column] if len(loc_columns) > 1: start_column = int(loc_columns[1]) - 1 pos_columns_sorted += [start_column] if len(loc_columns) > 2: stop_column = int(loc_columns[2]) - 1 pos_columns_sorted += [stop_column] pos_columns_sorted.sort(reverse=True) except: error("-c COLUMNS - all elements in the comma-separated list must be integers.") if (chr_column and chr_column < 0) or (start_column and start_column < 0) or (stop_column and stop_column < 0): error("-c COLUMNS - all elements in the comma-separated list must be >= 1") output_filename = options.output_filename if output_filename and output_filename != "stdout" and output_filename != "-" and os.access(output_filename, os.F_OK) and not os.access(output_filename, os.W_OK): error("Unable to write to: %s" % str(options.output_filename)) if verbose: print(" Input file: " + input_filename) print(" Output file: " + output_filename) # Commence processing counter = 0 skipped_lines_counter = 0 header_line_found = False header_fields = [] prepend_lines = [] data_lines = [] previous_n = -1 # Checks whether data is in order need_to_sort = False OUTPUT_FORMAT_DELIMITER = "\t" for line in open(input_filename): line = line.strip() counter+=1 if not header_line_found: if line.startswith("#") or line == "": prepend_lines += [line] else: header_line_found = True header_fields = split_line(line) # Remove all the existing positional columns, and make a new 1st column: 'chrpos' for c in pos_columns_sorted: if c >= len(header_fields): error("Found only %d headers. -c arg is out of range." % len(header_fields) ) header_fields.pop(c) header_fields.insert(0, "chrpos") if len(header_fields) < 2: error("Header appears to have only 1 column in it.") if verbose: print("Found header containing %d columns: [%s]. Changed it to: [%s]." % (len(split_line(line)), " ".join(split_line(line)), " ".join(header_fields))) else: # This is a data line line_fields = split_line(line) # Parse out the chrpos and make it the 1st column if its not already chrpos_value = "" if start_column: # There is more than 1 column of position info in this line # TODO error check line_fields[chr_column] somehow. try: start_int = int(line_fields[start_column]) except: error("Line #%d, Column %d: start coordinate value '%s' is not an integer." % (counter, start_column, line_fields[start_column])) if coords_type == 0: start_int += 1 # Convert to 1-based coords chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int ) if stop_column: try: stop_int = int(line_fields[stop_column]) except: error("Line #%d, Column %d: stop coordinate value '%s' is not an integer" % (counter, stop_column, line_fields[stop_column])) #if coords_type == 0: # stop_int += 1 # Converting to 1-based inclusive, so don't need to add 1 here after all if stop_int != start_int: # If they are equal, chr1:x is the same as chr1:x-x chrpos_value += "-%d" % stop_int # Remove these old position columns and make the valid chrpos column the 1st for c in pos_columns_sorted: line_fields.pop(c) line_fields.insert(0, chrpos_value) else: # There is only 1 column of position info in this line if not re.match(".+\:\d+([-]\d+)?", line_fields[chr_column]): error("Line #%d: Invalid chrpos [%s] in column %d" % (counter, line_fields[chr_column], chr_column )) # Move it to be line 1 chrpos_value = line_fields.pop(chr_column) line_fields.insert(0, chrpos_value) # Validate if len(line_fields) < len(header_fields): warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. The missing fields will be treated as empty." % (counter, len(line_fields), " ".join(line_fields), len(header_fields), " ".join(header_fields), )) while len(line_fields) < len(header_fields): line_fields += [OUTPUT_FORMAT_DELIMITER + ""] # Append '' as filler. TODO - make this behavior a cmd-line switchable elif len(line_fields) > len(header_fields): warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. Skippping..." % (counter, len(line_fields), " ".join(line_fields), len(header_fields), " ".join(header_fields), )) continue try: n = chrpos_to_n(line_fields) if not need_to_sort and n < previous_n: need_to_sort = True warn("Line %d is out of order. Will need to sort all lines." % counter) previous_n = n except Exception, e: warn("Couldn't parse line: " + " ".join(line_fields) + ". " +str(e) + ". Skipping...") if verbose: traceback.print_exc() skipped_lines_counter += 1 continue data_lines += [ join_fields(line_fields) ] if verbose and skipped_lines_counter: print("Skipped %d / %d lines. (%f%%)" % (skipped_lines_counter, counter, skipped_lines_counter/float(counter))) if need_to_sort: if verbose: print("Sorting %d lines..." % len(data_lines)) data_lines.sort(key=line_key) if verbose: print("Writing data to: " + output_filename) # Write output file if output_filename == "stdout" or output_filename == "-": output_file = sys.stdout else: output_file = open(output_filename, "w+") for line in prepend_lines + [ join_fields(header_fields) ] + data_lines: output_file.write(line + "\n") output_file.close()