From 2183f10a1df457239e12936e82537a8b958aa578 Mon Sep 17 00:00:00 2001 From: weisburd Date: Tue, 13 Apr 2010 13:35:10 +0000 Subject: [PATCH] Script for validating and converting text files into the tabular format required for GenomicAnnotator -B inputs git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3156 348d0f76-0448-11de-a6fe-93d51630548a --- .../IndentedHelpFormatterWithNL.py | 62 ++++ .../TableToAnnotatorRod.py | 306 ++++++++++++++++++ 2 files changed, 368 insertions(+) create mode 100755 python/genomicAnnotatorScripts/IndentedHelpFormatterWithNL.py create mode 100755 python/genomicAnnotatorScripts/TableToAnnotatorRod.py diff --git a/python/genomicAnnotatorScripts/IndentedHelpFormatterWithNL.py b/python/genomicAnnotatorScripts/IndentedHelpFormatterWithNL.py new file mode 100755 index 000000000..8d49d7592 --- /dev/null +++ b/python/genomicAnnotatorScripts/IndentedHelpFormatterWithNL.py @@ -0,0 +1,62 @@ + +from optparse import IndentedHelpFormatter +import textwrap + + +# Taken from http://groups.google.com/group/comp.lang.python/browse_frm/thread/6df6e6b541a15bc2 +# This code makes optparse keep line-breaks in help strings. +class IndentedHelpFormatterWithNL(IndentedHelpFormatter): + def format_description(self, description): + if not description: return "" + desc_width = self.width - self.current_indent + indent = " "*self.current_indent + + # the above is still the same + bits = description.split('\n') + formatted_bits = [ + textwrap.fill(bit, + desc_width, + initial_indent=indent, + subsequent_indent=indent) + for bit in bits] + result = "\n".join(formatted_bits) + "\n" + return result + def format_option(self, option): + # The help for each option consists of two parts: + # * the opt strings and metavars + # eg. ("-x", or "-fFILENAME, --file=FILENAME") + # * the user-supplied help string + # eg. ("turn on expert mode", "read data from FILENAME") + # + # If possible, we write both of these on the same line: + # -x turn on expert mode + # + # But if the opt string list is too long, we put the help + # string on a second line, indented to the same column it would + # start in if it fit on the first line. + # -fFILENAME, --file=FILENAME + # read data from FILENAME + result = [] + opts = self.option_strings[option] + opt_width = self.help_position - self.current_indent - 2 + if len(opts) > opt_width: + opts = "%*s%s\n" % (self.current_indent, "", opts) + indent_first = self.help_position + else: # start help on same line as opts + opts = "%*s%-*s " % (self.current_indent, "", opt_width, opts) + indent_first = 0 + result.append(opts) + if option.help: + help_text = self.expand_default(option) +# Everything is the same up through here + help_lines = [] + for para in help_text.split("\n"): + help_lines.extend(textwrap.wrap(para, self.help_width)) +# Everything is the same after here + result.append("%*s%s\n" % ( + indent_first, "", help_lines[0])) + result.extend(["%*s%s\n" % (self.help_position, "", line) + for line in help_lines[1:]]) + elif opts[-1] != "\n": + result.append("\n") + return "".join(result) diff --git a/python/genomicAnnotatorScripts/TableToAnnotatorRod.py b/python/genomicAnnotatorScripts/TableToAnnotatorRod.py new file mode 100755 index 000000000..f80964d98 --- /dev/null +++ b/python/genomicAnnotatorScripts/TableToAnnotatorRod.py @@ -0,0 +1,306 @@ +import sys +import os +import re +import traceback +from optparse import OptionParser +from IndentedHelpFormatterWithNL import * + +def error(msg): + print("ERROR: %s\n" % msg) + parser.print_help() + sys.exit(-1) + +def warn(msg): + print("WARNING: %s" % msg) + +def fatal(msg): + print(msg) + sys.exit(-1) + + +def join_fields(fields): + return OUTPUT_FORMAT_DELIMITER.join(fields) + + +def split_line(line): + if delimiter: + return line.split(delimiter) + else: + return line.split() + +def line_key(line): + return chrpos_to_n( split_line(line) ) + + +# Computes an integer key for this line. These keys can be used to sort the lines by reference +def chrpos_to_n(lsplit): + # Get chr, pos from line + + chr_value, start_value = None, None # Init in case of error + try: + split1 = lsplit[0].split(":") # Get chr:start-stop out of the 1st column. + chr_value = split1[0].lower().strip() + split2 = split1[1].split("-") + start_value = split2[0].lower().strip() + except: + sys.stderr.write("chrom: %s, start: %s. Couldn't parse line: %s \n" % (chr_value, start_value, line)) + raise + + # Covert them to N + a = 0 + if chr_value.count("_random"): + chr_value = chr_value.replace("_random", "") + a = 24 # Offset so that "random" chromosomes go last + + chr_n = a + int(chr_value.replace("chrm", "chr0").replace("chrx", "chr23").replace("chry", "chr24").replace("chr","")) + start_n = int(start_value) + + N = (chr_n * 10**11) + start_n + + #print("N: " + str(N) + " line: " + line) + return N + +def is_valid_chrpos(line): + try: + # Compute the line key + line_key(line) + return True + except Exception, e: + #print(str(e)) + return False + + + + +# Init cmd-line args +description = """ +This script takes a text-based tabular INPUT-FILE, validates it, and does whatever is necessary to convert it into the format required by the GenomicAnnotator. +The output format has: + TODO write this +""" +parser = OptionParser( description=description, usage="usage: %prog [options] INPUT-FILE", formatter=IndentedHelpFormatterWithNL()) +parser.add_option("-v", "--verbose", action="store_true", default=False, help="Verbose.") +parser.add_option("-d", "--delimiter", help="The delimiter that separates values in a line of INPUT-FILE. Set to 'tab' to make it use tab [Default: spaces].") +parser.add_option("-c", "--location-columns", metavar="COLUMNS", help="""The (1-based) column number(s) of the columns in INPUT-FILE that contain coordinates. \n +For example, '-c 2,3' means column #2 and column #3 contain coordinate info. COLUMNS can be set to one, two, or three comma-separated numbers:\n + 1 number means column1 is of the form 'choromosome:position' or 'chromosome:start-stop'\n + 2 numbers means column1 = choromosome, column2 = position.\n + 3 numbers means column1 = choromosome, column2 = start position, column3 = stop position.""") + +parser.add_option("-t", "--coordinate-type", dest="coordinates", metavar="COORD-TYPE", help="""Specifies the coordinate system of INPUT-FILE's chromosome/position column(s). COORD-TYPE can be:\n + * ONE-BASED-HALF-OPEN 1-based half-open.\n + * POSITIONAL same as ONE-BASED-HALF-OPEN + * ZERO-BASED-HALF-OPEN 0-based half-open.\n + * OFFSET same as ZERO-BASED-HALF-OPEN.\n + * DONT-CHANGE coordinates will be left as-is.\n +Note: This setting is used to convert all coordinates into 1-based half-open for the output.""") +#parser.add_option("-i", "--include-columns", dest="include_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of all columns to include in the outptut file. Any columns not in this list will be discarded.") +#parser.add_option("-e", "--exclude-columns", dest="exclude_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of the columns to include in the outptut file. Any columns not in this list will be discarded.") +parser.add_option("-o", "--output-filename", help="Output file path [Default: %default]", default="stdout") + + +(options, args) = parser.parse_args() + +#print(args) # List of positional args. +#print(options.output_filename) +#print(options.coordinates) # required +#print(options.location_columns) +#print(options.delimiter) +#print(options.verbose) + + +# Validate and process cmd-line args + +verbose = options.verbose + +if verbose: + print("%s v0.9" % sys.argv[0]) + +delimiter = options.delimiter +if delimiter and delimiter.lower() == "tab": + delimiter = "\t" + +if len(args) < 1 or not os.access(args[0], os.R_OK): + error("Requires a valid INPUT-FILE") +input_filename = args[0] + + +if options.coordinates == "POSITIONAL" or options.coordinates == "ONE-BASED-HALF-OPEN" or options.coordinates == "DONT-CHANGE": + coords_type = 1 +elif options.coordinates == "OFFSET" or options.coordinates == "ZERO-BASED-HALF-OPEN": + coords_type = 0 +else: + if not options.coordinates: + error("-t arg must be specified") + else: + error("Invalid -t value: %s" % str(options.coordinates)) + +if not options.location_columns: + error("-c arg must be specified") + +loc_columns = options.location_columns.split(",") +if len(loc_columns) < 1 or len(loc_columns) > 3: + error("-c COLUMNS must specify a comma-separated list of between 1 and 3 numbers.") + +#if verbose: +# print("Parsed -c: " + str(loc_columns)) + +try: + chr_column = int(loc_columns[0]) - 1 + start_column = None + stop_column = None + pos_columns_sorted = [chr_column] + if len(loc_columns) > 1: + start_column = int(loc_columns[1]) - 1 + pos_columns_sorted += [start_column] + + if len(loc_columns) > 2: + stop_column = int(loc_columns[2]) - 1 + pos_columns_sorted += [stop_column] + + pos_columns_sorted.sort(reverse=True) +except: + error("-c COLUMNS - all elements in the comma-separated list must be integers.") + +if (chr_column and chr_column < 0) or (start_column and start_column < 0) or (stop_column and stop_column < 0): + error("-c COLUMNS - all elements in the comma-separated list must be >= 1") + +output_filename = options.output_filename +if output_filename and output_filename != "stdout" and output_filename != "-" and os.access(output_filename, os.F_OK) and not os.access(output_filename, os.W_OK): + error("Unable to write to: %s" % str(options.output_filename)) + +if verbose: + print(" Input file: " + input_filename) + print(" Output file: " + output_filename) + + + +# Commence processing +counter = 0 +skipped_lines_counter = 0 +header_line_found = False +header_fields = [] +prepend_lines = [] +data_lines = [] +previous_n = -1 # Checks whether data is in order +need_to_sort = False + +OUTPUT_FORMAT_DELIMITER = "\t" + +for line in open(input_filename): + line = line.strip() + counter+=1 + if not header_line_found: + if line.startswith("#") or line == "": + prepend_lines += [line] + else: + header_line_found = True + header_fields = split_line(line) + + # Remove all the existing positional columns, and make a new 1st column: 'chrpos' + for c in pos_columns_sorted: + if c >= len(header_fields): + error("Found only %d headers. -c arg is out of range." % len(header_fields) ) + + header_fields.pop(c) + header_fields.insert(0, "chrpos") + + if len(header_fields) < 2: + error("Header appears to have only 1 column in it.") + + if verbose: + print("Found header containing %d columns: [%s]. Changed it to: [%s]." % (len(split_line(line)), " ".join(split_line(line)), " ".join(header_fields))) + + else: + # This is a data line + line_fields = split_line(line) + # Parse out the chrpos and make it the 1st column if its not already + chrpos_value = "" + if start_column: + # There is more than 1 column of position info in this line + # TODO error check line_fields[chr_column] somehow. + try: start_int = int(line_fields[start_column]) + except: error("Line #%d, Column %d: start coordinate value '%s' is not an integer." % (counter, start_column, line_fields[start_column])) + + if coords_type == 0: + start_int += 1 # Convert to 1-based coords + + chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int ) + + if stop_column: + try: stop_int = int(line_fields[stop_column]) + except: error("Line #%d, Column %d: stop coordinate value '%s' is not an integer" % (counter, stop_column, line_fields[stop_column])) + + #if coords_type == 0: + # stop_int += 1 + # Converting to 1-based inclusive, so don't need to add 1 here after all + + if stop_int != start_int: # If they are equal, chr1:x is the same as chr1:x-x + chrpos_value += "-%d" % stop_int + + # Remove these old position columns and make the valid chrpos column the 1st + for c in pos_columns_sorted: + line_fields.pop(c) + + line_fields.insert(0, chrpos_value) + + else: + # There is only 1 column of position info in this line + if not re.match(".+\:\d+([-]\d+)?", line_fields[chr_column]): + error("Line #%d: Invalid chrpos [%s] in column %d" % (counter, line_fields[chr_column], chr_column )) + + # Move it to be line 1 + chrpos_value = line_fields.pop(chr_column) + line_fields.insert(0, chrpos_value) + + # Validate + + if len(line_fields) < len(header_fields): + warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. The missing fields will be treated as empty." % (counter, len(line_fields), " ".join(line_fields), len(header_fields), " ".join(header_fields), )) + while len(line_fields) < len(header_fields): + line_fields += [OUTPUT_FORMAT_DELIMITER + ""] # Append '' as filler. TODO - make this behavior a cmd-line switchable + + elif len(line_fields) > len(header_fields): + warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. Skippping..." % (counter, len(line_fields), " ".join(line_fields), len(header_fields), " ".join(header_fields), )) + continue + + + try: + n = chrpos_to_n(line_fields) + if not need_to_sort and n < previous_n: + need_to_sort = True + warn("Line %d is out of order. Will need to sort all lines." % counter) + previous_n = n + except Exception, e: + warn("Couldn't parse line: " + " ".join(line_fields) + ". " +str(e) + ". Skipping...") + if verbose: traceback.print_exc() + skipped_lines_counter += 1 + continue + + + + + data_lines += [ join_fields(line_fields) ] + +if verbose and skipped_lines_counter: + print("Skipped %d / %d lines. (%f%%)" % (skipped_lines_counter, counter, skipped_lines_counter/float(counter))) +if need_to_sort: + if verbose: + print("Sorting %d lines..." % len(data_lines)) + + data_lines.sort(key=line_key) + + +if verbose: + print("Writing data to: " + output_filename) + +# Write output file +if output_filename == "stdout" or output_filename == "-": + output_file = sys.stdout +else: + output_file = open(output_filename, "w+") + +for line in prepend_lines + [ join_fields(header_fields) ] + data_lines: + output_file.write(line + "\n") + +output_file.close()