Script for validating and converting text files into the tabular format required for GenomicAnnotator -B inputs
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3156 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e413882302
commit
2183f10a1d
|
|
@ -0,0 +1,62 @@
|
||||||
|
|
||||||
|
from optparse import IndentedHelpFormatter
|
||||||
|
import textwrap
|
||||||
|
|
||||||
|
|
||||||
|
# Taken from http://groups.google.com/group/comp.lang.python/browse_frm/thread/6df6e6b541a15bc2
|
||||||
|
# This code makes optparse keep line-breaks in help strings.
|
||||||
|
class IndentedHelpFormatterWithNL(IndentedHelpFormatter):
|
||||||
|
def format_description(self, description):
|
||||||
|
if not description: return ""
|
||||||
|
desc_width = self.width - self.current_indent
|
||||||
|
indent = " "*self.current_indent
|
||||||
|
|
||||||
|
# the above is still the same
|
||||||
|
bits = description.split('\n')
|
||||||
|
formatted_bits = [
|
||||||
|
textwrap.fill(bit,
|
||||||
|
desc_width,
|
||||||
|
initial_indent=indent,
|
||||||
|
subsequent_indent=indent)
|
||||||
|
for bit in bits]
|
||||||
|
result = "\n".join(formatted_bits) + "\n"
|
||||||
|
return result
|
||||||
|
def format_option(self, option):
|
||||||
|
# The help for each option consists of two parts:
|
||||||
|
# * the opt strings and metavars
|
||||||
|
# eg. ("-x", or "-fFILENAME, --file=FILENAME")
|
||||||
|
# * the user-supplied help string
|
||||||
|
# eg. ("turn on expert mode", "read data from FILENAME")
|
||||||
|
#
|
||||||
|
# If possible, we write both of these on the same line:
|
||||||
|
# -x turn on expert mode
|
||||||
|
#
|
||||||
|
# But if the opt string list is too long, we put the help
|
||||||
|
# string on a second line, indented to the same column it would
|
||||||
|
# start in if it fit on the first line.
|
||||||
|
# -fFILENAME, --file=FILENAME
|
||||||
|
# read data from FILENAME
|
||||||
|
result = []
|
||||||
|
opts = self.option_strings[option]
|
||||||
|
opt_width = self.help_position - self.current_indent - 2
|
||||||
|
if len(opts) > opt_width:
|
||||||
|
opts = "%*s%s\n" % (self.current_indent, "", opts)
|
||||||
|
indent_first = self.help_position
|
||||||
|
else: # start help on same line as opts
|
||||||
|
opts = "%*s%-*s " % (self.current_indent, "", opt_width, opts)
|
||||||
|
indent_first = 0
|
||||||
|
result.append(opts)
|
||||||
|
if option.help:
|
||||||
|
help_text = self.expand_default(option)
|
||||||
|
# Everything is the same up through here
|
||||||
|
help_lines = []
|
||||||
|
for para in help_text.split("\n"):
|
||||||
|
help_lines.extend(textwrap.wrap(para, self.help_width))
|
||||||
|
# Everything is the same after here
|
||||||
|
result.append("%*s%s\n" % (
|
||||||
|
indent_first, "", help_lines[0]))
|
||||||
|
result.extend(["%*s%s\n" % (self.help_position, "", line)
|
||||||
|
for line in help_lines[1:]])
|
||||||
|
elif opts[-1] != "\n":
|
||||||
|
result.append("\n")
|
||||||
|
return "".join(result)
|
||||||
|
|
@ -0,0 +1,306 @@
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import traceback
|
||||||
|
from optparse import OptionParser
|
||||||
|
from IndentedHelpFormatterWithNL import *
|
||||||
|
|
||||||
|
def error(msg):
|
||||||
|
print("ERROR: %s\n" % msg)
|
||||||
|
parser.print_help()
|
||||||
|
sys.exit(-1)
|
||||||
|
|
||||||
|
def warn(msg):
|
||||||
|
print("WARNING: %s" % msg)
|
||||||
|
|
||||||
|
def fatal(msg):
|
||||||
|
print(msg)
|
||||||
|
sys.exit(-1)
|
||||||
|
|
||||||
|
|
||||||
|
def join_fields(fields):
|
||||||
|
return OUTPUT_FORMAT_DELIMITER.join(fields)
|
||||||
|
|
||||||
|
|
||||||
|
def split_line(line):
|
||||||
|
if delimiter:
|
||||||
|
return line.split(delimiter)
|
||||||
|
else:
|
||||||
|
return line.split()
|
||||||
|
|
||||||
|
def line_key(line):
|
||||||
|
return chrpos_to_n( split_line(line) )
|
||||||
|
|
||||||
|
|
||||||
|
# Computes an integer key for this line. These keys can be used to sort the lines by reference
|
||||||
|
def chrpos_to_n(lsplit):
|
||||||
|
# Get chr, pos from line
|
||||||
|
|
||||||
|
chr_value, start_value = None, None # Init in case of error
|
||||||
|
try:
|
||||||
|
split1 = lsplit[0].split(":") # Get chr:start-stop out of the 1st column.
|
||||||
|
chr_value = split1[0].lower().strip()
|
||||||
|
split2 = split1[1].split("-")
|
||||||
|
start_value = split2[0].lower().strip()
|
||||||
|
except:
|
||||||
|
sys.stderr.write("chrom: %s, start: %s. Couldn't parse line: %s \n" % (chr_value, start_value, line))
|
||||||
|
raise
|
||||||
|
|
||||||
|
# Covert them to N
|
||||||
|
a = 0
|
||||||
|
if chr_value.count("_random"):
|
||||||
|
chr_value = chr_value.replace("_random", "")
|
||||||
|
a = 24 # Offset so that "random" chromosomes go last
|
||||||
|
|
||||||
|
chr_n = a + int(chr_value.replace("chrm", "chr0").replace("chrx", "chr23").replace("chry", "chr24").replace("chr",""))
|
||||||
|
start_n = int(start_value)
|
||||||
|
|
||||||
|
N = (chr_n * 10**11) + start_n
|
||||||
|
|
||||||
|
#print("N: " + str(N) + " line: " + line)
|
||||||
|
return N
|
||||||
|
|
||||||
|
def is_valid_chrpos(line):
|
||||||
|
try:
|
||||||
|
# Compute the line key
|
||||||
|
line_key(line)
|
||||||
|
return True
|
||||||
|
except Exception, e:
|
||||||
|
#print(str(e))
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Init cmd-line args
|
||||||
|
description = """
|
||||||
|
This script takes a text-based tabular INPUT-FILE, validates it, and does whatever is necessary to convert it into the format required by the GenomicAnnotator.
|
||||||
|
The output format has:
|
||||||
|
TODO write this
|
||||||
|
"""
|
||||||
|
parser = OptionParser( description=description, usage="usage: %prog [options] INPUT-FILE", formatter=IndentedHelpFormatterWithNL())
|
||||||
|
parser.add_option("-v", "--verbose", action="store_true", default=False, help="Verbose.")
|
||||||
|
parser.add_option("-d", "--delimiter", help="The delimiter that separates values in a line of INPUT-FILE. Set to 'tab' to make it use tab [Default: spaces].")
|
||||||
|
parser.add_option("-c", "--location-columns", metavar="COLUMNS", help="""The (1-based) column number(s) of the columns in INPUT-FILE that contain coordinates. \n
|
||||||
|
For example, '-c 2,3' means column #2 and column #3 contain coordinate info. COLUMNS can be set to one, two, or three comma-separated numbers:\n
|
||||||
|
1 number means column1 is of the form 'choromosome:position' or 'chromosome:start-stop'\n
|
||||||
|
2 numbers means column1 = choromosome, column2 = position.\n
|
||||||
|
3 numbers means column1 = choromosome, column2 = start position, column3 = stop position.""")
|
||||||
|
|
||||||
|
parser.add_option("-t", "--coordinate-type", dest="coordinates", metavar="COORD-TYPE", help="""Specifies the coordinate system of INPUT-FILE's chromosome/position column(s). COORD-TYPE can be:\n
|
||||||
|
* ONE-BASED-HALF-OPEN 1-based half-open.\n
|
||||||
|
* POSITIONAL same as ONE-BASED-HALF-OPEN
|
||||||
|
* ZERO-BASED-HALF-OPEN 0-based half-open.\n
|
||||||
|
* OFFSET same as ZERO-BASED-HALF-OPEN.\n
|
||||||
|
* DONT-CHANGE coordinates will be left as-is.\n
|
||||||
|
Note: This setting is used to convert all coordinates into 1-based half-open for the output.""")
|
||||||
|
#parser.add_option("-i", "--include-columns", dest="include_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of all columns to include in the outptut file. Any columns not in this list will be discarded.")
|
||||||
|
#parser.add_option("-e", "--exclude-columns", dest="exclude_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of the columns to include in the outptut file. Any columns not in this list will be discarded.")
|
||||||
|
parser.add_option("-o", "--output-filename", help="Output file path [Default: %default]", default="stdout")
|
||||||
|
|
||||||
|
|
||||||
|
(options, args) = parser.parse_args()
|
||||||
|
|
||||||
|
#print(args) # List of positional args.
|
||||||
|
#print(options.output_filename)
|
||||||
|
#print(options.coordinates) # required
|
||||||
|
#print(options.location_columns)
|
||||||
|
#print(options.delimiter)
|
||||||
|
#print(options.verbose)
|
||||||
|
|
||||||
|
|
||||||
|
# Validate and process cmd-line args
|
||||||
|
|
||||||
|
verbose = options.verbose
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("%s v0.9" % sys.argv[0])
|
||||||
|
|
||||||
|
delimiter = options.delimiter
|
||||||
|
if delimiter and delimiter.lower() == "tab":
|
||||||
|
delimiter = "\t"
|
||||||
|
|
||||||
|
if len(args) < 1 or not os.access(args[0], os.R_OK):
|
||||||
|
error("Requires a valid INPUT-FILE")
|
||||||
|
input_filename = args[0]
|
||||||
|
|
||||||
|
|
||||||
|
if options.coordinates == "POSITIONAL" or options.coordinates == "ONE-BASED-HALF-OPEN" or options.coordinates == "DONT-CHANGE":
|
||||||
|
coords_type = 1
|
||||||
|
elif options.coordinates == "OFFSET" or options.coordinates == "ZERO-BASED-HALF-OPEN":
|
||||||
|
coords_type = 0
|
||||||
|
else:
|
||||||
|
if not options.coordinates:
|
||||||
|
error("-t arg must be specified")
|
||||||
|
else:
|
||||||
|
error("Invalid -t value: %s" % str(options.coordinates))
|
||||||
|
|
||||||
|
if not options.location_columns:
|
||||||
|
error("-c arg must be specified")
|
||||||
|
|
||||||
|
loc_columns = options.location_columns.split(",")
|
||||||
|
if len(loc_columns) < 1 or len(loc_columns) > 3:
|
||||||
|
error("-c COLUMNS must specify a comma-separated list of between 1 and 3 numbers.")
|
||||||
|
|
||||||
|
#if verbose:
|
||||||
|
# print("Parsed -c: " + str(loc_columns))
|
||||||
|
|
||||||
|
try:
|
||||||
|
chr_column = int(loc_columns[0]) - 1
|
||||||
|
start_column = None
|
||||||
|
stop_column = None
|
||||||
|
pos_columns_sorted = [chr_column]
|
||||||
|
if len(loc_columns) > 1:
|
||||||
|
start_column = int(loc_columns[1]) - 1
|
||||||
|
pos_columns_sorted += [start_column]
|
||||||
|
|
||||||
|
if len(loc_columns) > 2:
|
||||||
|
stop_column = int(loc_columns[2]) - 1
|
||||||
|
pos_columns_sorted += [stop_column]
|
||||||
|
|
||||||
|
pos_columns_sorted.sort(reverse=True)
|
||||||
|
except:
|
||||||
|
error("-c COLUMNS - all elements in the comma-separated list must be integers.")
|
||||||
|
|
||||||
|
if (chr_column and chr_column < 0) or (start_column and start_column < 0) or (stop_column and stop_column < 0):
|
||||||
|
error("-c COLUMNS - all elements in the comma-separated list must be >= 1")
|
||||||
|
|
||||||
|
output_filename = options.output_filename
|
||||||
|
if output_filename and output_filename != "stdout" and output_filename != "-" and os.access(output_filename, os.F_OK) and not os.access(output_filename, os.W_OK):
|
||||||
|
error("Unable to write to: %s" % str(options.output_filename))
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" Input file: " + input_filename)
|
||||||
|
print(" Output file: " + output_filename)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Commence processing
|
||||||
|
counter = 0
|
||||||
|
skipped_lines_counter = 0
|
||||||
|
header_line_found = False
|
||||||
|
header_fields = []
|
||||||
|
prepend_lines = []
|
||||||
|
data_lines = []
|
||||||
|
previous_n = -1 # Checks whether data is in order
|
||||||
|
need_to_sort = False
|
||||||
|
|
||||||
|
OUTPUT_FORMAT_DELIMITER = "\t"
|
||||||
|
|
||||||
|
for line in open(input_filename):
|
||||||
|
line = line.strip()
|
||||||
|
counter+=1
|
||||||
|
if not header_line_found:
|
||||||
|
if line.startswith("#") or line == "":
|
||||||
|
prepend_lines += [line]
|
||||||
|
else:
|
||||||
|
header_line_found = True
|
||||||
|
header_fields = split_line(line)
|
||||||
|
|
||||||
|
# Remove all the existing positional columns, and make a new 1st column: 'chrpos'
|
||||||
|
for c in pos_columns_sorted:
|
||||||
|
if c >= len(header_fields):
|
||||||
|
error("Found only %d headers. -c arg is out of range." % len(header_fields) )
|
||||||
|
|
||||||
|
header_fields.pop(c)
|
||||||
|
header_fields.insert(0, "chrpos")
|
||||||
|
|
||||||
|
if len(header_fields) < 2:
|
||||||
|
error("Header appears to have only 1 column in it.")
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Found header containing %d columns: [%s]. Changed it to: [%s]." % (len(split_line(line)), " ".join(split_line(line)), " ".join(header_fields)))
|
||||||
|
|
||||||
|
else:
|
||||||
|
# This is a data line
|
||||||
|
line_fields = split_line(line)
|
||||||
|
# Parse out the chrpos and make it the 1st column if its not already
|
||||||
|
chrpos_value = ""
|
||||||
|
if start_column:
|
||||||
|
# There is more than 1 column of position info in this line
|
||||||
|
# TODO error check line_fields[chr_column] somehow.
|
||||||
|
try: start_int = int(line_fields[start_column])
|
||||||
|
except: error("Line #%d, Column %d: start coordinate value '%s' is not an integer." % (counter, start_column, line_fields[start_column]))
|
||||||
|
|
||||||
|
if coords_type == 0:
|
||||||
|
start_int += 1 # Convert to 1-based coords
|
||||||
|
|
||||||
|
chrpos_value = "%s:%d" % ( line_fields[chr_column], start_int )
|
||||||
|
|
||||||
|
if stop_column:
|
||||||
|
try: stop_int = int(line_fields[stop_column])
|
||||||
|
except: error("Line #%d, Column %d: stop coordinate value '%s' is not an integer" % (counter, stop_column, line_fields[stop_column]))
|
||||||
|
|
||||||
|
#if coords_type == 0:
|
||||||
|
# stop_int += 1
|
||||||
|
# Converting to 1-based inclusive, so don't need to add 1 here after all
|
||||||
|
|
||||||
|
if stop_int != start_int: # If they are equal, chr1:x is the same as chr1:x-x
|
||||||
|
chrpos_value += "-%d" % stop_int
|
||||||
|
|
||||||
|
# Remove these old position columns and make the valid chrpos column the 1st
|
||||||
|
for c in pos_columns_sorted:
|
||||||
|
line_fields.pop(c)
|
||||||
|
|
||||||
|
line_fields.insert(0, chrpos_value)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# There is only 1 column of position info in this line
|
||||||
|
if not re.match(".+\:\d+([-]\d+)?", line_fields[chr_column]):
|
||||||
|
error("Line #%d: Invalid chrpos [%s] in column %d" % (counter, line_fields[chr_column], chr_column ))
|
||||||
|
|
||||||
|
# Move it to be line 1
|
||||||
|
chrpos_value = line_fields.pop(chr_column)
|
||||||
|
line_fields.insert(0, chrpos_value)
|
||||||
|
|
||||||
|
# Validate
|
||||||
|
|
||||||
|
if len(line_fields) < len(header_fields):
|
||||||
|
warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. The missing fields will be treated as empty." % (counter, len(line_fields), " ".join(line_fields), len(header_fields), " ".join(header_fields), ))
|
||||||
|
while len(line_fields) < len(header_fields):
|
||||||
|
line_fields += [OUTPUT_FORMAT_DELIMITER + ""] # Append '' as filler. TODO - make this behavior a cmd-line switchable
|
||||||
|
|
||||||
|
elif len(line_fields) > len(header_fields):
|
||||||
|
warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. Skippping..." % (counter, len(line_fields), " ".join(line_fields), len(header_fields), " ".join(header_fields), ))
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
n = chrpos_to_n(line_fields)
|
||||||
|
if not need_to_sort and n < previous_n:
|
||||||
|
need_to_sort = True
|
||||||
|
warn("Line %d is out of order. Will need to sort all lines." % counter)
|
||||||
|
previous_n = n
|
||||||
|
except Exception, e:
|
||||||
|
warn("Couldn't parse line: " + " ".join(line_fields) + ". " +str(e) + ". Skipping...")
|
||||||
|
if verbose: traceback.print_exc()
|
||||||
|
skipped_lines_counter += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
data_lines += [ join_fields(line_fields) ]
|
||||||
|
|
||||||
|
if verbose and skipped_lines_counter:
|
||||||
|
print("Skipped %d / %d lines. (%f%%)" % (skipped_lines_counter, counter, skipped_lines_counter/float(counter)))
|
||||||
|
if need_to_sort:
|
||||||
|
if verbose:
|
||||||
|
print("Sorting %d lines..." % len(data_lines))
|
||||||
|
|
||||||
|
data_lines.sort(key=line_key)
|
||||||
|
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Writing data to: " + output_filename)
|
||||||
|
|
||||||
|
# Write output file
|
||||||
|
if output_filename == "stdout" or output_filename == "-":
|
||||||
|
output_file = sys.stdout
|
||||||
|
else:
|
||||||
|
output_file = open(output_filename, "w+")
|
||||||
|
|
||||||
|
for line in prepend_lines + [ join_fields(header_fields) ] + data_lines:
|
||||||
|
output_file.write(line + "\n")
|
||||||
|
|
||||||
|
output_file.close()
|
||||||
Loading…
Reference in New Issue