parser.add_option("-l","--location-columns",metavar="COLUMNS",help="""The (1-based) column number(s) of the columns in INPUT-FILE that contain coordinates. \n
Forexample,'-l 2,3'meanscolumn#2 and column #3 contain coordinate info. COLUMNS can be set to one, two, or three comma-separated numbers:\n
parser.add_option("-c","--input-coords",dest="coordinates",metavar="COORD-TYPE",help="""Specifies the coordinate system of INPUT-FILE's chromosome/position column(s). COORD-TYPE can be:\n
# --- Output build style ------------------------------------------------------
parser.add_option(
    "-t",
    "--output-style",
    metavar="BUILD",
    dest="sequence_build",
    help="Sets the output file's reference build type to either UCSC or NCBI. This should be set based on what reference file will be used when running the GenomicAnnotator. UCSC builds can be specified as either 'hgXX' (eg. hg18) or 'UCSC'. NCBI builds can be specified as 'bXX' (eg. b36) or 'NCBI'. The build type determines chromosome order and naming convention (eg. 'chr1' or '1').",
)

# --- Disabled options (kept for reference, not registered) -------------------
# NOTE(review): the help text of the disabled -e/--exclude-columns option reads
# like a copy of the -i text ("columns to include"); fix it before re-enabling.
#parser.add_option("-i", "--include-columns", dest="include_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of all columns to include in the outptut file. Any columns not in this list will be discarded.")
#parser.add_option("-e", "--exclude-columns", dest="exclude_fields", metavar="COLUMNS", help="A comma-separated listing of (1-based) column numbers of the columns to include in the outptut file. Any columns not in this list will be discarded.")

# --- Haplotype column selectors (each takes a 1-based column number) ---------
group.add_option(
    "-r",
    "--haplotype-reference-column",
    metavar="COLUMN",
    dest="haplotype_reference_column",
    help="1-based column number of the column to use as haplotypeReference. Specifying this will rename the column to 'haplotypeReference' in the header.",
)
group.add_option(
    "-a",
    "--haplotype-alternate-column",
    metavar="COLUMN",
    dest="haplotype_alternate_column",
    help="1-based column number of the column to use as haplotypeAlternate. Specifying this will rename the column to 'haplotypeAlternate' in the header.",
)
group.add_option(
    "-s",
    "--haplotype-strand-column",
    metavar="COLUMN",
    dest="haplotype_strand_column",
    help="1-based column number of the haplotypeStrand. Specifying this will rename the column to 'haplotypeStrand' in the header.",
)

# --- Boolean flag: stored as options.keep_copy, False unless -k is given -----
group.add_option(
    "-k",
    "--keep-original-columns",
    action="store_true",
    dest="keep_copy",
    default=False,
    help="This flag makes it so that the columns passed to -l, -r, -a, and -s args are not removed when their contents is used to generate the special columns (eg. 'chrpos', 'haplotypeReference', etc..).",
)

# --- Additional coordinate columns (comma-separated, 1-based) ----------------
group.add_option(
    "-m",
    "--other-start-columns",
    metavar="COLUMNS",
    dest="other_start_columns",
    help="Comma-separated list of 1 or more column numbers (1-based) representing other columns that contain start coordinates and need to be converted from the coordinate system specified by -c. For example, the refGene table has coordinates for cdsStart which need to be converted along with the chromosome, txStart, and txEnd columns.",
)
group.add_option(
    "-n",
    "--other-end-columns",
    metavar="COLUMNS",
    dest="other_end_columns",
    help="Comma-separated list of 1 or more column numbers (1-based) representing other columns that contain end coordinates and need to be converted from the coordinate system specified by -c",
)

# --- Input field delimiter (no explicit dest given) --------------------------
group.add_option(
    "-d",
    "--delimiter",
    help="The delimiter that separates values in a line of INPUT-FILE. Set to 'tab' to make it use tab [Default: spaces].",
)

# Attach the option group to the parser, then parse the command line.
parser.add_option_group(group)
options, args = parser.parse_args()
def error(msg):
    """Print an error message with a help hint, then terminate the program.

    Args:
        msg: Description of the problem. Any object is accepted; it is
            rendered with ``%s`` formatting.

    Raises:
        SystemExit: Always, via ``sys.exit(-1)``.
    """
    # Wrap msg in a 1-tuple so that a tuple-valued msg is formatted as a
    # single value instead of being unpacked as multiple format arguments.
    print("ERROR: %s. (Rerun with -h to print help info) \n" % (msg,))
    #parser.print_help()
    sys.exit(-1)
def warn(msg):
    """Print a non-fatal warning message to standard output.

    Args:
        msg: Description of the condition. Any object is accepted; it is
            rendered with ``%s`` formatting.
    """
    # Wrap msg in a 1-tuple so that a tuple-valued msg is formatted as a
    # single value instead of being unpacked as multiple format arguments.
    print("WARNING: %s" % (msg,))
def fatal(msg):
    """Print ``msg`` unchanged and terminate the program.

    Unlike ``error``, no prefix or help hint is added to the message.

    Args:
        msg: The text (or object) to print before exiting.

    Raises:
        SystemExit: Always, via ``sys.exit(-1)``.
    """
    print(msg)
    sys.exit(-1)
def join_fields(fields):
    """Join ``fields`` into one string using ``OUTPUT_FORMAT_DELIMITER``.

    Args:
        fields: Iterable of strings to join.

    Returns:
        The joined string.
    """
    # OUTPUT_FORMAT_DELIMITER is a module-level name defined outside this
    # function.
    return OUTPUT_FORMAT_DELIMITER.join(fields)
def split_line(line):
    """Split ``line`` into a list of fields.

    Uses the module-level ``delimiter`` when it is truthy; otherwise falls
    back to ``str.split()`` with no argument, which splits on runs of
    whitespace and drops leading/trailing whitespace.

    Args:
        line: The text line to split.

    Returns:
        List of field strings.
    """
    # NOTE(review): `delimiter` is set outside this function - presumably
    # derived from the -d/--delimiter option; confirm against the caller.
    if delimiter:
        return line.split(delimiter)
    return line.split()
def line_key(line):
    """Return the key for ``line``: ``chrpos_to_n`` applied to its fields.

    Args:
        line: A raw text line; it is split with ``split_line`` first.

    Returns:
        Whatever ``chrpos_to_n`` returns for the split fields.
    """
    return chrpos_to_n(split_line(line))
# Computes an integer key for this line. These keys can be used to sort the lines by reference
defchrpos_to_n(lsplit):
# Get chr, pos from line
chr_value,start_value=None,None# Init in case of error
try:
split1=lsplit[0].split(":")# Get chr:start-stop out of the 1st column.
warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. The missing fields will be treated as empty."%(counter,len(line_fields),"".join(line_fields),len(header_fields),"".join(header_fields),))
whilelen(line_fields)<len(header_fields):
line_fields+=[OUTPUT_FORMAT_DELIMITER+""]# Append '' as filler. TODO - make this behavior a cmd-line switchable
eliflen(line_fields)>len(header_fields):
warn("Line #%d: Has %d columns [%s] while header has %d columns [%s]. Skipping..."%(counter,len(line_fields),"".join(line_fields),len(header_fields),"".join(header_fields),))
continue
try:
n=chrpos_to_n(line_fields)
ifnotneed_to_sortandn<previous_n:
need_to_sort=True
warn("Line %d is out of order. Will need to sort all lines."%counter)