86 lines
3.3 KiB
Python
Executable File
86 lines
3.3 KiB
Python
Executable File
import sys
|
|
import os
|
|
import re
|
|
import traceback
|
|
from optparse import OptionParser, OptionGroup
|
|
from IndentedHelpFormatterWithNL import *
|
|
|
|
|
|
# Init cmd-line args
|
|
description = """
|
|
This script runs a command that concatenates all 50 per-chromosome files generated by the GenerateTranscriptToInfo.py script into one big file that can be directly used by the GenomicAnnotator.
|
|
"""
|
|
|
|
parser = OptionParser( description=description, usage="usage: %prog [options] ", formatter=IndentedHelpFormatterWithNL())
|
|
|
|
parser.add_option("-d", "--directory", metavar="DIR", dest="directory", help="Specifies the directory that contains the files to concatenate (eg. /humgen/gsa-hpprojects/GATK/data/Annotations/refseq/raw/)")
|
|
parser.add_option("-f", "--filename-prefix", metavar="PREFIX", dest="prefix", help="Filename prefix (eg. refGene)")
|
|
parser.add_option("-u", "--ucsc", dest="ucsc", action="store_true", default=False, help="Generate the output file for use with the NCBI reference genome (this effects chromosome order and naming (eg. M chromosome is first and its called 'chrM' instead of 'MT')).")
|
|
parser.add_option("-n", "--ncbi", dest="ncbi", action="store_true", default=False, help="Generate the output file for use with the UCSC reference genome (this effects chromosome order and naming (eg. MT chromosome is last and its called 'MT' instead of 'chrM')).")
|
|
|
|
(options, args) = parser.parse_args()
|
|
|
|
|
|
def error(msg):
|
|
print("ERROR: %s. (Rerun with -h to print help info) \n" % msg)
|
|
parser.print_help()
|
|
sys.exit(-1)
|
|
|
|
ucsc = options.ucsc
|
|
ncbi = options.ncbi
|
|
|
|
if not ucsc and not ncbi:
|
|
error("Must run with either -u or -n")
|
|
|
|
directory = options.directory
|
|
if not directory:
|
|
error("Must specify the directory using -d")
|
|
|
|
prefix = options.prefix
|
|
if not prefix:
|
|
error("Must specify filename prefix using -f")
|
|
|
|
contig_chars = []
|
|
if ucsc:
|
|
contig_chars = ["M"] + range(1,23) + ["X", "Y"]
|
|
else:
|
|
contig_chars = range(1,23) + ["X", "Y", "M"]
|
|
|
|
|
|
contigs = []
|
|
contigs += [ "chr" + str(x) for x in contig_chars ]
|
|
|
|
if ucsc: # NCBI doesn't have the _random contigs
|
|
contigs += [ "chr" + str(x) + "_random" for x in set( contig_chars ).difference(set(['M','MT',12,14,20,'X','Y'])) ] # There's no _random chromosomes for chrM,12,14,20,Y
|
|
|
|
#print(contigs)
|
|
|
|
dir_plus_prefix = os.path.join(directory,prefix)
|
|
|
|
# Update the *-big-table-header.txt header file using the header from one of the single-contig files - in case TranscriptToInfo was changed with columns being added or removed.
|
|
command = "head -n 1 " + dir_plus_prefix + ("-big-table-ucsc-%s.txt " % contigs[0]) + " > " + dir_plus_prefix + "-big-table-header.txt"
|
|
print(command)
|
|
os.system(command)
|
|
|
|
|
|
# Concatenate
|
|
header_start = open(dir_plus_prefix + "-big-table-header.txt").read().split("\t")[0]
|
|
command = "cat "
|
|
for contig in contigs:
|
|
command += dir_plus_prefix + "-big-table-ucsc-%s.txt " % contig
|
|
|
|
command += " | grep -v " + header_start
|
|
if ncbi:
|
|
command += "| perl -pe 's/^chrM(.*)$/MT\\1/i' | perl -pe 's/^chr([^p].*)$/\\1/i' " # rename chrM to MT and remove the 'chr' from chromosome names
|
|
|
|
command += " | cat " + dir_plus_prefix + "-big-table-header.txt - "
|
|
|
|
if ucsc:
|
|
command += " > " + dir_plus_prefix + "-big-table-ucsc.txt"
|
|
else:
|
|
command += " > " + dir_plus_prefix + "-big-table-ncbi.txt"
|
|
|
|
print(command)
|
|
os.system(command)
|
|
|