2010-04-30 23:47:08 +08:00
import sys
import os
import re
import traceback
from optparse import OptionParser , OptionGroup
from IndentedHelpFormatterWithNL import *
# Init cmd-line args
description = """
2010-05-18 10:45:11 +08:00
This script runs a command that concatenates all 50 per - chromosome files generated by the GenerateTranscriptToInfo . py script into one big file that can be directly used by the GenomicAnnotator .
2010-04-30 23:47:08 +08:00
"""
parser = OptionParser ( description = description , usage = " usage: % prog [options] " , formatter = IndentedHelpFormatterWithNL ( ) )
2010-05-18 10:45:11 +08:00
parser . add_option ( " -d " , " --directory " , metavar = " DIR " , dest = " directory " , help = " Specifies the directory that contains the files to concatenate (eg. /humgen/gsa-hpprojects/GATK/data/Annotations/refseq/raw/) " )
parser . add_option ( " -f " , " --filename-prefix " , metavar = " PREFIX " , dest = " prefix " , help = " Filename prefix (eg. refGene) " )
2010-05-01 01:26:00 +08:00
parser . add_option ( " -u " , " --ucsc " , dest = " ucsc " , action = " store_true " , default = False , help = " Generate the output file for use with the NCBI reference genome (this effects chromosome order and naming (eg. M chromosome is first and its called ' chrM ' instead of ' MT ' )). " )
parser . add_option ( " -n " , " --ncbi " , dest = " ncbi " , action = " store_true " , default = False , help = " Generate the output file for use with the UCSC reference genome (this effects chromosome order and naming (eg. MT chromosome is last and its called ' MT ' instead of ' chrM ' )). " )
2010-04-30 23:47:08 +08:00
( options , args ) = parser . parse_args ( )
2010-05-01 01:26:00 +08:00
2010-04-30 23:47:08 +08:00
def error ( msg ) :
print ( " ERROR: %s . (Rerun with -h to print help info) \n " % msg )
2010-05-01 01:26:00 +08:00
parser . print_help ( )
2010-04-30 23:47:08 +08:00
sys . exit ( - 1 )
2010-05-01 01:26:00 +08:00
ucsc = options . ucsc
ncbi = options . ncbi
if not ucsc and not ncbi :
error ( " Must run with either -u or -n " )
2010-05-18 10:45:11 +08:00
directory = options . directory
if not directory :
error ( " Must specify the directory using -d " )
prefix = options . prefix
if not prefix :
error ( " Must specify filename prefix using -f " )
2010-04-30 23:47:08 +08:00
2010-05-01 01:26:00 +08:00
contig_chars = [ ]
if ucsc :
contig_chars = [ " M " ] + range ( 1 , 23 ) + [ " X " , " Y " ]
else :
contig_chars = range ( 1 , 23 ) + [ " X " , " Y " , " M " ]
2010-04-30 23:47:08 +08:00
contigs = [ ]
contigs + = [ " chr " + str ( x ) for x in contig_chars ]
2010-05-01 01:26:00 +08:00
if ucsc : # NCBI doesn't have the _random contigs
contigs + = [ " chr " + str ( x ) + " _random " for x in set ( contig_chars ) . difference ( set ( [ ' M ' , ' MT ' , 12 , 14 , 20 , ' X ' , ' Y ' ] ) ) ] # There's no _random chromosomes for chrM,12,14,20,Y
2010-04-30 23:47:08 +08:00
#print(contigs)
2010-05-18 10:45:11 +08:00
dir_plus_prefix = os . path . join ( directory , prefix )
2010-04-30 23:47:08 +08:00
2010-05-18 10:45:11 +08:00
# Update the *-big-table-header.txt header file using the header from one of the single-contig files.
#command = "head -n 1 " + dir_plus_prefix + ("-big-table-ucsc-%s.txt " % contigs[0]) + " > " + dir_plus_prefix + "-big-table-header.txt"
#print(command)
#os.system(command)
2010-04-30 23:47:08 +08:00
# Concatenate
2010-05-18 10:45:11 +08:00
header_start = open ( dir_plus_prefix + " -big-table-header.txt " ) . read ( ) . split ( " \t " ) [ 0 ]
2010-04-30 23:47:08 +08:00
command = " cat "
for contig in contigs :
2010-05-18 10:45:11 +08:00
command + = dir_plus_prefix + " -big-table-ucsc- %s .txt " % contig
2010-04-30 23:47:08 +08:00
command + = " | grep -v " + header_start
2010-05-01 01:26:00 +08:00
if ncbi :
command + = " | perl -pe ' s/^chrM(.*)$/MT \\ 1/i ' | perl -pe ' s/^chr([^p].*)$/ \\ 1/i ' " # rename chrM to MT and remove the 'chr' from chromosome names
2010-05-18 10:45:11 +08:00
command + = " | cat " + dir_plus_prefix + " -big-table-header.txt - "
2010-04-30 23:47:08 +08:00
2010-05-01 01:26:00 +08:00
if ucsc :
2010-05-18 10:45:11 +08:00
command + = " > " + dir_plus_prefix + " -big-table-ucsc.txt "
2010-05-01 01:26:00 +08:00
else :
2010-05-18 10:45:11 +08:00
command + = " > " + dir_plus_prefix + " -big-table-ncbi.txt "
2010-05-01 01:26:00 +08:00
2010-04-30 23:47:08 +08:00
print ( command )
os . system ( command )