gatk-3.8/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResul...

import sys
import os
import re
import traceback
from optparse import OptionParser, OptionGroup
from IndentedHelpFormatterWithNL import *


# Init cmd-line args
description = """
This script runs a command that concatenates all 50 per-chromosome files generated by the GenerateTranscriptToInfo.py script into one big file that can be directly used by the GenomicAnnotator.
"""

parser = OptionParser( description=description, usage="usage: %prog [options] ", formatter=IndentedHelpFormatterWithNL())

parser.add_option("-d", "--directory", metavar="DIR", dest="directory", help="Specifies the directory that contains the files to concatenate (eg. /humgen/gsa-hpprojects/GATK/data/Annotations/refseq/raw/)")
parser.add_option("-f", "--filename-prefix", metavar="PREFIX", dest="prefix", help="Filename prefix (eg. refGene)")
parser.add_option("-u", "--ucsc", dest="ucsc", action="store_true", default=False, help="Generate the output file for use with the NCBI reference genome (this effects chromosome order and naming (eg. M chromosome is first and its called 'chrM' instead of 'MT')).")
parser.add_option("-n", "--ncbi", dest="ncbi", action="store_true", default=False, help="Generate the output file for use with the UCSC reference genome (this effects chromosome order and naming (eg. MT chromosome is last and its called 'MT' instead of 'chrM')).")

(options, args) = parser.parse_args()


def error(msg):
    print("ERROR: %s.        (Rerun with -h to print help info) \n" % msg)
    parser.print_help()
    sys.exit(-1)

ucsc = options.ucsc
ncbi = options.ncbi

if not ucsc and not ncbi:
    error("Must run with either -u or -n")

directory = options.directory
if not directory:
    error("Must specify the directory using -d")

prefix = options.prefix
if not prefix:
    error("Must specify filename prefix using -f")

contig_chars = []
if ucsc:
    contig_chars = ["M"] + range(1,23) + ["X", "Y"]
else:
    contig_chars = range(1,23) + ["X", "Y", "M"]


contigs = []
contigs += [ "chr" + str(x) for x in contig_chars ]

if ucsc:  # NCBI doesn't have the _random contigs
    contigs += [ "chr" + str(x) + "_random" for x in set( contig_chars ).difference(set(['M','MT',12,14,20,'X','Y']))  ]    # There's no _random chromosomes for chrM,12,14,20,Y

#print(contigs)

dir_plus_prefix = os.path.join(directory,prefix)

# Update the *-big-table-header.txt header file using the header from one of the single-contig files - in case TranscriptToInfo was changed with columns being added or removed.
command = "head -n 1 " + dir_plus_prefix + ("-big-table-ucsc-%s.txt " % contigs[0]) + " > " + dir_plus_prefix + "-big-table-header.txt"
print(command)
os.system(command)


# Concatenate
header_start = open(dir_plus_prefix + "-big-table-header.txt").read().split("\t")[0]
command = "cat "
for contig in contigs:
    command += dir_plus_prefix + "-big-table-ucsc-%s.txt " % contig

command += " | grep -v " + header_start
if ncbi:
    command += "| perl -pe 's/^chrM(.*)$/MT\\1/i' | perl -pe 's/^chr([^p].*)$/\\1/i' "    # rename chrM to MT and remove the 'chr' from chromosome names

command += " | cat " + dir_plus_prefix + "-big-table-header.txt - "

if ucsc:
    command += " > " + dir_plus_prefix + "-big-table-ucsc.txt"
else:
    command += " > " + dir_plus_prefix + "-big-table-ncbi.txt"

print(command)
os.system(command)