From c214056d884edc405e946e5d66b9085e4d1bcf4a Mon Sep 17 00:00:00 2001 From: weisburd Date: Fri, 30 Apr 2010 15:47:08 +0000 Subject: [PATCH] Script for concatenating results of GenerateTranscriptToInfo.py into one big file git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3279 348d0f76-0448-11de-a6fe-93d51630548a --- .../ConcatTranscriptToInfoResults.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100755 python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py diff --git a/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py b/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py new file mode 100755 index 000000000..73c3b9f97 --- /dev/null +++ b/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py @@ -0,0 +1,56 @@ +import sys +import os +import re +import traceback +from optparse import OptionParser, OptionGroup +from IndentedHelpFormatterWithNL import * + +run_locally = True + +# Init cmd-line args +description = """ +This script creates and runs the command line that concatenates all 50 results of the GenerateTranscriptToInfo.py script into one big file that can be directly used with the GenomicAnnotator. +""" + +parser = OptionParser( description=description, usage="usage: %prog [options] ", formatter=IndentedHelpFormatterWithNL()) + +parser.add_option("-r", "--refgene-directory", metavar="DIR", dest="refgene_dir", help="Specifies the directory that contains refGene-converted.txt", default="/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/raw/") + +(options, args) = parser.parse_args() + +def error(msg): + print("ERROR: %s. (Rerun with -h to print help info) \n" % msg) + #parser.print_help() + sys.exit(-1) + + + +contig_chars = ["M"] + range(1,23) + ["X", "Y"] + +contigs = [] +contigs += [ "chr" + str(x) for x in contig_chars ] +contigs += [ "chr" + str(x) + "_random" for x in set( contig_chars ).difference(set(['M',12,14,20,'X','Y'])) ] # There's no _random chromosomes for chrM,12,14,20,Y + +#print(contigs) + + + +# Update the refGene-big-table-header.txt header file using the header from one of the single-contig files. +command = "head -n 1 " + (options.refgene_dir + "/refGene-big-table-ucsc-%s.txt " % contigs[0]) + " > " + options.refgene_dir + "/refGene-big-table-header.txt" +print(command) +os.system(command) + + +# Concatenate +header_start = open(options.refgene_dir+"/refGene-big-table-header.txt").read().split("\t")[0] +command = "cat " +for contig in contigs: + command += options.refgene_dir+"/refGene-big-table-ucsc-%s.txt " % contig + +command += " | grep -v " + header_start +command += " | cat " + options.refgene_dir+"/refGene-big-table-header.txt - " + +command += " > " + options.refgene_dir+"/refGene-big-table-ucsc.txt" +print(command) +os.system(command) +