From 28f746b76ae37e175d6cfc55fcf2a09e790fcf0c Mon Sep 17 00:00:00 2001
From: weisburd <weisburd@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Fri, 30 Apr 2010 17:26:00 +0000
Subject: [PATCH] Added option to generate UCSC or NCBI sequence

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3283 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../ConcatTranscriptToInfoResults.py          | 33 ++++++++++++++++---
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py b/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py
index 73c3b9f97..107ac8a2f 100755
--- a/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py
+++ b/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py
@@ -9,27 +9,43 @@ run_locally = True
 
 # Init cmd-line args
 description = """
-This script creates and runs the command line that concatenates all 50 results of the GenerateTranscriptToInfo.py script into one big file that can be directly used with the GenomicAnnotator.
+This script runs a command that concatenates all 50 results of the GenerateTranscriptToInfo.py script into one big file that can be directly used by the GenomicAnnotator.
 """
 
 parser = OptionParser( description=description, usage="usage: %prog [options] ", formatter=IndentedHelpFormatterWithNL())
 
 parser.add_option("-r", "--refgene-directory", metavar="DIR", dest="refgene_dir", help="Specifies the directory that contains refGene-converted.txt", default="/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/raw/")
 
+parser.add_option("-u", "--ucsc", dest="ucsc", action="store_true", default=False, help="Generate the output file for use with the NCBI reference genome (this effects chromosome order and naming (eg. M chromosome is first and its called 'chrM' instead of 'MT')).")
+parser.add_option("-n", "--ncbi", dest="ncbi", action="store_true", default=False, help="Generate the output file for use with the UCSC reference genome (this effects chromosome order and naming (eg. MT chromosome is last and its called 'MT' instead of 'chrM')).")
+
 (options, args) = parser.parse_args()
 
+
 def error(msg):
     print("ERROR: %s.        (Rerun with -h to print help info) \n" % msg)
-    #parser.print_help()
+    parser.print_help()
     sys.exit(-1)
 
+ucsc = options.ucsc
+ncbi = options.ncbi
+
+if not ucsc and not ncbi: 
+    error("Must run with either -u or -n")    
 
 
-contig_chars = ["M"] + range(1,23) + ["X", "Y"]
+contig_chars = []
+if ucsc:
+    contig_chars = ["M"] + range(1,23) + ["X", "Y"]    
+else:
+    contig_chars = range(1,23) + ["X", "Y", "M"]
+
 
 contigs = []
 contigs += [ "chr" + str(x) for x in contig_chars ] 
-contigs += [ "chr" + str(x) + "_random" for x in set( contig_chars ).difference(set(['M',12,14,20,'X','Y']))  ]    # There's no _random chromosomes for chrM,12,14,20,Y
+
+if ucsc:  # NCBI doesn't have the _random contigs
+    contigs += [ "chr" + str(x) + "_random" for x in set( contig_chars ).difference(set(['M','MT',12,14,20,'X','Y']))  ]    # There's no _random chromosomes for chrM,12,14,20,Y
 
 #print(contigs)
 
@@ -48,9 +64,16 @@ for contig in contigs:
     command += options.refgene_dir+"/refGene-big-table-ucsc-%s.txt " % contig
     
 command += " | grep -v " + header_start 
+if ncbi:
+    command += "| perl -pe 's/^chrM(.*)$/MT\\1/i' | perl -pe 's/^chr([^p].*)$/\\1/i' "    # rename chrM to MT and remove the 'chr' from chromosome names 
+
 command += " | cat " + options.refgene_dir+"/refGene-big-table-header.txt - "
 
-command += " > " + options.refgene_dir+"/refGene-big-table-ucsc.txt"        
+if ucsc:
+    command += " > " + options.refgene_dir+"/refGene-big-table-ucsc.txt"        
+else:
+    command += " > " + options.refgene_dir+"/refGene-big-table-ncbi.txt"        
+
 print(command)
 os.system(command)