From 04e14ef85a8c91455453617bea179580db2e5d58 Mon Sep 17 00:00:00 2001
From: weisburd <weisburd@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Tue, 18 May 2010 02:45:11 +0000
Subject: [PATCH] Refactored so it could be used for knownGene and CCDS as well
 as refGene

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3373 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../ConcatTranscriptToInfoResults.py          | 34 ++++++-----
 .../GenerateTranscriptToInfo.py               | 60 +++++++++++++------
 2 files changed, 62 insertions(+), 32 deletions(-)

diff --git a/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py b/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py
index 107ac8a2f..f3f7ad5c9 100755
--- a/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py
+++ b/python/genomicAnnotatorScripts/ConcatTranscriptToInfoResults.py
@@ -5,17 +5,16 @@ import traceback
 from optparse import OptionParser, OptionGroup
 from IndentedHelpFormatterWithNL import *
 
-run_locally = True
 
 # Init cmd-line args
 description = """
-This script runs a command that concatenates all 50 results of the GenerateTranscriptToInfo.py script into one big file that can be directly used by the GenomicAnnotator.
+This script runs a command that concatenates all 50 per-chromosome files generated by the GenerateTranscriptToInfo.py script into one big file that can be directly used by the GenomicAnnotator.
 """
 
 parser = OptionParser( description=description, usage="usage: %prog [options] ", formatter=IndentedHelpFormatterWithNL())
 
-parser.add_option("-r", "--refgene-directory", metavar="DIR", dest="refgene_dir", help="Specifies the directory that contains refGene-converted.txt", default="/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/raw/")
-
+parser.add_option("-d", "--directory", metavar="DIR", dest="directory", help="Specifies the directory that contains the files to concatenate (eg. /humgen/gsa-hpprojects/GATK/data/Annotations/refseq/raw/)")
+parser.add_option("-f", "--filename-prefix", metavar="PREFIX", dest="prefix", help="Filename prefix (eg. refGene)")
 parser.add_option("-u", "--ucsc", dest="ucsc", action="store_true", default=False, help="Generate the output file for use with the NCBI reference genome (this effects chromosome order and naming (eg. M chromosome is first and its called 'chrM' instead of 'MT')).")
 parser.add_option("-n", "--ncbi", dest="ncbi", action="store_true", default=False, help="Generate the output file for use with the UCSC reference genome (this effects chromosome order and naming (eg. MT chromosome is last and its called 'MT' instead of 'chrM')).")
 
@@ -33,6 +32,13 @@ ncbi = options.ncbi
 if not ucsc and not ncbi: 
     error("Must run with either -u or -n")    
 
+directory = options.directory
+if not directory:
+    error("Must specify the directory using -d")
+
+prefix = options.prefix
+if not prefix:
+    error("Must specify filename prefix using -f")
 
 contig_chars = []
 if ucsc:
@@ -49,30 +55,30 @@ if ucsc:  # NCBI doesn't have the _random contigs
 
 #print(contigs)
 
+dir_plus_prefix = os.path.join(directory,prefix)
 
-
-# Update the refGene-big-table-header.txt header file using the header from one of the single-contig files.
-command = "head -n 1 " + (options.refgene_dir + "/refGene-big-table-ucsc-%s.txt " % contigs[0]) + " > " + options.refgene_dir + "/refGene-big-table-header.txt"
-print(command)
-os.system(command)
+# Update the *-big-table-header.txt header file using the header from one of the single-contig files.
+#command = "head -n 1 " + dir_plus_prefix + ("-big-table-ucsc-%s.txt " % contigs[0]) + " > " + dir_plus_prefix + "-big-table-header.txt"
+#print(command)
+#os.system(command)
 
 
 # Concatenate
-header_start = open(options.refgene_dir+"/refGene-big-table-header.txt").read().split("\t")[0]
+header_start = open(dir_plus_prefix + "-big-table-header.txt").read().split("\t")[0]
 command = "cat "
 for contig in contigs:
-    command += options.refgene_dir+"/refGene-big-table-ucsc-%s.txt " % contig
+    command += dir_plus_prefix + "-big-table-ucsc-%s.txt " % contig
     
 command += " | grep -v " + header_start 
 if ncbi:
     command += "| perl -pe 's/^chrM(.*)$/MT\\1/i' | perl -pe 's/^chr([^p].*)$/\\1/i' "    # rename chrM to MT and remove the 'chr' from chromosome names 
 
-command += " | cat " + options.refgene_dir+"/refGene-big-table-header.txt - "
+command += " | cat " + dir_plus_prefix + "-big-table-header.txt - "
 
 if ucsc:
-    command += " > " + options.refgene_dir+"/refGene-big-table-ucsc.txt"        
+    command += " > " + dir_plus_prefix + "-big-table-ucsc.txt"        
 else:
-    command += " > " + options.refgene_dir+"/refGene-big-table-ncbi.txt"        
+    command += " > " + dir_plus_prefix + "-big-table-ncbi.txt"        
 
 print(command)
 os.system(command)
diff --git a/python/genomicAnnotatorScripts/GenerateTranscriptToInfo.py b/python/genomicAnnotatorScripts/GenerateTranscriptToInfo.py
index 2dba17882..16a6700a7 100755
--- a/python/genomicAnnotatorScripts/GenerateTranscriptToInfo.py
+++ b/python/genomicAnnotatorScripts/GenerateTranscriptToInfo.py
@@ -5,22 +5,25 @@ import traceback
 from optparse import OptionParser, OptionGroup
 from IndentedHelpFormatterWithNL import *
 
-run_locally = False
 
 # Init cmd-line args
 description = """
-This script submits LSF jobs that run the GATK TranscriptToInfo Walker on each individual chromosome. This reduces the overall runtime to a managable ammount (eg. < 1 day).
+This script submits LSF jobs that run the GATK TranscriptToInfo Walker on each individual chromosome. This reduces the overall runtime to a manageable ammount (eg. < 1 day).
 
 NOTE: This script must be run in the top level dir of your GATK checkout area.
 """
 
 parser = OptionParser( description=description, usage="usage: %prog [options] ", formatter=IndentedHelpFormatterWithNL())
 
-parser.add_option("-d", "--refgene-directory", metavar="DIR", dest="refgene_dir", help="Specifies the directory that contains refGene-converted.txt", default="/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/hg18_b36/raw/")
 
-parser.add_option("-p", "--print", dest="output", action="store_true", default=False, help="Only print the commands to standard out, don't actually execute them yet.")
+parser.add_option("-i", "--transcript-table", metavar="PATH", dest="transcript_table", help="Path of the file that contains the transcript data in AnnotatorROD format (eg. /humgen/gsa-hpprojects/GATK/data/Annotations/refseq/raw/refGene-converted.txt)")
+parser.add_option("-f", "--output-filename-prefix", metavar="PREFIX", dest="prefix", help="Output filename prefix (eg. refGene)")
+parser.add_option("-p", "--print", dest="justprint", action="store_true", default=False, help="Only print the commands to standard out, don't actually execute them yet.")
 parser.add_option("-e", "--execute", dest="execute", action="store_true", default=False, help="Executes the commands. This flag acts as a confirmation that you want to proceed with launching the processes.")
 parser.add_option("-l", "--locally", dest="run_locally", action="store_true", default=False, help="Don't submit the commands to LSF. Run them sequentially on the current machine.")
+parser.add_option("-R", "--reference", metavar="PATH", dest="reference", help="Specifies the path of the reference file to use.", default="/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
+parser.add_option("-n", "--gene-name-columns", dest="gene_name_columns", metavar="GENE_NAMES", help="Comma-separated list of column names that contain gene names. This arg is passed through to the GenomicAnnotator. The GenomicAnnotator docs have more details on this.")
+parser.add_option("-q", "--queue", dest="queue", metavar="QUEUE", help="Specifies the LSF queue to use.", default="solexa")
 
 (options, args) = parser.parse_args()
 
@@ -30,14 +33,32 @@ def error(msg):
     sys.exit(-1)
 
 run = options.execute
-output = options.output
+justprint = options.justprint
 run_locally = options.run_locally
 
-if not run and not output: 
+if not run and not justprint:
     error("Must run with either -p or -e")    
 
+transcript_table = options.transcript_table
+if not transcript_table or not os.access(transcript_table, os.R_OK):
+    error("Must specify a valid transcript table file path using -i")    
 
+gene_name_columns = options.gene_name_columns
+if not gene_name_columns:
+    error("Must specify gene name columns using -n")
 
+output_file_prefix = options.prefix
+if not output_file_prefix:
+    error("Must specify the output file prefix using -f")
+
+reference = options.reference
+if not os.access(reference, os.R_OK):
+    error("Couldn't access reference file: "+ reference)
+
+queue = options.queue
+
+transcript_dir = os.path.dirname(transcript_table)
+logs_dir = os.path.join(transcript_dir,"logs")
 
 contig_chars = ["M"] + range(1,23) + ["X", "Y"]
 
@@ -45,29 +66,32 @@ contigs = []
 contigs += [ "chr" + str(x) for x in contig_chars ] 
 contigs += [ "chr" + str(x) + "_random" for x in set( contig_chars ).difference(set(['M',12,14,20,'X','Y']))  ]    # There are no "_random" chromosomes for chrM,12,14,20,Y
 
-#print(contigs)
-
 
 if run:
     print("Deleting any previous logs...")
-    os.system("rm " + options.refgene_dir+"/logs/bsub_*_log.txt")
-for contig in contigs:
-        
+    os.system("rm " + os.path.join(logs_dir,"bsub_*_log.txt"))
+    os.system("mkdir " + logs_dir)
+
+for contig in contigs:        
     if contig.count("random") or contig.lower().count("chrm"):
-        MEMORY_USAGE = 10  #Gigabytes
+        MEMORY_USAGE = 10  # Gigabytes
         EXCLUSIVE = ""
     else:
         if run_locally:
-            MEMORY_USAGE = 32
+            MEMORY_USAGE = 64
         else:
-            MEMORY_USAGE = 15
+            MEMORY_USAGE = 32
         EXCLUSIVE = ""
             
-    command = "java -Xmx"+str(MEMORY_USAGE)+"g -jar dist/GenomeAnalysisTK.jar -T TranscriptToInfo -l info -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -B refgene,AnnotatorInputTable,"+options.refgene_dir+"/refGene-converted.txt -o "+options.refgene_dir+"/refGene-big-table-ucsc-%s.txt -L %s:1+ " % (contig, contig)
-        #print(command)
-    if not run_locally:
-        command = "bsub "+EXCLUSIVE+" -q solexa -R \"rusage[mem="+str(MEMORY_USAGE)+"]\" -o "+options.refgene_dir+"/logs/bsub_"+contig+"_log.txt "+command
+    command = "java -Xmx"+str(MEMORY_USAGE)+"g -jar dist/GenomeAnalysisTK.jar -T TranscriptToInfo -l info -R " + reference + " -B transcripts,AnnotatorInputTable,"+transcript_table+" -n "+gene_name_columns+" -o "+ os.path.join(transcript_dir,output_file_prefix) +"-big-table-ucsc-%s.txt -L %s:1+ " % (contig, contig)
     
+
+    if run_locally:
+        command += " > " + os.path.join(logs_dir,contig+"_log.txt")
+    else:
+        command = "bsub "+EXCLUSIVE+" -q " + queue + " -R \"rusage[mem="+str(MEMORY_USAGE)+"]\" -o " + os.path.join(logs_dir,contig+"_log.txt") + " "  + command
+    
+
     
     if run:
         print("Executing: " + command)