From 61ae00c7bf2796722100d0e4682132b7af1ba018 Mon Sep 17 00:00:00 2001
From: hanna <hanna@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Fri, 5 Jun 2009 01:26:10 +0000
Subject: [PATCH] Lots of cleanup.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a
---
 python/LogisticRegressionByReadGroup.py | 72 ++++++++++++++-----------
 python/RecalQual.py                     | 51 ++++++++++++++----
 2 files changed, 80 insertions(+), 43 deletions(-)
diff --git a/python/LogisticRegressionByReadGroup.py b/python/LogisticRegressionByReadGroup.py
index 70bff487c..b6299d3ab 100755
--- a/python/LogisticRegressionByReadGroup.py
+++ b/python/LogisticRegressionByReadGroup.py
@@ -2,9 +2,6 @@
 
 import os,sys
 
-R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
-logistic_regression_script="/humgen/gsa-scr1/hanna/src/StingWorking/R/logistic_regression.R"
-
 def exit(msg,errorlevel):
     "Exit the program with the specified message and error code."
     print msg
@@ -25,7 +22,7 @@ def read_header(source_file):
         exit("Input file is in invalid format.  First two columns should be <read group>,<dinucleotide>",1)
     return header
 
-def create_intermediate_file(source_file,read_group,dinuc):
+def create_intermediate_file(source_file,header,read_group,dinuc):
     "Create an intermediate file for a particular read group / dinuc from a given source file"
     base = source_file.name[:source_file.name.rfind('.csv')]
     intermediate_filename = "%s.%s.%s.csv" % (base,read_group,dinuc)
@@ -43,7 +40,8 @@ def open_target_file(target_filename):
     target_file.write("\n")
     return target_file
 
-def process_file(source_file,read_group,dinuc,target_file):    
+def process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script):    
+    "Process the contents of an intermediate file.  An intermediate file is the specific data arranged per read group, per dinuc."
     base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group
     regression_command = ' '.join((R_exe,logistic_regression_script,base,base,dinuc))
     print "Running " + regression_command
@@ -54,37 +52,47 @@ def process_file(source_file,read_group,dinuc,target_file):
     parameters_file = open(parameters_filename,'r')
     parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ')
     target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n')
-    os.remove('.'.join((base,dinuc,'csv')))
-    os.remove('.'.join((base,dinuc,'parameters')))
+    # optionally remove the input and output files.  For now, keep them around for debugging purposes.
+    # os.remove('.'.join((base,dinuc,'csv')))
+    # os.remove('.'.join((base,dinuc,'parameters')))
     
-if len(sys.argv) < 3:
-    exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)
+def compute_logistic_regression(source_filename,target_filename,R_exe,logistic_regression_script):
+    "Do the heavy lifting.  Group covariant data into individual output files, compute the logistic regression, and format the results."
+    source_file = open_source_file(source_filename)
+    target_file = open_target_file(target_filename)
 
-source_file = open_source_file(sys.argv[1])
-target_file = open_target_file(sys.argv[2])
+    header = read_header(source_file)
 
-header = read_header(source_file)
+    intermediate_file = None
+    read_group = None
+    dinuc = None
 
-intermediate_file = None
-read_group = None
-dinuc = None
+    for data_line in source_file:
+        data_line = data_line.strip()
+        if len(data_line) == 0:
+            continue
+        data = data_line.split(',')
+        if read_group != data[0] or dinuc != data[1]:
+            if intermediate_file:
+                intermediate_file.close()
+                process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
+            read_group,dinuc = data[0:2]
+            intermediate_file = create_intermediate_file(source_file,header,read_group,dinuc)
+        intermediate_file.write(','.join(data[2:])+'\n')
 
-for data_line in source_file:
-    data_line = data_line.strip()
-    if len(data_line) == 0:
-        continue
-    data = data_line.split(',')
-    if read_group != data[0] or dinuc != data[1]:
-        if intermediate_file:
-           intermediate_file.close()
-           process_file(source_file,read_group,dinuc,target_file)
-        read_group,dinuc = data[0:2]
-        intermediate_file = create_intermediate_file(source_file,read_group,dinuc)
-    intermediate_file.write(','.join(data[2:])+'\n')
+    if intermediate_file:
+        intermediate_file.close()
+        process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
 
-if intermediate_file:
-    intermediate_file.close()
-    process_file(source_file,read_group,dinuc,target_file)
+    source_file.close()
+    target_file.close()
 
-source_file.close()
-target_file.close()
+if __name__ == "__main__":
+    # set up sensible defaults for the locations of R and the logistic regression script
+    R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
+    logistic_regression_script="resources/logistic_regression.R"
+
+    if len(sys.argv) < 3:
+        exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)
+
+    compute_logistic_regression(sys.argv[1],sys.argv[2],R_exe,logistic_regression_script)
diff --git a/python/RecalQual.py b/python/RecalQual.py
index 5072bb445..1bfbc6ac2 100755
--- a/python/RecalQual.py
+++ b/python/RecalQual.py
@@ -3,10 +3,10 @@
 # Executable files.  Please update these to match the installed locations of your tools.
 samtools_exe='/seq/dirseq/samtools/current/samtools'
 java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
-R_exe='/util/bin/R'
-GATK_jar='/humgen/gsa-scr1/hanna/src/StingWorking/dist/GenomeAnalysisTK.jar'
+R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
 
-# Location of the resource files distributed with the recalibration tool.
+# Location of the resource files distributed with the recalibration tool.  
+# If editing, please end this variable with a trailing slash.
 resources='resources/'
 
 # Where does the reference live?
@@ -18,7 +18,12 @@ reference_fai  = reference_base + '.fasta.fai'
 # Where does DBSNP live?
 dbsnp = resources + 'dbsnp.rod.out'
 
+# Where are the application files required to run the recalibration
+gatk = resources + 'gatk/GenomeAnalysisTK.jar'
+logistic_regression_script = resources + 'logistic_regression.R'
+
 import sys,os
+import LogisticRegressionByReadGroup
 
 def exit(msg,errorcode):
     print msg
@@ -50,19 +55,43 @@ check_input_file_available(reference_fai,'reference index file')
 # check that the dbsnp is available
 check_input_file_available(dbsnp,'dbsnp file')
 
+# sanity check that the software is available
+check_input_file_available(samtools_exe,'samtools')
+check_input_file_available(java_exe,'java')
+check_input_file_available(R_exe,'R')
+check_input_file_available(gatk,'Genome Analysis Toolkit')
+check_input_file_available(logistic_regression_script,'logistic regression script')
+
 # make an output directory for temporary files
 if not os.path.isdir('output'):
     os.mkdir('output')
 
 # assemble the required program arguments
-gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',GATK_jar,'-R',reference,'--DBSNP',dbsnp,'-l INFO','-L chrM'))
-
-generate_covariates         = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/output','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
-compute_logistic_regression = ' '.join(('python','LogisticRegressionByReadGroup.py','output/output.covariate_counts.csv','output/linear_regression_results.out'))
+gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO','-L chrM'))
+generate_covariates         = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
 apply_logistic_regression   = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam))
 index_calibrated_bamfile    = ' '.join((samtools_exe,'index',calibrated_bam))
 
-os.system(generate_covariates)
-os.system(compute_logistic_regression)
-os.system(apply_logistic_regression)
-os.system(index_calibrated_bamfile)
+# generate the covariates
+print 'generating covariates'
+returncode = os.system(generate_covariates)
+if returncode != 0:
+    exit('Unable to generate covariates',1)
+
+# compute the logistic regression
+print 'computing the logistic regression'
+LogisticRegressionByReadGroup.compute_logistic_regression('output/initial.covariate_counts.csv','output/linear_regression_results.out',R_exe,logistic_regression_script)
+
+# apply the logistic regression, writing the output data to calibrated_bam
+print 'applying the correction to the reads'
+returncode = os.system(apply_logistic_regression)
+if returncode != 0:
+    exit('Unable to apply logistic regression',1)
+
+# index the calibrated bam
+print 'indexing the calibrated bam'
+returncode = os.system(index_calibrated_bamfile)
+if returncode != 0:
+    exit('Unable to index calibrated bamfile',1)
+
+print 'Recalibration complete!  Calibrated bam is available here: ' + calibrated_bam