From 61ae00c7bf2796722100d0e4682132b7af1ba018 Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 5 Jun 2009 01:26:10 +0000 Subject: [PATCH] Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a --- python/LogisticRegressionByReadGroup.py | 72 ++++++++++++++----------- python/RecalQual.py | 51 ++++++++++++++---- 2 files changed, 80 insertions(+), 43 deletions(-) diff --git a/python/LogisticRegressionByReadGroup.py b/python/LogisticRegressionByReadGroup.py index 70bff487c..b6299d3ab 100755 --- a/python/LogisticRegressionByReadGroup.py +++ b/python/LogisticRegressionByReadGroup.py @@ -2,9 +2,6 @@ import os,sys -R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript" -logistic_regression_script="/humgen/gsa-scr1/hanna/src/StingWorking/R/logistic_regression.R" - def exit(msg,errorlevel): "Exit the program with the specified message and error code." print msg @@ -25,7 +22,7 @@ def read_header(source_file): exit("Input file is in invalid format. First two columns should be ,",1) return header -def create_intermediate_file(source_file,read_group,dinuc): +def create_intermediate_file(source_file,header,read_group,dinuc): "Create an intermediate file for a particular read group / dinuc from a given source file" base = source_file.name[:source_file.name.rfind('.csv')] intermediate_filename = "%s.%s.%s.csv" % (base,read_group,dinuc) @@ -43,7 +40,8 @@ def open_target_file(target_filename): target_file.write("\n") return target_file -def process_file(source_file,read_group,dinuc,target_file): +def process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script): + "Process the contents of an intermediate file. An intermediate file is the specific data arranged per read group, per dinuc." base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group regression_command = ' '.join((R_exe,logistic_regression_script,base,base,dinuc)) print "Running " + regression_command @@ -54,37 +52,47 @@ def process_file(source_file,read_group,dinuc,target_file): parameters_file = open(parameters_filename,'r') parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ') target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n') - os.remove('.'.join((base,dinuc,'csv'))) - os.remove('.'.join((base,dinuc,'parameters'))) + # optionally remove the input and output files. For now, keep them around for debugging purposes. + # os.remove('.'.join((base,dinuc,'csv'))) + # os.remove('.'.join((base,dinuc,'parameters'))) -if len(sys.argv) < 3: - exit("Usage: logistic_regression ",1) +def compute_logistic_regression(source_filename,target_filename,R_exe,logistic_regression_script): + "Do the heavy lifting. Group covariant data into individual output files, compute the logistic regression, and format the results." + source_file = open_source_file(source_filename) + target_file = open_target_file(target_filename) -source_file = open_source_file(sys.argv[1]) -target_file = open_target_file(sys.argv[2]) + header = read_header(source_file) -header = read_header(source_file) + intermediate_file = None + read_group = None + dinuc = None -intermediate_file = None -read_group = None -dinuc = None + for data_line in source_file: + data_line = data_line.strip() + if len(data_line) == 0: + continue + data = data_line.split(',') + if read_group != data[0] or dinuc != data[1]: + if intermediate_file: + intermediate_file.close() + process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script) + read_group,dinuc = data[0:2] + intermediate_file = create_intermediate_file(source_file,header,read_group,dinuc) + intermediate_file.write(','.join(data[2:])+'\n') -for data_line in source_file: - data_line = data_line.strip() - if len(data_line) == 0: - continue - data = data_line.split(',') - if read_group != data[0] or dinuc != data[1]: - if intermediate_file: - intermediate_file.close() - process_file(source_file,read_group,dinuc,target_file) - read_group,dinuc = data[0:2] - intermediate_file = create_intermediate_file(source_file,read_group,dinuc) - intermediate_file.write(','.join(data[2:])+'\n') + if intermediate_file: + intermediate_file.close() + process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script) -if intermediate_file: - intermediate_file.close() - process_file(source_file,read_group,dinuc,target_file) + source_file.close() + target_file.close() -source_file.close() -target_file.close() +if __name__ == "__main__": + # set up sensible defaults for the locations of R and the logistic regression script + R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript" + logistic_regression_script="resources/logistic_regression.R" + + if len(sys.argv) < 3: + exit("Usage: logistic_regression ",1) + + compute_logistic_regression(sys.argv[1],sys.argv[2],R_exe,logistic_regression_script) diff --git a/python/RecalQual.py b/python/RecalQual.py index 5072bb445..1bfbc6ac2 100755 --- a/python/RecalQual.py +++ b/python/RecalQual.py @@ -3,10 +3,10 @@ # Executable files. Please update these to match the installed locations of your tools. samtools_exe='/seq/dirseq/samtools/current/samtools' java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java' -R_exe='/util/bin/R' -GATK_jar='/humgen/gsa-scr1/hanna/src/StingWorking/dist/GenomeAnalysisTK.jar' +R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript" -# Location of the resource files distributed with the recalibration tool. +# Location of the resource files distributed with the recalibration tool. +# If editing, please end this variable with a trailing slash. resources='resources/' # Where does the reference live? @@ -18,7 +18,12 @@ reference_fai = reference_base + '.fasta.fai' # Where does DBSNP live? dbsnp = resources + 'dbsnp.rod.out' +# Where are the application files required to run the recalibration +gatk = resources + 'gatk/GenomeAnalysisTK.jar' +logistic_regression_script = resources + 'logistic_regression.R' + import sys,os +import LogisticRegressionByReadGroup def exit(msg,errorcode): print msg @@ -50,19 +55,43 @@ check_input_file_available(reference_fai,'reference index file') # check that the dbsnp is available check_input_file_available(dbsnp,'dbsnp file') +# sanity check that the software is available +check_input_file_available(samtools_exe,'samtools') +check_input_file_available(java_exe,'java') +check_input_file_available(R_exe,'R') +check_input_file_available(gatk,'Genome Analysis Toolkit') +check_input_file_available(logistic_regression_script,'logistic regression script') + # make an output directory for temporary files if not os.path.isdir('output'): os.mkdir('output') # assemble the required program arguments -gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',GATK_jar,'-R',reference,'--DBSNP',dbsnp,'-l INFO','-L chrM')) - -generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/output','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1')) -compute_logistic_regression = ' '.join(('python','LogisticRegressionByReadGroup.py','output/output.covariate_counts.csv','output/linear_regression_results.out')) +gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO','-L chrM')) +generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1')) apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam)) index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam)) -os.system(generate_covariates) -os.system(compute_logistic_regression) -os.system(apply_logistic_regression) -os.system(index_calibrated_bamfile) +# generate the covariates +print 'generating covariates' +returncode = os.system(generate_covariates) +if returncode != 0: + exit('Unable to generate covariates',1) + +# compute the logistic regression +print 'computing the logistic regression' +LogisticRegressionByReadGroup.compute_logistic_regression('output/initial.covariate_counts.csv','output/linear_regression_results.out',R_exe,logistic_regression_script) + +# apply the logistic regression, writing the output data to calibrated_bam +print 'applying the correction to the reads' +returncode = os.system(apply_logistic_regression) +if returncode != 0: + exit('Unable to apply logistic regression',1) + +# index the calibrated bam +print 'indexing the calibrated bam' +returncode = os.system(index_calibrated_bamfile) +if returncode != 0: + exit('Unable to index calibrated bamfile',1) + +print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam