Lots of cleanup.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9689bb3331
commit
61ae00c7bf
|
|
@ -2,9 +2,6 @@
|
|||
|
||||
import os,sys
|
||||
|
||||
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
||||
logistic_regression_script="/humgen/gsa-scr1/hanna/src/StingWorking/R/logistic_regression.R"
|
||||
|
||||
def exit(msg,errorlevel):
|
||||
"Exit the program with the specified message and error code."
|
||||
print msg
|
||||
|
|
@ -25,7 +22,7 @@ def read_header(source_file):
|
|||
exit("Input file is in invalid format. First two columns should be <read group>,<dinucleotide>",1)
|
||||
return header
|
||||
|
||||
def create_intermediate_file(source_file,read_group,dinuc):
|
||||
def create_intermediate_file(source_file,header,read_group,dinuc):
|
||||
"Create an intermediate file for a particular read group / dinuc from a given source file"
|
||||
base = source_file.name[:source_file.name.rfind('.csv')]
|
||||
intermediate_filename = "%s.%s.%s.csv" % (base,read_group,dinuc)
|
||||
|
|
@ -43,7 +40,8 @@ def open_target_file(target_filename):
|
|||
target_file.write("\n")
|
||||
return target_file
|
||||
|
||||
def process_file(source_file,read_group,dinuc,target_file):
|
||||
def process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script):
|
||||
"Process the contents of an intermediate file. An intermediate file is the specific data arranged per read group, per dinuc."
|
||||
base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group
|
||||
regression_command = ' '.join((R_exe,logistic_regression_script,base,base,dinuc))
|
||||
print "Running " + regression_command
|
||||
|
|
@ -54,37 +52,47 @@ def process_file(source_file,read_group,dinuc,target_file):
|
|||
parameters_file = open(parameters_filename,'r')
|
||||
parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ')
|
||||
target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n')
|
||||
os.remove('.'.join((base,dinuc,'csv')))
|
||||
os.remove('.'.join((base,dinuc,'parameters')))
|
||||
# optionally remove the input and output files. For now, keep them around for debugging purposes.
|
||||
# os.remove('.'.join((base,dinuc,'csv')))
|
||||
# os.remove('.'.join((base,dinuc,'parameters')))
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)
|
||||
def compute_logistic_regression(source_filename,target_filename,R_exe,logistic_regression_script):
|
||||
"Do the heavy lifting. Group covariant data into individual output files, compute the logistic regression, and format the results."
|
||||
source_file = open_source_file(source_filename)
|
||||
target_file = open_target_file(target_filename)
|
||||
|
||||
source_file = open_source_file(sys.argv[1])
|
||||
target_file = open_target_file(sys.argv[2])
|
||||
header = read_header(source_file)
|
||||
|
||||
header = read_header(source_file)
|
||||
intermediate_file = None
|
||||
read_group = None
|
||||
dinuc = None
|
||||
|
||||
intermediate_file = None
|
||||
read_group = None
|
||||
dinuc = None
|
||||
for data_line in source_file:
|
||||
data_line = data_line.strip()
|
||||
if len(data_line) == 0:
|
||||
continue
|
||||
data = data_line.split(',')
|
||||
if read_group != data[0] or dinuc != data[1]:
|
||||
if intermediate_file:
|
||||
intermediate_file.close()
|
||||
process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
|
||||
read_group,dinuc = data[0:2]
|
||||
intermediate_file = create_intermediate_file(source_file,header,read_group,dinuc)
|
||||
intermediate_file.write(','.join(data[2:])+'\n')
|
||||
|
||||
for data_line in source_file:
|
||||
data_line = data_line.strip()
|
||||
if len(data_line) == 0:
|
||||
continue
|
||||
data = data_line.split(',')
|
||||
if read_group != data[0] or dinuc != data[1]:
|
||||
if intermediate_file:
|
||||
intermediate_file.close()
|
||||
process_file(source_file,read_group,dinuc,target_file)
|
||||
read_group,dinuc = data[0:2]
|
||||
intermediate_file = create_intermediate_file(source_file,read_group,dinuc)
|
||||
intermediate_file.write(','.join(data[2:])+'\n')
|
||||
if intermediate_file:
|
||||
intermediate_file.close()
|
||||
process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
|
||||
|
||||
if intermediate_file:
|
||||
intermediate_file.close()
|
||||
process_file(source_file,read_group,dinuc,target_file)
|
||||
source_file.close()
|
||||
target_file.close()
|
||||
|
||||
source_file.close()
|
||||
target_file.close()
|
||||
if __name__ == "__main__":
|
||||
# set up sensible defaults for the locations of R and the logistic regression script
|
||||
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
||||
logistic_regression_script="resources/logistic_regression.R"
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)
|
||||
|
||||
compute_logistic_regression(sys.argv[1],sys.argv[2],R_exe,logistic_regression_script)
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
# Executable files. Please update these to match the installed locations of your tools.
|
||||
samtools_exe='/seq/dirseq/samtools/current/samtools'
|
||||
java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
|
||||
R_exe='/util/bin/R'
|
||||
GATK_jar='/humgen/gsa-scr1/hanna/src/StingWorking/dist/GenomeAnalysisTK.jar'
|
||||
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
||||
|
||||
# Location of the resource files distributed with the recalibration tool.
|
||||
# Location of the resource files distributed with the recalibration tool.
|
||||
# If editing, please end this variable with a trailing slash.
|
||||
resources='resources/'
|
||||
|
||||
# Where does the reference live?
|
||||
|
|
@ -18,7 +18,12 @@ reference_fai = reference_base + '.fasta.fai'
|
|||
# Where does DBSNP live?
|
||||
dbsnp = resources + 'dbsnp.rod.out'
|
||||
|
||||
# Where are the application files required to run the recalibration
|
||||
gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
||||
logistic_regression_script = resources + 'logistic_regression.R'
|
||||
|
||||
import sys,os
|
||||
import LogisticRegressionByReadGroup
|
||||
|
||||
def exit(msg,errorcode):
|
||||
print msg
|
||||
|
|
@ -50,19 +55,43 @@ check_input_file_available(reference_fai,'reference index file')
|
|||
# check that the dbsnp is available
|
||||
check_input_file_available(dbsnp,'dbsnp file')
|
||||
|
||||
# sanity check that the software is available
|
||||
check_input_file_available(samtools_exe,'samtools')
|
||||
check_input_file_available(java_exe,'java')
|
||||
check_input_file_available(R_exe,'R')
|
||||
check_input_file_available(gatk,'Genome Analysis Toolkit')
|
||||
check_input_file_available(logistic_regression_script,'logistic regression script')
|
||||
|
||||
# make an output directory for temporary files
|
||||
if not os.path.isdir('output'):
|
||||
os.mkdir('output')
|
||||
|
||||
# assemble the required program arguments
|
||||
gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',GATK_jar,'-R',reference,'--DBSNP',dbsnp,'-l INFO','-L chrM'))
|
||||
|
||||
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/output','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
||||
compute_logistic_regression = ' '.join(('python','LogisticRegressionByReadGroup.py','output/output.covariate_counts.csv','output/linear_regression_results.out'))
|
||||
gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO','-L chrM'))
|
||||
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
||||
apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam))
|
||||
index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
|
||||
|
||||
os.system(generate_covariates)
|
||||
os.system(compute_logistic_regression)
|
||||
os.system(apply_logistic_regression)
|
||||
os.system(index_calibrated_bamfile)
|
||||
# generate the covariates
|
||||
print 'generating covariates'
|
||||
returncode = os.system(generate_covariates)
|
||||
if returncode != 0:
|
||||
exit('Unable to generate covariates',1)
|
||||
|
||||
# compute the logistic regression
|
||||
print 'computing the logistic regression'
|
||||
LogisticRegressionByReadGroup.compute_logistic_regression('output/initial.covariate_counts.csv','output/linear_regression_results.out',R_exe,logistic_regression_script)
|
||||
|
||||
# apply the logistic regression, writing the output data to calibrated_bam
|
||||
print 'applying the correction to the reads'
|
||||
returncode = os.system(apply_logistic_regression)
|
||||
if returncode != 0:
|
||||
exit('Unable to apply logistic regression',1)
|
||||
|
||||
# index the calibrated bam
|
||||
print 'indexing the calibrated bam'
|
||||
returncode = os.system(index_calibrated_bamfile)
|
||||
if returncode != 0:
|
||||
exit('Unable to index calibrated bamfile',1)
|
||||
|
||||
print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam
|
||||
|
|
|
|||
Loading…
Reference in New Issue