2009-06-05 06:52:11 +08:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
|
|
# Executable files. Please update these to match the installed locations of your tools.
|
|
|
|
|
samtools_exe='/seq/dirseq/samtools/current/samtools'
|
|
|
|
|
java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
|
2009-06-05 09:26:10 +08:00
|
|
|
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
2009-06-05 06:52:11 +08:00
|
|
|
|
2009-06-05 09:26:10 +08:00
|
|
|
# Location of the resource files distributed with the recalibration tool.
|
|
|
|
|
# If editing, please end this variable with a trailing slash.
|
2009-06-05 06:52:11 +08:00
|
|
|
resources='resources/'
|
|
|
|
|
|
|
|
|
|
# Where does the reference live?
|
|
|
|
|
reference_base = resources + 'Homo_sapiens_assembly18'
|
|
|
|
|
reference = reference_base + '.fasta'
|
|
|
|
|
reference_dict = reference_base + '.dict'
|
|
|
|
|
reference_fai = reference_base + '.fasta.fai'
|
|
|
|
|
|
|
|
|
|
# Where does DBSNP live?
|
|
|
|
|
dbsnp = resources + 'dbsnp.rod.out'
|
|
|
|
|
|
2009-06-05 09:26:10 +08:00
|
|
|
# Where are the application files required to run the recalibration
|
|
|
|
|
gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
|
|
|
|
logistic_regression_script = resources + 'logistic_regression.R'
|
2009-06-06 05:04:01 +08:00
|
|
|
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
|
2009-06-05 09:26:10 +08:00
|
|
|
|
2009-06-06 05:04:01 +08:00
|
|
|
import glob,os,sys
|
2009-06-05 09:26:10 +08:00
|
|
|
import LogisticRegressionByReadGroup
|
2009-06-05 06:52:11 +08:00
|
|
|
|
|
|
|
|
def exit(msg,errorcode):
|
|
|
|
|
print msg
|
|
|
|
|
sys.exit(errorcode)
|
|
|
|
|
|
|
|
|
|
def check_input_file_available(filename,description):
|
|
|
|
|
if not os.access(filename,os.R_OK):
|
|
|
|
|
exit('Unable to access %s %s' % (description,filename),1)
|
|
|
|
|
|
2009-06-06 05:04:01 +08:00
|
|
|
def graph_file(graph_script,graph_data):
|
|
|
|
|
'Graph the given data using the given script. Leave the data in the output directory.'
|
|
|
|
|
check_input_file_available(graph_script,'%s R graphing script' % graph_script)
|
|
|
|
|
check_input_file_available(graph_data,'%s graphing data' % graph_data)
|
|
|
|
|
result = os.system(' '.join((R_exe,graph_script,graph_data)))
|
|
|
|
|
if result != 0:
|
2009-06-06 11:40:50 +08:00
|
|
|
exit('Unable to graph data: %s' % graph_data,1)
|
2009-06-06 05:04:01 +08:00
|
|
|
|
|
|
|
|
def recalibrate():
|
|
|
|
|
'Recalibrate the given bam file'
|
|
|
|
|
# generate the covariates
|
|
|
|
|
print 'generating covariates'
|
|
|
|
|
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
|
|
|
|
returncode = os.system(generate_covariates)
|
|
|
|
|
if returncode != 0:
|
|
|
|
|
exit('Unable to generate covariates',1)
|
|
|
|
|
|
|
|
|
|
# compute the logistic regression
|
|
|
|
|
print 'computing the logistic regression'
|
|
|
|
|
LogisticRegressionByReadGroup.compute_logistic_regression('output/initial.covariate_counts.csv','output/linear_regression_results.out',R_exe,logistic_regression_script)
|
|
|
|
|
|
|
|
|
|
# apply the logistic regression, writing the output data to calibrated_bam
|
|
|
|
|
print 'applying the correction to the reads'
|
|
|
|
|
apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam))
|
|
|
|
|
returncode = os.system(apply_logistic_regression)
|
|
|
|
|
if returncode != 0:
|
|
|
|
|
exit('Unable to apply logistic regression',1)
|
|
|
|
|
|
|
|
|
|
# index the calibrated bam
|
|
|
|
|
print 'indexing the calibrated bam'
|
|
|
|
|
index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
|
|
|
|
|
returncode = os.system(index_calibrated_bamfile)
|
|
|
|
|
if returncode != 0:
|
|
|
|
|
exit('Unable to index calibrated bamfile',1)
|
|
|
|
|
|
|
|
|
|
print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam
|
|
|
|
|
|
|
|
|
|
def evaluate():
|
|
|
|
|
'Evaluate recalibration results.'
|
|
|
|
|
print 'Evaluating recalibration results'
|
|
|
|
|
# regenerate the covariates
|
|
|
|
|
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT output/recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
|
|
|
|
print 'regenerating covariates'
|
|
|
|
|
returncode = os.system(regenerate_covariates)
|
|
|
|
|
if returncode != 0:
|
|
|
|
|
exit('Unable to regenerate covariates',1)
|
|
|
|
|
|
|
|
|
|
print 'graphing initial results'
|
|
|
|
|
for filename in glob.glob('output/initial.*.empirical_v_reported_quality.csv'):
|
|
|
|
|
graph_file(empirical_vs_reported_grapher,filename)
|
|
|
|
|
print 'graphing final results'
|
|
|
|
|
for filename in glob.glob('output/recalibrated.*.empirical_v_reported_quality.csv'):
|
|
|
|
|
graph_file(empirical_vs_reported_grapher,filename)
|
|
|
|
|
|
2009-06-05 06:52:11 +08:00
|
|
|
if len(sys.argv) < 3:
|
2009-06-06 05:04:01 +08:00
|
|
|
exit('Usage: python RecalQual.py <input bam file> <calibrated output bam file> [{RECALIBRATE | EVALUATE | RECALIBRATE_AND_EVALUATE}]',1)
|
|
|
|
|
|
|
|
|
|
operation = 'RECALIBRATE'
|
|
|
|
|
if len(sys.argv) == 4:
|
|
|
|
|
operation = sys.argv[3]
|
|
|
|
|
|
|
|
|
|
if operation not in ['RECALIBRATE','EVALUATE','RECALIBRATE_AND_EVALUATE']:
|
|
|
|
|
exit('Operation %s not recognized. Operation must be RECALIBRATE, EVALUATE, or RECALIBRATE_AND_EVALUATE' % operation,1)
|
2009-06-05 06:52:11 +08:00
|
|
|
|
|
|
|
|
# check that the input bam file exists, and that the bam is indexed.
|
|
|
|
|
bam = sys.argv[1]
|
|
|
|
|
bam_index = bam + '.bai'
|
|
|
|
|
|
|
|
|
|
check_input_file_available(bam,'reads file')
|
|
|
|
|
check_input_file_available(bam_index,'reads index file')
|
|
|
|
|
|
|
|
|
|
# parse the user's calibration output file requirements
|
|
|
|
|
calibrated_bam = sys.argv[2]
|
|
|
|
|
calibrated_bam_index = calibrated_bam + '.bai'
|
|
|
|
|
|
|
|
|
|
# check that the fasta and supporting files are available
|
|
|
|
|
check_input_file_available(reference,'reference file')
|
|
|
|
|
check_input_file_available(reference_dict,'reference dictionary')
|
|
|
|
|
check_input_file_available(reference_fai,'reference index file')
|
|
|
|
|
|
|
|
|
|
# check that the dbsnp is available
|
|
|
|
|
check_input_file_available(dbsnp,'dbsnp file')
|
|
|
|
|
|
2009-06-05 09:26:10 +08:00
|
|
|
# sanity check that the software is available
|
|
|
|
|
check_input_file_available(samtools_exe,'samtools')
|
|
|
|
|
check_input_file_available(java_exe,'java')
|
|
|
|
|
check_input_file_available(R_exe,'R')
|
|
|
|
|
check_input_file_available(gatk,'Genome Analysis Toolkit')
|
|
|
|
|
check_input_file_available(logistic_regression_script,'logistic regression script')
|
|
|
|
|
|
2009-06-05 06:52:11 +08:00
|
|
|
# make an output directory for temporary files
|
|
|
|
|
if not os.path.isdir('output'):
|
|
|
|
|
os.mkdir('output')
|
|
|
|
|
|
|
|
|
|
# assemble the required program arguments
|
2009-06-05 09:53:48 +08:00
|
|
|
gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO'))
|
2009-06-06 05:04:01 +08:00
|
|
|
|
|
|
|
|
if operation.find('RECALIBRATE') != -1:
|
|
|
|
|
recalibrate()
|
|
|
|
|
|
|
|
|
|
if operation.find('EVALUATE') != -1:
|
|
|
|
|
evaluate()
|