From 050d55cdb0e936bf777157d80ee3faaf13f4691a Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 5 Jun 2009 21:04:01 +0000 Subject: [PATCH] Basic graph support for testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@916 348d0f76-0448-11de-a6fe-93d51630548a --- python/RecalQual.py | 95 ++++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/python/RecalQual.py b/python/RecalQual.py index 20f5ea540..94334e2c7 100755 --- a/python/RecalQual.py +++ b/python/RecalQual.py @@ -21,8 +21,9 @@ dbsnp = resources + 'dbsnp.rod.out' # Where are the application files required to run the recalibration gatk = resources + 'gatk/GenomeAnalysisTK.jar' logistic_regression_script = resources + 'logistic_regression.R' +empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R' -import sys,os +import glob,os,sys import LogisticRegressionByReadGroup def exit(msg,errorcode): @@ -33,8 +34,69 @@ def check_input_file_available(filename,description): if not os.access(filename,os.R_OK): exit('Unable to access %s %s' % (description,filename),1) +def graph_file(graph_script,graph_data): + 'Graph the given data using the given script. Leave the data in the output directory.' + check_input_file_available(graph_script,'%s R graphing script' % graph_script) + check_input_file_available(graph_data,'%s graphing data' % graph_data) + result = os.system(' '.join((R_exe,graph_script,graph_data))) + if result != 0: + exit('Unable to graph data: %s' % filename) + +def recalibrate(): + 'Recalibrate the given bam file' + # generate the covariates + print 'generating covariates' + generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1')) + returncode = os.system(generate_covariates) + if returncode != 0: + exit('Unable to generate covariates',1) + + # compute the logistic regression + print 'computing the logistic regression' + LogisticRegressionByReadGroup.compute_logistic_regression('output/initial.covariate_counts.csv','output/linear_regression_results.out',R_exe,logistic_regression_script) + + # apply the logistic regression, writing the output data to calibrated_bam + print 'applying the correction to the reads' + apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam)) + returncode = os.system(apply_logistic_regression) + if returncode != 0: + exit('Unable to apply logistic regression',1) + + # index the calibrated bam + print 'indexing the calibrated bam' + index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam)) + returncode = os.system(index_calibrated_bamfile) + if returncode != 0: + exit('Unable to index calibrated bamfile',1) + + print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam + +def evaluate(): + 'Evaluate recalibration results.' + print 'Evaluating recalibration results' + # regenerate the covariates + regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT output/recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1')) + print 'regenerating covariates' + returncode = os.system(regenerate_covariates) + if returncode != 0: + exit('Unable to regenerate covariates',1) + + print 'graphing initial results' + for filename in glob.glob('output/initial.*.empirical_v_reported_quality.csv'): + graph_file(empirical_vs_reported_grapher,filename) + print 'graphing final results' + for filename in glob.glob('output/recalibrated.*.empirical_v_reported_quality.csv'): + graph_file(empirical_vs_reported_grapher,filename) + if len(sys.argv) < 3: - exit('Usage: python RecalQual.py ',1) + exit('Usage: python RecalQual.py [{RECALIBRATE | EVALUATE | RECALIBRATE_AND_EVALUATE}]',1) + +operation = 'RECALIBRATE' +if len(sys.argv) == 4: + operation = sys.argv[3] + +if operation not in ['RECALIBRATE','EVALUATE','RECALIBRATE_AND_EVALUATE']: + exit('Operation %s not recognized. Operation must be RECALIBRATE, EVALUATE, or RECALIBRATE_AND_EVALUATE' % operation,1) # check that the input bam file exists, and that the bam is indexed. bam = sys.argv[1] @@ -68,30 +130,9 @@ if not os.path.isdir('output'): # assemble the required program arguments gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO')) -generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1')) -apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam)) -index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam)) -# generate the covariates -print 'generating covariates' -returncode = os.system(generate_covariates) -if returncode != 0: - exit('Unable to generate covariates',1) +if operation.find('RECALIBRATE') != -1: + recalibrate() -# compute the logistic regression -print 'computing the logistic regression' -LogisticRegressionByReadGroup.compute_logistic_regression('output/initial.covariate_counts.csv','output/linear_regression_results.out',R_exe,logistic_regression_script) - -# apply the logistic regression, writing the output data to calibrated_bam -print 'applying the correction to the reads' -returncode = os.system(apply_logistic_regression) -if returncode != 0: - exit('Unable to apply logistic regression',1) - -# index the calibrated bam -print 'indexing the calibrated bam' -returncode = os.system(index_calibrated_bamfile) -if returncode != 0: - exit('Unable to index calibrated bamfile',1) - -print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam +if operation.find('EVALUATE') != -1: + evaluate()