gatk-3.8/python/RecalQual.py

#!/usr/bin/env python

# Executable files.  Please update these to match the installed locations of your tools.
samtools_exe='/seq/dirseq/samtools/current/samtools'
java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"

# Location of the resource files distributed with the recalibration tool.  
# If editing, please end this variable with a trailing slash.
resources='resources/'

# Where does the reference live?
reference_base = resources + 'Homo_sapiens_assembly18'
reference      = reference_base + '.fasta'
reference_dict = reference_base + '.dict'
reference_fai  = reference_base + '.fasta.fai'

# Where does DBSNP live?
dbsnp = resources + 'dbsnp.rod.out'

# Where are the application files required to run the recalibration
gatk = resources + 'gatk/GenomeAnalysisTK.jar'
logistic_regression_script = resources + 'logistic_regression.R'
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'

import glob,os,sys
import LogisticRegressionByReadGroup

def exit(msg,errorcode):
    print msg
    sys.exit(errorcode)

def check_input_file_available(filename,description):
    if not os.access(filename,os.R_OK):
        exit('Unable to access %s %s' % (description,filename),1)

def graph_file(graph_script,graph_data):
    'Graph the given data using the given script.  Leave the data in the output directory.'
    check_input_file_available(graph_script,'%s R graphing script' % graph_script)
    check_input_file_available(graph_data,'%s graphing data' % graph_data)
    result = os.system(' '.join((R_exe,graph_script,graph_data)))
    if result != 0:
        exit('Unable to graph data: %s' % graph_data,1)

def recalibrate():
    'Recalibrate the given bam file'
    # generate the covariates
    print 'generating covariates'
    generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
    returncode = os.system(generate_covariates)
    if returncode != 0:
        exit('Unable to generate covariates',1)

    # compute the logistic regression
    print 'computing the logistic regression'
    LogisticRegressionByReadGroup.compute_logistic_regression('output/initial.covariate_counts.csv','output/linear_regression_results.out',R_exe,logistic_regression_script)

    # apply the logistic regression, writing the output data to calibrated_bam
    print 'applying the correction to the reads'
    apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam))
    returncode = os.system(apply_logistic_regression)
    if returncode != 0:
        exit('Unable to apply logistic regression',1)

    # index the calibrated bam
    print 'indexing the calibrated bam'
    index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
    returncode = os.system(index_calibrated_bamfile)
    if returncode != 0:
        exit('Unable to index calibrated bamfile',1)

    print 'Recalibration complete!  Calibrated bam is available here: ' + calibrated_bam    

def evaluate():
    'Evaluate recalibration results.'
    print 'Evaluating recalibration results'
    # regenerate the covariates
    regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT output/recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
    print 'regenerating covariates'
    returncode = os.system(regenerate_covariates)
    if returncode != 0:
        exit('Unable to regenerate covariates',1)

    print 'graphing initial results'
    for filename in glob.glob('output/initial.*.empirical_v_reported_quality.csv'):
        graph_file(empirical_vs_reported_grapher,filename)
    print 'graphing final results'
    for filename in glob.glob('output/recalibrated.*.empirical_v_reported_quality.csv'):
        graph_file(empirical_vs_reported_grapher,filename)

if len(sys.argv) < 3:
    exit('Usage: python RecalQual.py <input bam file> <calibrated output bam file> [{RECALIBRATE | EVALUATE | RECALIBRATE_AND_EVALUATE}]',1)

operation = 'RECALIBRATE'
if len(sys.argv) == 4:
    operation = sys.argv[3]

if operation not in ['RECALIBRATE','EVALUATE','RECALIBRATE_AND_EVALUATE']:
    exit('Operation %s not recognized.  Operation must be RECALIBRATE, EVALUATE, or RECALIBRATE_AND_EVALUATE' % operation,1)

# check that the input bam file exists, and that the bam is indexed.
bam = sys.argv[1]
bam_index = bam + '.bai'

check_input_file_available(bam,'reads file')
check_input_file_available(bam_index,'reads index file')

# parse the user's calibration output file requirements
calibrated_bam = sys.argv[2]
calibrated_bam_index = calibrated_bam + '.bai'

# check that the fasta and supporting files are available
check_input_file_available(reference,'reference file')
check_input_file_available(reference_dict,'reference dictionary')
check_input_file_available(reference_fai,'reference index file')

# check that the dbsnp is available
check_input_file_available(dbsnp,'dbsnp file')

# sanity check that the software is available
check_input_file_available(samtools_exe,'samtools')
check_input_file_available(java_exe,'java')
check_input_file_available(R_exe,'R')
check_input_file_available(gatk,'Genome Analysis Toolkit')
check_input_file_available(logistic_regression_script,'logistic regression script')

# make an output directory for temporary files
if not os.path.isdir('output'):
    os.mkdir('output')

# assemble the required program arguments
gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO'))

if operation.find('RECALIBRATE') != -1:
    recalibrate()

if operation.find('EVALUATE') != -1:
    evaluate()
Very early draft of script integrating the covariant counting / logistic regression. Deleted some unused code and spurious debug info. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@902 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 06:52:11 +08:00			`#!/usr/bin/env python`

			`# Executable files. Please update these to match the installed locations of your tools.`
			`samtools_exe='/seq/dirseq/samtools/current/samtools'`
			`java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'`
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"`
Very early draft of script integrating the covariant counting / logistic regression. Deleted some unused code and spurious debug info. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@902 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 06:52:11 +08:00
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`# Location of the resource files distributed with the recalibration tool.`
			`# If editing, please end this variable with a trailing slash.`
Very early draft of script integrating the covariant counting / logistic regression. Deleted some unused code and spurious debug info. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@902 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 06:52:11 +08:00			`resources='resources/'`

			`# Where does the reference live?`
			`reference_base = resources + 'Homo_sapiens_assembly18'`
			`reference = reference_base + '.fasta'`
			`reference_dict = reference_base + '.dict'`
			`reference_fai = reference_base + '.fasta.fai'`

			`# Where does DBSNP live?`
			`dbsnp = resources + 'dbsnp.rod.out'`

Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`# Where are the application files required to run the recalibration`
			`gatk = resources + 'gatk/GenomeAnalysisTK.jar'`
			`logistic_regression_script = resources + 'logistic_regression.R'`
Basic graph support for testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@916 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-06 05:04:01 +08:00			`empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'`
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00
Basic graph support for testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@916 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-06 05:04:01 +08:00			`import glob,os,sys`
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`import LogisticRegressionByReadGroup`
Very early draft of script integrating the covariant counting / logistic regression. Deleted some unused code and spurious debug info. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@902 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 06:52:11 +08:00
			`def exit(msg,errorcode):`
			`print msg`
			`sys.exit(errorcode)`

			`def check_input_file_available(filename,description):`
			`if not os.access(filename,os.R_OK):`
			`exit('Unable to access %s %s' % (description,filename),1)`

Basic graph support for testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@916 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-06 05:04:01 +08:00			`def graph_file(graph_script,graph_data):`
			`'Graph the given data using the given script. Leave the data in the output directory.'`
			`check_input_file_available(graph_script,'%s R graphing script' % graph_script)`
			`check_input_file_available(graph_data,'%s graphing data' % graph_data)`
			`result = os.system(' '.join((R_exe,graph_script,graph_data)))`
			`if result != 0:`
Tighten up error handling a bit. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@920 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-06 11:40:50 +08:00			`exit('Unable to graph data: %s' % graph_data,1)`
Basic graph support for testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@916 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-06 05:04:01 +08:00
			`def recalibrate():`
			`'Recalibrate the given bam file'`
			`# generate the covariates`
			`print 'generating covariates'`
			`generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))`
			`returncode = os.system(generate_covariates)`
			`if returncode != 0:`
			`exit('Unable to generate covariates',1)`

			`# compute the logistic regression`
			`print 'computing the logistic regression'`
			`LogisticRegressionByReadGroup.compute_logistic_regression('output/initial.covariate_counts.csv','output/linear_regression_results.out',R_exe,logistic_regression_script)`

			`# apply the logistic regression, writing the output data to calibrated_bam`
			`print 'applying the correction to the reads'`
			`apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam))`
			`returncode = os.system(apply_logistic_regression)`
			`if returncode != 0:`
			`exit('Unable to apply logistic regression',1)`

			`# index the calibrated bam`
			`print 'indexing the calibrated bam'`
			`index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))`
			`returncode = os.system(index_calibrated_bamfile)`
			`if returncode != 0:`
			`exit('Unable to index calibrated bamfile',1)`

			`print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam`

			`def evaluate():`
			`'Evaluate recalibration results.'`
			`print 'Evaluating recalibration results'`
			`# regenerate the covariates`
			`regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT output/recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))`
			`print 'regenerating covariates'`
			`returncode = os.system(regenerate_covariates)`
			`if returncode != 0:`
			`exit('Unable to regenerate covariates',1)`

			`print 'graphing initial results'`
			`for filename in glob.glob('output/initial.*.empirical_v_reported_quality.csv'):`
			`graph_file(empirical_vs_reported_grapher,filename)`
			`print 'graphing final results'`
			`for filename in glob.glob('output/recalibrated.*.empirical_v_reported_quality.csv'):`
			`graph_file(empirical_vs_reported_grapher,filename)`

Very early draft of script integrating the covariant counting / logistic regression. Deleted some unused code and spurious debug info. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@902 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 06:52:11 +08:00			`if len(sys.argv) < 3:`
Basic graph support for testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@916 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-06 05:04:01 +08:00			`exit('Usage: python RecalQual.py <input bam file> <calibrated output bam file> [{RECALIBRATE \| EVALUATE \| RECALIBRATE_AND_EVALUATE}]',1)`

			`operation = 'RECALIBRATE'`
			`if len(sys.argv) == 4:`
			`operation = sys.argv[3]`

			`if operation not in ['RECALIBRATE','EVALUATE','RECALIBRATE_AND_EVALUATE']:`
			`exit('Operation %s not recognized. Operation must be RECALIBRATE, EVALUATE, or RECALIBRATE_AND_EVALUATE' % operation,1)`
Very early draft of script integrating the covariant counting / logistic regression. Deleted some unused code and spurious debug info. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@902 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 06:52:11 +08:00
			`# check that the input bam file exists, and that the bam is indexed.`
			`bam = sys.argv[1]`
			`bam_index = bam + '.bai'`

			`check_input_file_available(bam,'reads file')`
			`check_input_file_available(bam_index,'reads index file')`

			`# parse the user's calibration output file requirements`
			`calibrated_bam = sys.argv[2]`
			`calibrated_bam_index = calibrated_bam + '.bai'`

			`# check that the fasta and supporting files are available`
			`check_input_file_available(reference,'reference file')`
			`check_input_file_available(reference_dict,'reference dictionary')`
			`check_input_file_available(reference_fai,'reference index file')`

			`# check that the dbsnp is available`
			`check_input_file_available(dbsnp,'dbsnp file')`

Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`# sanity check that the software is available`
			`check_input_file_available(samtools_exe,'samtools')`
			`check_input_file_available(java_exe,'java')`
			`check_input_file_available(R_exe,'R')`
			`check_input_file_available(gatk,'Genome Analysis Toolkit')`
			`check_input_file_available(logistic_regression_script,'logistic regression script')`

Very early draft of script integrating the covariant counting / logistic regression. Deleted some unused code and spurious debug info. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@902 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 06:52:11 +08:00			`# make an output directory for temporary files`
			`if not os.path.isdir('output'):`
			`os.mkdir('output')`

			`# assemble the required program arguments`
Revert some debug code in RecalQual.py. Make LogisticRegression easier to Ctrl-C out of. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@904 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:53:48 +08:00			`gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO'))`
Basic graph support for testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@916 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-06 05:04:01 +08:00
			`if operation.find('RECALIBRATE') != -1:`
			`recalibrate()`

			`if operation.find('EVALUATE') != -1:`
			`evaluate()`