Now supports resume and dry runningRecalQual.py

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@996 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-11 23:31:59 +00:00
parent 4eda040e0f
commit 1fb241a8b8
2 changed files with 96 additions and 38 deletions

View File

@ -15,13 +15,25 @@ def geli2dbsnpFile(geli):
root, flowcellDotlane, ext = picard_utils.splitPath(geli) root, flowcellDotlane, ext = picard_utils.splitPath(geli)
return os.path.join(root, flowcellDotlane) + '.dbsnp_matches' return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
def SingleSampleGenotyperCmd(bam, geli, use2bp):
naid = bam.split(".")[0]
metrics = geli + '.metrics'
gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar'
hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff'
targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list'
cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp.rod.out', '--hapmap_chip', hapmapChip, '-calls', geli, '-met', metrics, '-geli -fb -l INFO'])
return cmd
def bams2geli(bams): def bams2geli(bams):
def call1(bam): def call1(bam):
geli = os.path.splitext(bam)[0] + '.geli' geli = os.path.splitext(bam)[0] + '.geli'
jobid = 0 jobid = 0
if not os.path.exists(geli): if OPTIONS.useSSG:
cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling()) if not os.path.exists(geli + '.calls'):
cmd = SingleSampleGenotyperCmd(bam, geli + '.calls', OPTIONS.useSSG2b)
else:
if not os.path.exists(geli):
cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry ) jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry )
return geli, jobid return geli, jobid
calls = map(call1, bams) calls = map(call1, bams)
@ -44,6 +56,12 @@ def main():
parser.add_option("", "--dry", dest="dry", parser.add_option("", "--dry", dest="dry",
action='store_true', default=False, action='store_true', default=False,
help="If provided, nothing actually gets run, just a dry run") help="If provided, nothing actually gets run, just a dry run")
parser.add_option("", "--ssg", dest="useSSG",
action='store_true', default=False,
help="If provided, we'll use the GATK SSG for genotyping")
parser.add_option("", "--ssg2b", dest="useSSG2b",
action='store_true', default=False,
help="If provided, we'll use 2bp enabled GATK SSG ")
parser.add_option("-o", "--output", dest="output", parser.add_option("-o", "--output", dest="output",
type="string", default='/dev/stdout', type="string", default='/dev/stdout',
help="x") help="x")

View File

@ -18,6 +18,7 @@ output_root = './'
# Location of the resource files distributed with the recalibration tool. # Location of the resource files distributed with the recalibration tool.
# If editing, please end this variable with a trailing slash. # If editing, please end this variable with a trailing slash.
resources='resources/' resources='resources/'
#resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/'
import getopt,glob,os,sys import getopt,glob,os,sys
import LogisticRegressionByReadGroup import LogisticRegressionByReadGroup
@ -39,12 +40,29 @@ dbsnp = resources + 'dbsnp.1kg.rod.out'
# Where are the application files required to run the recalibration? # Where are the application files required to run the recalibration?
gatk = resources + 'gatk/GenomeAnalysisTK.jar' gatk = resources + 'gatk/GenomeAnalysisTK.jar'
#gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
logistic_regression_script = resources + 'logistic_regression.R' logistic_regression_script = resources + 'logistic_regression.R'
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R' empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
# Assemble the platform list into command-line arguments. # Assemble the platform list into command-line arguments.
platform_args = ' '.join(['-pl %s' % platform for platform in platforms]) platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
def output_file_needs_update(output_file, input_files = []):
if reprocessFiles: # we are always reprocessing files
return True
if not os.path.exists(output_file): # the file doesn't exist
return True
else:
return False
def executeCommand(cmd):
if not dryRun:
return os.system(cmd)
else:
print ' => Would execute', cmd
return 0
def exit(msg,errorcode): def exit(msg,errorcode):
print msg print msg
sys.exit(errorcode) sys.exit(errorcode)
@ -57,7 +75,7 @@ def graph_file(graph_script,graph_data):
'Graph the given data using the given script. Leave the data in the output directory.' 'Graph the given data using the given script. Leave the data in the output directory.'
check_input_file_available(graph_script,'%s R graphing script' % graph_script) check_input_file_available(graph_script,'%s R graphing script' % graph_script)
check_input_file_available(graph_data,'%s graphing data' % graph_data) check_input_file_available(graph_data,'%s graphing data' % graph_data)
result = os.system(' '.join((R_exe,graph_script,graph_data))) result = executeCommand(' '.join((R_exe,graph_script,graph_data)))
if result != 0: if result != 0:
exit('Unable to graph data: %s' % graph_data,1) exit('Unable to graph data: %s' % graph_data,1)
@ -65,54 +83,70 @@ def recalibrate():
'Recalibrate the given bam file' 'Recalibrate the given bam file'
# generate the covariates # generate the covariates
print 'generating covariates' print 'generating covariates'
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args)) linear_regression_results = output_dir + 'linear_regression_results.out'
returncode = os.system(generate_covariates) if output_file_needs_update(linear_regression_results):
if returncode != 0: generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
exit('Unable to generate covariates',1) returncode = executeCommand(generate_covariates)
if returncode != 0:
exit('Unable to generate covariates',1)
# compute the logistic regression # compute the logistic regression
print 'computing the logistic regression' if not dryRun:
LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',output_dir + 'linear_regression_results.out',R_exe,logistic_regression_script) print 'computing the logistic regression'
LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',linear_regression_results,R_exe,logistic_regression_script)
else:
print 'Logistic recalibration data files already generated', linear_regression_results
# apply the logistic regression, writing the output data to calibrated_bam if output_file_needs_update(calibrated_bam):
print 'applying the correction to the reads' # apply the logistic regression, writing the output data to calibrated_bam
apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam)) print 'applying the correction to the reads'
returncode = os.system(apply_logistic_regression) apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam))
if returncode != 0: returncode = executeCommand(apply_logistic_regression)
exit('Unable to apply logistic regression',1) if returncode != 0:
exit('Unable to apply logistic regression',1)
else:
print 'Recalibrated BAM file already generated', calibrated_bam
# index the calibrated bam # index the calibrated bam
print 'indexing the calibrated bam' if output_file_needs_update(calibrated_bam + '.bai'):
index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam)) print 'indexing the calibrated bam'
returncode = os.system(index_calibrated_bamfile) index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
if returncode != 0: returncode = executeCommand(index_calibrated_bamfile)
exit('Unable to index calibrated bamfile',1) if returncode != 0:
exit('Unable to index calibrated bamfile',1)
else:
print 'Recalibrated BAM index file already generated', calibrated_bam + '.bai'
print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam
def evaluate(): def evaluate():
'Evaluate recalibration results.' 'Evaluate recalibration results.'
print 'Evaluating recalibration results' print 'Evaluating recalibration results'
# regenerate the covariates
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
print 'regenerating covariates'
returncode = os.system(regenerate_covariates)
if returncode != 0:
exit('Unable to regenerate covariates',1)
print 'graphing initial results' recalibrated_regression_results = output_dir + 'recalibrated' + 'linear_regression_results.out'
for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'): if output_file_needs_update(recalibrated_regression_results):
graph_file(empirical_vs_reported_grapher,filename) # regenerate the covariates
print 'graphing final results' regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'): print 'regenerating covariates'
graph_file(empirical_vs_reported_grapher,filename) returncode = executeCommand(regenerate_covariates)
if returncode != 0:
exit('Unable to regenerate covariates',1)
print 'graphing initial results'
for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'):
graph_file(empirical_vs_reported_grapher,filename)
print 'graphing final results'
for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'):
graph_file(empirical_vs_reported_grapher,filename)
else:
print 'Evaluated files already generated', recalibrated_regression_results
def usage(): def usage():
exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] <input bam file> <calibrated output bam file>',1) exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] <input bam file> <calibrated output bam file>',1)
# Try to parse the given command-line arguments. # Try to parse the given command-line arguments.
try: try:
opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate']) opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry'])
except getopt.GetoptError, err: except getopt.GetoptError, err:
usage() usage()
@ -123,11 +157,17 @@ if len(args) < 2:
# Determine whether to evaluate / recalibrate. # Determine whether to evaluate / recalibrate.
recalibrate_requested = False recalibrate_requested = False
evaluate_requested = False evaluate_requested = False
reprocessFiles = False
dryRun = False
for opt,arg in opts: for opt,arg in opts:
if opt == '--recalibrate': if opt == '--recalibrate':
recalibrate_requested = True recalibrate_requested = True
if opt == '--evaluate': if opt == '--evaluate':
evaluate_requested = True evaluate_requested = True
if opt == '--reprocess':
reprocessFiles = True
if opt == '--dry':
dryRun = True
# Default to 'recalibrate' unless the user specified only evaluate. # Default to 'recalibrate' unless the user specified only evaluate.
do_recalibration = not (evaluate_requested and not recalibrate_requested) do_recalibration = not (evaluate_requested and not recalibrate_requested)