Now supports resume and dry runningRecalQual.py
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@996 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4eda040e0f
commit
1fb241a8b8
|
|
@ -15,11 +15,23 @@ def geli2dbsnpFile(geli):
|
||||||
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
||||||
return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||||
|
|
||||||
|
def SingleSampleGenotyperCmd(bam, geli, use2bp):
|
||||||
|
naid = bam.split(".")[0]
|
||||||
|
metrics = geli + '.metrics'
|
||||||
|
gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar'
|
||||||
|
hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff'
|
||||||
|
targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list'
|
||||||
|
cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp.rod.out', '--hapmap_chip', hapmapChip, '-calls', geli, '-met', metrics, '-geli -fb -l INFO'])
|
||||||
|
return cmd
|
||||||
|
|
||||||
def bams2geli(bams):
|
def bams2geli(bams):
|
||||||
def call1(bam):
|
def call1(bam):
|
||||||
geli = os.path.splitext(bam)[0] + '.geli'
|
geli = os.path.splitext(bam)[0] + '.geli'
|
||||||
jobid = 0
|
jobid = 0
|
||||||
|
if OPTIONS.useSSG:
|
||||||
|
if not os.path.exists(geli + '.calls'):
|
||||||
|
cmd = SingleSampleGenotyperCmd(bam, geli + '.calls', OPTIONS.useSSG2b)
|
||||||
|
else:
|
||||||
if not os.path.exists(geli):
|
if not os.path.exists(geli):
|
||||||
cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
|
cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
|
||||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry )
|
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry )
|
||||||
|
|
@ -44,6 +56,12 @@ def main():
|
||||||
parser.add_option("", "--dry", dest="dry",
|
parser.add_option("", "--dry", dest="dry",
|
||||||
action='store_true', default=False,
|
action='store_true', default=False,
|
||||||
help="If provided, nothing actually gets run, just a dry run")
|
help="If provided, nothing actually gets run, just a dry run")
|
||||||
|
parser.add_option("", "--ssg", dest="useSSG",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="If provided, we'll use the GATK SSG for genotyping")
|
||||||
|
parser.add_option("", "--ssg2b", dest="useSSG2b",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="If provided, we'll use 2bp enabled GATK SSG ")
|
||||||
parser.add_option("-o", "--output", dest="output",
|
parser.add_option("-o", "--output", dest="output",
|
||||||
type="string", default='/dev/stdout',
|
type="string", default='/dev/stdout',
|
||||||
help="x")
|
help="x")
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,7 @@ output_root = './'
|
||||||
# Location of the resource files distributed with the recalibration tool.
|
# Location of the resource files distributed with the recalibration tool.
|
||||||
# If editing, please end this variable with a trailing slash.
|
# If editing, please end this variable with a trailing slash.
|
||||||
resources='resources/'
|
resources='resources/'
|
||||||
|
#resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/'
|
||||||
|
|
||||||
import getopt,glob,os,sys
|
import getopt,glob,os,sys
|
||||||
import LogisticRegressionByReadGroup
|
import LogisticRegressionByReadGroup
|
||||||
|
|
@ -39,12 +40,29 @@ dbsnp = resources + 'dbsnp.1kg.rod.out'
|
||||||
|
|
||||||
# Where are the application files required to run the recalibration?
|
# Where are the application files required to run the recalibration?
|
||||||
gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
||||||
|
#gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
|
||||||
|
|
||||||
logistic_regression_script = resources + 'logistic_regression.R'
|
logistic_regression_script = resources + 'logistic_regression.R'
|
||||||
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
|
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
|
||||||
|
|
||||||
# Assemble the platform list into command-line arguments.
|
# Assemble the platform list into command-line arguments.
|
||||||
platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
|
platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
|
||||||
|
|
||||||
|
def output_file_needs_update(output_file, input_files = []):
|
||||||
|
if reprocessFiles: # we are always reprocessing files
|
||||||
|
return True
|
||||||
|
if not os.path.exists(output_file): # the file doesn't exist
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def executeCommand(cmd):
|
||||||
|
if not dryRun:
|
||||||
|
return os.system(cmd)
|
||||||
|
else:
|
||||||
|
print ' => Would execute', cmd
|
||||||
|
return 0
|
||||||
|
|
||||||
def exit(msg,errorcode):
|
def exit(msg,errorcode):
|
||||||
print msg
|
print msg
|
||||||
sys.exit(errorcode)
|
sys.exit(errorcode)
|
||||||
|
|
@ -57,7 +75,7 @@ def graph_file(graph_script,graph_data):
|
||||||
'Graph the given data using the given script. Leave the data in the output directory.'
|
'Graph the given data using the given script. Leave the data in the output directory.'
|
||||||
check_input_file_available(graph_script,'%s R graphing script' % graph_script)
|
check_input_file_available(graph_script,'%s R graphing script' % graph_script)
|
||||||
check_input_file_available(graph_data,'%s graphing data' % graph_data)
|
check_input_file_available(graph_data,'%s graphing data' % graph_data)
|
||||||
result = os.system(' '.join((R_exe,graph_script,graph_data)))
|
result = executeCommand(' '.join((R_exe,graph_script,graph_data)))
|
||||||
if result != 0:
|
if result != 0:
|
||||||
exit('Unable to graph data: %s' % graph_data,1)
|
exit('Unable to graph data: %s' % graph_data,1)
|
||||||
|
|
||||||
|
|
@ -65,38 +83,52 @@ def recalibrate():
|
||||||
'Recalibrate the given bam file'
|
'Recalibrate the given bam file'
|
||||||
# generate the covariates
|
# generate the covariates
|
||||||
print 'generating covariates'
|
print 'generating covariates'
|
||||||
|
linear_regression_results = output_dir + 'linear_regression_results.out'
|
||||||
|
if output_file_needs_update(linear_regression_results):
|
||||||
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
|
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
|
||||||
returncode = os.system(generate_covariates)
|
returncode = executeCommand(generate_covariates)
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
exit('Unable to generate covariates',1)
|
exit('Unable to generate covariates',1)
|
||||||
|
|
||||||
# compute the logistic regression
|
# compute the logistic regression
|
||||||
|
if not dryRun:
|
||||||
print 'computing the logistic regression'
|
print 'computing the logistic regression'
|
||||||
LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',output_dir + 'linear_regression_results.out',R_exe,logistic_regression_script)
|
LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',linear_regression_results,R_exe,logistic_regression_script)
|
||||||
|
else:
|
||||||
|
print 'Logistic recalibration data files already generated', linear_regression_results
|
||||||
|
|
||||||
|
if output_file_needs_update(calibrated_bam):
|
||||||
# apply the logistic regression, writing the output data to calibrated_bam
|
# apply the logistic regression, writing the output data to calibrated_bam
|
||||||
print 'applying the correction to the reads'
|
print 'applying the correction to the reads'
|
||||||
apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam))
|
apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam))
|
||||||
returncode = os.system(apply_logistic_regression)
|
returncode = executeCommand(apply_logistic_regression)
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
exit('Unable to apply logistic regression',1)
|
exit('Unable to apply logistic regression',1)
|
||||||
|
else:
|
||||||
|
print 'Recalibrated BAM file already generated', calibrated_bam
|
||||||
|
|
||||||
# index the calibrated bam
|
# index the calibrated bam
|
||||||
|
if output_file_needs_update(calibrated_bam + '.bai'):
|
||||||
print 'indexing the calibrated bam'
|
print 'indexing the calibrated bam'
|
||||||
index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
|
index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
|
||||||
returncode = os.system(index_calibrated_bamfile)
|
returncode = executeCommand(index_calibrated_bamfile)
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
exit('Unable to index calibrated bamfile',1)
|
exit('Unable to index calibrated bamfile',1)
|
||||||
|
else:
|
||||||
|
print 'Recalibrated BAM index file already generated', calibrated_bam + '.bai'
|
||||||
|
|
||||||
print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam
|
print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam
|
||||||
|
|
||||||
def evaluate():
|
def evaluate():
|
||||||
'Evaluate recalibration results.'
|
'Evaluate recalibration results.'
|
||||||
print 'Evaluating recalibration results'
|
print 'Evaluating recalibration results'
|
||||||
|
|
||||||
|
recalibrated_regression_results = output_dir + 'recalibrated' + 'linear_regression_results.out'
|
||||||
|
if output_file_needs_update(recalibrated_regression_results):
|
||||||
# regenerate the covariates
|
# regenerate the covariates
|
||||||
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
|
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
|
||||||
print 'regenerating covariates'
|
print 'regenerating covariates'
|
||||||
returncode = os.system(regenerate_covariates)
|
returncode = executeCommand(regenerate_covariates)
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
exit('Unable to regenerate covariates',1)
|
exit('Unable to regenerate covariates',1)
|
||||||
|
|
||||||
|
|
@ -106,13 +138,15 @@ def evaluate():
|
||||||
print 'graphing final results'
|
print 'graphing final results'
|
||||||
for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'):
|
for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'):
|
||||||
graph_file(empirical_vs_reported_grapher,filename)
|
graph_file(empirical_vs_reported_grapher,filename)
|
||||||
|
else:
|
||||||
|
print 'Evaluated files already generated', recalibrated_regression_results
|
||||||
|
|
||||||
def usage():
|
def usage():
|
||||||
exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] <input bam file> <calibrated output bam file>',1)
|
exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] <input bam file> <calibrated output bam file>',1)
|
||||||
|
|
||||||
# Try to parse the given command-line arguments.
|
# Try to parse the given command-line arguments.
|
||||||
try:
|
try:
|
||||||
opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate'])
|
opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry'])
|
||||||
except getopt.GetoptError, err:
|
except getopt.GetoptError, err:
|
||||||
usage()
|
usage()
|
||||||
|
|
||||||
|
|
@ -123,11 +157,17 @@ if len(args) < 2:
|
||||||
# Determine whether to evaluate / recalibrate.
|
# Determine whether to evaluate / recalibrate.
|
||||||
recalibrate_requested = False
|
recalibrate_requested = False
|
||||||
evaluate_requested = False
|
evaluate_requested = False
|
||||||
|
reprocessFiles = False
|
||||||
|
dryRun = False
|
||||||
for opt,arg in opts:
|
for opt,arg in opts:
|
||||||
if opt == '--recalibrate':
|
if opt == '--recalibrate':
|
||||||
recalibrate_requested = True
|
recalibrate_requested = True
|
||||||
if opt == '--evaluate':
|
if opt == '--evaluate':
|
||||||
evaluate_requested = True
|
evaluate_requested = True
|
||||||
|
if opt == '--reprocess':
|
||||||
|
reprocessFiles = True
|
||||||
|
if opt == '--dry':
|
||||||
|
dryRun = True
|
||||||
|
|
||||||
# Default to 'recalibrate' unless the user specified only evaluate.
|
# Default to 'recalibrate' unless the user specified only evaluate.
|
||||||
do_recalibration = not (evaluate_requested and not recalibrate_requested)
|
do_recalibration = not (evaluate_requested and not recalibrate_requested)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue