diff --git a/python/Gelis2PopSNPs.py b/python/Gelis2PopSNPs.py index 61e479969..77ccfe1c2 100755 --- a/python/Gelis2PopSNPs.py +++ b/python/Gelis2PopSNPs.py @@ -15,13 +15,25 @@ def geli2dbsnpFile(geli): root, flowcellDotlane, ext = picard_utils.splitPath(geli) return os.path.join(root, flowcellDotlane) + '.dbsnp_matches' - +def SingleSampleGenotyperCmd(bam, geli, use2bp): + naid = bam.split(".")[0] + metrics = geli + '.metrics' + gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar' + hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff' + targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list' + cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp.rod.out', '--hapmap_chip', hapmapChip, '-calls', geli, '-met', metrics, '-geli -fb -l INFO']) + return cmd + def bams2geli(bams): def call1(bam): geli = os.path.splitext(bam)[0] + '.geli' jobid = 0 - if not os.path.exists(geli): - cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling()) + if OPTIONS.useSSG: + if not os.path.exists(geli + '.calls'): + cmd = SingleSampleGenotyperCmd(bam, geli + '.calls', OPTIONS.useSSG2b) + else: + if not os.path.exists(geli): + cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling()) jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry ) return geli, jobid calls = map(call1, bams) @@ -44,6 +56,12 @@ def main(): parser.add_option("", "--dry", dest="dry", action='store_true', default=False, help="If provided, nothing actually gets run, just a dry run") + parser.add_option("", "--ssg", dest="useSSG", + action='store_true', default=False, + help="If provided, we'll use the GATK SSG for genotyping") + parser.add_option("", "--ssg2b", dest="useSSG2b", + action='store_true', default=False, + help="If provided, we'll use 2bp enabled GATK SSG ") parser.add_option("-o", "--output", dest="output", type="string", default='/dev/stdout', help="x") diff --git a/python/RecalQual.py b/python/RecalQual.py index a8bbace27..ae5ffa78a 100755 --- a/python/RecalQual.py +++ b/python/RecalQual.py @@ -18,6 +18,7 @@ output_root = './' # Location of the resource files distributed with the recalibration tool. # If editing, please end this variable with a trailing slash. resources='resources/' +#resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/' import getopt,glob,os,sys import LogisticRegressionByReadGroup @@ -39,12 +40,29 @@ dbsnp = resources + 'dbsnp.1kg.rod.out' # Where are the application files required to run the recalibration? gatk = resources + 'gatk/GenomeAnalysisTK.jar' +#gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar' + logistic_regression_script = resources + 'logistic_regression.R' empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R' # Assemble the platform list into command-line arguments. platform_args = ' '.join(['-pl %s' % platform for platform in platforms]) +def output_file_needs_update(output_file, input_files = []): + if reprocessFiles: # we are always reprocessing files + return True + if not os.path.exists(output_file): # the file doesn't exist + return True + else: + return False + +def executeCommand(cmd): + if not dryRun: + return os.system(cmd) + else: + print ' => Would execute', cmd + return 0 + def exit(msg,errorcode): print msg sys.exit(errorcode) @@ -57,7 +75,7 @@ def graph_file(graph_script,graph_data): 'Graph the given data using the given script. Leave the data in the output directory.' check_input_file_available(graph_script,'%s R graphing script' % graph_script) check_input_file_available(graph_data,'%s graphing data' % graph_data) - result = os.system(' '.join((R_exe,graph_script,graph_data))) + result = executeCommand(' '.join((R_exe,graph_script,graph_data))) if result != 0: exit('Unable to graph data: %s' % graph_data,1) @@ -65,54 +83,70 @@ def recalibrate(): 'Recalibrate the given bam file' # generate the covariates print 'generating covariates' - generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args)) - returncode = os.system(generate_covariates) - if returncode != 0: - exit('Unable to generate covariates',1) - - # compute the logistic regression - print 'computing the logistic regression' - LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',output_dir + 'linear_regression_results.out',R_exe,logistic_regression_script) - - # apply the logistic regression, writing the output data to calibrated_bam - print 'applying the correction to the reads' - apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam)) - returncode = os.system(apply_logistic_regression) - if returncode != 0: - exit('Unable to apply logistic regression',1) + linear_regression_results = output_dir + 'linear_regression_results.out' + if output_file_needs_update(linear_regression_results): + generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args)) + returncode = executeCommand(generate_covariates) + if returncode != 0: + exit('Unable to generate covariates',1) + + # compute the logistic regression + if not dryRun: + print 'computing the logistic regression' + LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',linear_regression_results,R_exe,logistic_regression_script) + else: + print 'Logistic recalibration data files already generated', linear_regression_results + + if output_file_needs_update(calibrated_bam): + # apply the logistic regression, writing the output data to calibrated_bam + print 'applying the correction to the reads' + apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam)) + returncode = executeCommand(apply_logistic_regression) + if returncode != 0: + exit('Unable to apply logistic regression',1) + else: + print 'Recalibrated BAM file already generated', calibrated_bam # index the calibrated bam - print 'indexing the calibrated bam' - index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam)) - returncode = os.system(index_calibrated_bamfile) - if returncode != 0: - exit('Unable to index calibrated bamfile',1) + if output_file_needs_update(calibrated_bam + '.bai'): + print 'indexing the calibrated bam' + index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam)) + returncode = executeCommand(index_calibrated_bamfile) + if returncode != 0: + exit('Unable to index calibrated bamfile',1) + else: + print 'Recalibrated BAM index file already generated', calibrated_bam + '.bai' print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam def evaluate(): 'Evaluate recalibration results.' print 'Evaluating recalibration results' - # regenerate the covariates - regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args)) - print 'regenerating covariates' - returncode = os.system(regenerate_covariates) - if returncode != 0: - exit('Unable to regenerate covariates',1) - print 'graphing initial results' - for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'): - graph_file(empirical_vs_reported_grapher,filename) - print 'graphing final results' - for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'): - graph_file(empirical_vs_reported_grapher,filename) + recalibrated_regression_results = output_dir + 'recalibrated' + 'linear_regression_results.out' + if output_file_needs_update(recalibrated_regression_results): + # regenerate the covariates + regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args)) + print 'regenerating covariates' + returncode = executeCommand(regenerate_covariates) + if returncode != 0: + exit('Unable to regenerate covariates',1) + + print 'graphing initial results' + for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'): + graph_file(empirical_vs_reported_grapher,filename) + print 'graphing final results' + for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'): + graph_file(empirical_vs_reported_grapher,filename) + else: + print 'Evaluated files already generated', recalibrated_regression_results def usage(): exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] ',1) # Try to parse the given command-line arguments. try: - opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate']) + opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry']) except getopt.GetoptError, err: usage() @@ -123,12 +157,18 @@ if len(args) < 2: # Determine whether to evaluate / recalibrate. recalibrate_requested = False evaluate_requested = False +reprocessFiles = False +dryRun = False for opt,arg in opts: if opt == '--recalibrate': recalibrate_requested = True if opt == '--evaluate': evaluate_requested = True - + if opt == '--reprocess': + reprocessFiles = True + if opt == '--dry': + dryRun = True + # Default to 'recalibrate' unless the user specified only evaluate. do_recalibration = not (evaluate_requested and not recalibrate_requested) # Only evaluate if the user specifically requested evaluation.