#!/usr/bin/env python
# imports
import subprocess
import os
import sys
import re
import time
import math
import codecs

## script keys and triggers
remove_all_files = False # be very careful with this one
make_directories = True
copy_vcf_files = True
annotate_vcf_files = True
run_snp_selector = False
make_hard_threshold_vcf = False
snp_select_threshold_vcf = False
generate_hapmap_info = False
generate_threshold_hapmap_info = False
false_negatives_by_snp_selector = False
false_negatives_by_snp_selector_on_threshold = False
plot_info_field_metrics = False
plot_info_field_metrics_threshold = False
make_best_effort_vcf = False
snp_select_best_effort_vcf = False
plot_best_effort_metrics = False

## global stuff
DEBUG = True
override_pilot_bamfile = True
override_production_bamfile = True
matlab_dir = "/humgen/gsa-scr1/projects/FHS/scripts"
pipeline_samples_info_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/samples.tsv"
samples_pipeline_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/"
home_dir = "/humgen/gsa-scr1/projects/FHS/"
annotations_data_base = home_dir + "oneOffAnalyses/annotationEffectiveness/data/"
figures_base = home_dir + "oneOffAnalyses/annotationEffectiveness/figures/"
production_calls_dir = home_dir + "production/raw_production_calls_12_07/"
pilot_calls_dir = home_dir + "pilot/calls/raw_pilot_calls_12_07/"
hapmap_pool_info_dir = home_dir + "pilot/project_info/pilot_pool_information/"
pilot_standard_project_name = "framinghampilot"
pilot_clipped_project_name = "pilot_clipped"
pilot_no_gatk_project_name = "pilot_no_gatk"
FHS_project_name = "FHS"
FHS_round_2_project_name = "FHS_round_2"
all_projects = [pilot_standard_project_name, pilot_clipped_project_name, pilot_no_gatk_project_name, FHS_project_name, FHS_round_2_project_name]
pilot_projects = [pilot_standard_project_name, pilot_clipped_project_name, pilot_no_gatk_project_name]
production_projects = [FHS_project_name, FHS_round_2_project_name]
gatk_cmd_base = "java -jar /humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -L /humgen/gsa-scr1/projects/FHS/interval_lists/FHS_exons_only.interval_list"
dbsnp_129_path = "/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod"
vcf_header = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]

def subSystem(cmd):
    if ( DEBUG ):
        print(cmd)
    else:
        os.system(cmd)

def safeRemove(cmd):
    if ( not ( cmd.startswith("rm -rf "+annotations_data_base) or cmd.startswith("rm -rf "+production_calls_dir) or cmd.startswith("rm -rf "+pilot_calls_dir) or cmd.startswith("rm -rf "+figures_base) ) ):
        print("Unsafe removal attempt: "+cmd)
    else:
        subSystem(cmd)

def productionBamfilePicard(proj, samp):
    for line in open("/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/framingham.flowcell_lanes.tsv").readlines():
        spline = line.strip().split()
        if ( spline[1] == proj and spline[2] == samp ):
            return spline[5]+spline[3]+"."+spline[4]+".aligned.bam"
    print("no return for: "+proj+"_"+samp)

def annotationProjectPath(proj):
    return annotations_data_base+proj+"/"

def annotationPath(proj, pool):
    return annotationProjectPath(proj)+pool+"/"

def annotationProjectFile(proj, pool, appender):
    return annotationPath(proj,pool)+pool+appender

def figureProjectPath(proj):
    return figures_base+proj+"/"

def figurePath(proj, samp):
    return figureProjectPath(proj)+samp+"/"

def pipelineProjectPath(proj):
    return samples_pipeline_path+proj+"/"

def pipelinePath(proj, samp):
    return pipelineProjectPath(proj)+samp+"/"
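# For orientation, a worked example of how the path helpers compose; the pool
# name "pool1" is made up for illustration:
#   annotationProjectFile("FHS_round_2", "pool1", "_raw.vcf")
#     -> /humgen/gsa-scr1/projects/FHS/oneOffAnalyses/annotationEffectiveness/data/FHS_round_2/pool1/pool1_raw.vcf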
def pipelineBam(proj, samp):
    # pilot CEPH pools use hand-picked lane-level bams, returned as a ready-made
    # "-I first.bam -I second.bam" argument string for the GATK command line
    if ( proj == "framinghampilot" and override_pilot_bamfile ):
        if ( samp == "CEPH1" ):
            return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/1/Solexa-10930/302JDAAXX.1.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/5/Solexa-10933/302JDAAXX.5.aligned.bam"
        if ( samp == "CEPH2" ):
            return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/2/Solexa-10931/302JDAAXX.2.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/8/Solexa-10934/302JDAAXX.8.aligned.bam"
        if ( samp == "CEPH3" ):
            return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/3/Solexa-10932/302JDAAXX.3.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/6/Solexa-10936/302JDAAXX.6.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/7/Solexa-10935/302JDAAXX.7.aligned.bam"
    if ( proj in production_projects and override_production_bamfile ):
        return productionBamfilePicard(proj, samp)
    return pipelinePath(proj,samp)+proj+"."+samp+".bam"

def pipelineVCF(proj, samp):
    return pipelinePath(proj,samp)+samp+".vcf"

def homePath(proj, samp):
    if ( proj in pilot_projects ):
        return pilot_calls_dir+proj+"/"
    else:
        return production_calls_dir+proj+"/"

def homeProjectFile(proj, samp, appender):
    return homePath(proj,samp)+samp+appender

def homeVCF(proj, samp, extension):
    return homeProjectFile(proj,samp,extension)+".vcf"

def hapmapVCF(proj, samp):
    return "/humgen/gsa-scr1/projects/FHS/pilot/analysis/"+samp.lower()+"_hapmap_snp_only.vcf"

def poolSize(proj):
    if ( proj in pilot_projects ):
        return 40
    else:
        return 28

def poolBinding(proj, samp):
    if ( proj in production_projects ):
        raise Exception("Production projects do not have hapmap pool bindings")
    else:
        return hapmap_pool_info_dir+samp+".pool.path"

def poolSampleNames(proj, samp):
    if ( proj in production_projects ):
        raise Exception("Production projects do not have hapmap pool bindings")
    else:
        return hapmap_pool_info_dir+samp+".pool"

## working stuff
#working_projects = [pilot_standard_project_name]
working_projects = [FHS_round_2_project_name]
working_info_fields = ["syzy_SB","syzy_DP","syzy_NMMR","HRun"]
working_quality_scores = ["0","1","2","3","4","5"]
working_samples = set()
projects_to_samples = []

# create the project --> list of samples dictionary
for project in working_projects:
    sample_list = os.listdir(pipelineProjectPath(project))
    projects_to_samples.append([project, sample_list])
    for s in sample_list:
        working_samples.add(s)
projects_to_samples = dict(projects_to_samples)

# create the sample --> pool name dictionary
samples_to_pool_name = []
for line in open(pipeline_samples_info_path).readlines():
    spline = line.strip().split()
    if ( spline[0] in working_samples ):
        keyval = [spline[0], spline[1]]
        samples_to_pool_name.append(keyval)
samples_to_pool_name = dict(samples_to_pool_name)

# remove everything
if ( remove_all_files ):
    for project in working_projects:
        safeRemove("rm -rf "+homePath(project,""))
        safeRemove("rm -rf "+annotationProjectPath(project))
        safeRemove("rm -rf "+figureProjectPath(project))

# make directories if necessary
if ( make_directories ):
    for project in working_projects:
        subSystem("mkdir "+homePath(project,""))
        subSystem("mkdir "+annotationProjectPath(project))
        subSystem("mkdir "+figureProjectPath(project))
        for sample in projects_to_samples[project]:
            subSystem("mkdir "+annotationPath(project, samples_to_pool_name[sample]))
            subSystem("mkdir "+figurePath(project, samples_to_pool_name[sample]))
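# An illustrative alternative, not used by the pipeline flow above: the
# shelled-out "mkdir" calls just print a complaint when a directory already
# exists. The same layout can be built in-process; ensureDir is a hypothetical
# helper name, sketched here for reference only.
def ensureDir(path):
    # create the directory, including any missing parents, only when absent
    if ( not os.path.isdir(path) ):
        os.makedirs(path)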
# copy the vcf files from the pipeline into the /FHS/ directory
if ( copy_vcf_files ):
    for project in working_projects:
        sample_list = projects_to_samples[project]
        for sample in sample_list:
            pool = samples_to_pool_name[sample]
            sample_home_vcf = homeVCF(project, pool, "_raw")
            pipeline_vcf = pipelineVCF(project, sample)
            # rewrite the sample name to its pool name while copying
            subSystem("cat "+pipeline_vcf+" | sed 's/"+sample+"/"+pool+"/g' > "+sample_home_vcf)

# annotate the vcf files using VariantAnnotator
if ( annotate_vcf_files ):
    prior_step_extension = "_raw"
    this_step_extension = "_annotated"
    gatk_annotate_base = gatk_cmd_base+" -T VariantAnnotator -exp -D "+dbsnp_129_path
    for project in working_projects:
        sample_list = projects_to_samples[project]
        for sample in sample_list:
            pool = samples_to_pool_name[sample]
            inputVCF = homeVCF(project, pool, prior_step_extension)
            outputVCF = homeVCF(project, pool, this_step_extension)
            bamfile = pipelineBam(project, sample)
            gatk_args = " -I "+bamfile+" -B variant,VCF,"+inputVCF+" -vcf "+outputVCF
            subSystem("bsub -q gsa "+gatk_annotate_base+gatk_args)

# definition for next section (and another one further down)
def runSnpSelector(project_list, prior_step_annotation, this_step_annotation, info_field_list):
    snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 10 --plottable"
    for project in project_list:
        sample_list = projects_to_samples[project]
        for sample in sample_list:
            pool = samples_to_pool_name[sample]
            # pilot pools have a hapmap truth vcf; production pools fall back to an expected Ti/Tv ratio
            if ( project in pilot_projects ):
                truth_arg = " -t "+hapmapVCF(project, sample)
            else:
                truth_arg = " -titv=3.6"
            for info_field in info_field_list:
                input_vcf = homeVCF(project, pool, prior_step_annotation)
                output_vcf = homeVCF(project, pool, this_step_annotation+"_"+info_field)
                log_output = annotationProjectFile(project, pool, this_step_annotation+"_"+info_field+".log")
                snp_selector_args = truth_arg+" -o "+output_vcf+" -l "+log_output+" -f "+info_field+" "+input_vcf
                subSystem(snp_selector_base+snp_selector_args)

# run snp selector to analyze info fields
if ( run_snp_selector ):
    prior_step_extension = "_annotated"
    this_step_extension = "_snpsel"
    runSnpSelector(working_projects, prior_step_extension, this_step_extension, working_info_fields)

# definition for the next stage
def parseInfoField(info):
    # turn "KEY1=V1;KEY2=V2;..." into { "KEY1" : V1, ... }; assumes every
    # entry is a KEY=VALUE pair with a numeric value
    info_list = info.split(";")
    info_dict = []
    for info_field in info_list:
        keyval = info_field.split("=")
        key = keyval[0]
        val = float(keyval[1])
        info_dict.append([key, val])
    return dict(info_dict)

def vcfLinePassesThreshold(line, fields, vals, greaterthan):
    # header lines always pass; data lines must satisfy every (field, value, direction) triple
    if ( line.startswith('#') ):
        return True
    else:
        # parse the line
        general_fields = line.split()
        info_dict = parseInfoField(general_fields[vcf_header.index("INFO")])
        header_set = set(vcf_header)
        # compare with thresholds; the field may live in a standard vcf column or in the INFO dictionary
        for j in range(len(fields)):
            if ( fields[j] in header_set ):
                value = float(general_fields[vcf_header.index(fields[j])])
            elif ( fields[j] in info_dict ):
                value = float(info_dict[fields[j]])
            else:
                print("Field not found "+fields[j])
                continue
            # greaterthan[j] == True keeps values strictly above the threshold,
            # greaterthan[j] == False keeps values at or below it
            if ( (value > vals[j]) != greaterthan[j] ):
                return False
        return True

def makeThresholdVCF(prev_vcf, this_vcf, fields, vals, greater):
    if ( DEBUG ):
        passing_lines = 0
    else:
        out_vcf = open(this_vcf,'w')
        print("open for writing: "+this_vcf)
        wrotelines = 0
    for line in open(prev_vcf).readlines():
        if ( vcfLinePassesThreshold(line, fields, vals, greater) ):
            if ( DEBUG ):
                passing_lines = passing_lines+1
            else:
                out_vcf.write(line)
                wrotelines = wrotelines+1
    if ( DEBUG ):
        print(this_vcf+" would have kept "+str(passing_lines)+" variants")
    else:
        out_vcf.close()
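# A worked toy example of the threshold test above, with invented values: for a
# record with QUAL=20 and INFO "syzy_NMMR=0.2;syzy_DP=300",
#   vcfLinePassesThreshold(line, ["QUAL","syzy_NMMR"], [15, 0.4], [True, False])
# keeps the line: QUAL 20 > 15 satisfies greaterthan=True, and syzy_NMMR
# 0.2 <= 0.4 satisfies greaterthan=False. The comparison is strictly ">", so a
# value sitting exactly on a greaterthan=True threshold is filtered out.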
# make hard-threshold vcf files ( to see if a more stringent set with less noise helps snp selector determine an ordering )
if ( make_hard_threshold_vcf ):
    prior_step_extension = "_annotated"
    this_step_extension = "_hard_threshold"
    threshold_fields = ["QUAL","syzy_NMMR","syzy_DP"]
    threshold_values = [15, 0.4, 280]
    threshold_greaterthan = [True, False, True]
    for project in working_projects:
        sample_list = projects_to_samples[project]
        for sample in sample_list:
            pool = samples_to_pool_name[sample]
            prev_vcf = homeVCF(project, pool, prior_step_extension)
            this_vcf = homeVCF(project, pool, this_step_extension)
            makeThresholdVCF(prev_vcf, this_vcf, threshold_fields, threshold_values, threshold_greaterthan)

# run snp selector on hard-threshold vcf files
if ( snp_select_threshold_vcf ):
    prior_step_extension = "_hard_threshold"
    this_step_extension = "_hard_snpsel"
    runSnpSelector(working_projects, prior_step_extension, this_step_extension, working_info_fields)

# definitions for next few steps
def hapmapInfoFile(proj, pool, info, qscore):
    return annotationProjectFile(proj, pool, "_"+info+"_filter_at_q_"+qscore+"_hapmap_info.txt")

def generateHapmapInfo(extension, qscorelist):
    gatk_hapmap_info = gatk_cmd_base+" -T HapmapPoolAllelicInfo"
    for project in working_projects:
        sample_list = projects_to_samples[project]
        for sample in sample_list:
            pool = samples_to_pool_name[sample]
            for infofield in working_info_fields:
                inputVCF = homeVCF(project, pool, extension+"_"+infofield)
                inputBam = pipelineBam(project, sample)
                ps = str(poolSize(project))
                for qscore in qscorelist:
                    outputFile = hapmapInfoFile(project, pool, infofield, qscore)
                    gatk_args = " -ps "+ps+" -I "+inputBam+" -of "+outputFile+" -B "+pool+",VCF,"+inputVCF+" -B "+poolBinding(project, sample)+" -samples "+poolSampleNames(project, sample)+" -q "+qscore
                    subSystem("bsub -q gsa "+gatk_hapmap_info+gatk_args)

# generate hapmap (false positive/false negative) info files on filtered vcfs
if ( generate_hapmap_info ):
    prior_step_extension = "_snpsel"
    generateHapmapInfo(prior_step_extension, working_quality_scores)

if ( generate_threshold_hapmap_info ):
    prior_step_extension = "_hard_snpsel"
    generateHapmapInfo(prior_step_extension, working_quality_scores)

# definitions for next section
def falsenegSnpSelect(invcf, false_neg_vcf, qscore):
    tmp_vcf = "tmp.vcf"
    tmp_recal_vcf = "tmp2.vcf"
    tmp_log = "tmp.log"
    snpsel_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 30 --plottable -l "+tmp_log+" -o "+tmp_recal_vcf+" --FNOutputVCF="+false_neg_vcf+" "+tmp_vcf
    # keep header lines and records whose QUAL (column 6) exceeds the quality cutoff
    cmd = "cat "+invcf+" | awk '{if ( substr($1,1,1) == \"#\" || $6 > "+qscore+" ) print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' | sed 's/ /\t/g' > "+tmp_vcf
    subSystem(cmd)
    subSystem(snpsel_base)
    cmd = "rm "+tmp_vcf
    subSystem(cmd)
    cmd = "rm "+tmp_recal_vcf
    subSystem(cmd)
    cmd = "rm "+tmp_log
    subSystem(cmd)

def falseNegsBySnpSelector(extension):
    for project in working_projects:
        sample_list = projects_to_samples[project]
        for sample in sample_list:
            pool = samples_to_pool_name[sample]
            for infofield in working_info_fields:
                input_vcf = homeVCF(project, pool, extension+"_"+infofield)
                for qscore in working_quality_scores:
                    false_neg_vcf = annotationProjectFile(project, pool, "_"+infofield+"_q_"+qscore+"_false_negs.vcf")
                    falsenegSnpSelect(input_vcf, false_neg_vcf, qscore)

# generate false negative sites in a vcf file via snp selector
if ( false_negatives_by_snp_selector ):
    prior_step_extension = "_snpsel"
    falseNegsBySnpSelector(prior_step_extension)

# generate false negative sites in a vcf file on the thresholded vcfs
if ( false_negatives_by_snp_selector_on_threshold ):
    prior_step_extension = "_hard_snpsel"
    falseNegsBySnpSelector(prior_step_extension)
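# An aside, not wired into the pipeline: falsenegSnpSelect reuses the fixed
# names tmp.vcf/tmp2.vcf/tmp.log, which collide if two copies of this script
# run in the same directory. A sketch of a collision-free variant using the
# standard tempfile module (uniqueTmpPath is a hypothetical helper):
import tempfile
def uniqueTmpPath(suffix):
    # mkstemp creates the file and returns (open handle, absolute path);
    # close the handle and hand the unique path to the shell commands instead
    handle, path = tempfile.mkstemp(suffix=suffix)
    os.close(handle)
    return path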
# definition for matlab section
def makeMatlabMetricPlot(extension):
    for project in working_projects:
        sample_list = projects_to_samples[project]
        for sample in sample_list:
            pool = samples_to_pool_name[sample]
            for infofield in working_info_fields:
                log_output = annotationProjectFile(project, pool, extension+"_"+infofield+".log")
                figureName = figurePath(project, pool)+pool+extension
                cmd = "matlab -r \"cd "+matlab_dir+" , snpSelectorPlots('"+log_output+"','"+figureName+"') , exit\""
                subSystem(cmd)

# make matlab plots of the metrics (generated from the log files)
if ( plot_info_field_metrics ):
    prior_step_extension = "_snpsel"
    makeMatlabMetricPlot(prior_step_extension)

# make matlab plots of the metrics from the hard-thresholded log files
if ( plot_info_field_metrics_threshold ):
    prior_step_extension = "_hard_snpsel"
    makeMatlabMetricPlot(prior_step_extension)

# definitions for best effort section
def bestEffortVCF(proj):
    bevcf_path = homeVCF(proj, proj, "_best_effort")
    #bevcf = open(bevcf_path,'w')
    #bevcf.write("##source=best_effort_vcf")
    #bevcf.write("##format=VCRv3.2")
    #bevcf.write("#"+"\t".join(vcf_header)+"\t"+proj+"_combined_calls")
    #bevcf.close()
    return bevcf_path

def keyFromFields(fields):
    # variant records are keyed by their CHROM:POS string
    return ":".join([fields[0], fields[1]])

def inVCFDict(vdict, fields):
    return ( keyFromFields(fields) in vdict )

def addToVCFDict(vdict, fields):
    vdict[keyFromFields(fields)] = fields
    return vdict

def updateInfoField(key, info1, info2):
    if ( key == "HRun" ): # this one doesn't change
        return [key, str(info1)]
    else:
        # all other shared info fields are averaged across pools
        return [key, str( (float(info1)+float(info2))/2 )]

def incorporateNewInfo(vdict, infodict, fields):
    addNPools = True
    poskey = keyFromFields(fields)
    vline = vdict[poskey]
    vinfodict = parseInfoField(vline[vcf_header.index("INFO")])
    newInfo = []
    all_keys = set(vinfodict.keys()).union(set(infodict.keys()))
    for infokey in all_keys:
        if ( infokey in vinfodict and infokey in infodict ):
            newInfo.append("=".join(updateInfoField(infokey, vinfodict[infokey], infodict[infokey])))
        elif ( infokey in infodict ):
            newInfo.append("=".join([infokey, str(infodict[infokey])]))
        elif ( infokey == "nPools" ):
            # note: nPools counts merges into an existing record, so it is one
            # less than the number of pools that called the site
            newPools = str( int(vinfodict[infokey])+1 )
            newInfo.append("=".join([infokey, newPools]))
            addNPools = False
        else:
            continue
    if ( addNPools ):
        newInfo.append("nPools=1")
    vline[vcf_header.index("INFO")] = ";".join(newInfo)
    vdict[poskey] = vline
    return vdict

def incorporateQuality(vdict, fields):
    # pool qualities are summed at shared sites
    poskey = keyFromFields(fields)
    vfields = vdict[poskey]
    qual_index = vcf_header.index("QUAL")
    vfields[qual_index] = str( float(vfields[qual_index])+float(fields[qual_index]) )
    vdict[poskey] = vfields
    return vdict

def bestEffortMerge(call_dict, vcf_to_merge_path):
    for line in open(vcf_to_merge_path).readlines():
        if ( line.startswith("#") ):
            continue
        else:
            vcf_fields = line.strip().split()
            if ( not inVCFDict(call_dict, vcf_fields) ):
                call_dict = addToVCFDict(call_dict, vcf_fields)
            else:
                vcf_info_dict = parseInfoField(vcf_fields[vcf_header.index("INFO")])
                call_dict = incorporateNewInfo(call_dict, vcf_info_dict, vcf_fields)
                call_dict = incorporateQuality(call_dict, vcf_fields)
    return call_dict

def dictToFile(vdict, filepath, source, samplename):
    out = open(filepath,'w')
    out.write("##source="+source+"\n")
    out.write("##version=VCRv3.2\n")
    out.write("#"+"\t".join(vcf_header)+"\t"+samplename+"\n")
    if ( vdict ):
        for key in vdict.keys():
            out.write("\t".join(vdict[key])+"\n")
    else:
        print("dict had no keys for "+filepath)
    out.close()
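# A worked toy example of the merge semantics above, with invented numbers:
# pool A contributes site 1:12345 with QUAL=10 and INFO "syzy_DP=100;HRun=2";
# pool B then contributes 1:12345 with QUAL=8 and INFO "syzy_DP=200;HRun=2".
# After bestEffortMerge the stored record carries QUAL=18.0 (qualities are
# summed), syzy_DP=150.0 (averaged), HRun unchanged, and nPools=1, since
# nPools counts merges into an existing record rather than total pools.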
for "+filepath) # make the best effort VCF if ( make_best_effort_vcf ): best_thresh_fields = ["QUAL"] best_thresh_val = [7] best_thresh_gt = [True] prev_extension = "_annotated" for project in working_projects: best_effort_vcf = bestEffortVCF(project) best_effort_dict = dict() sample_list = projects_to_samples[project] for sample in sample_list: pool = samples_to_pool_name[sample] in_vcf = homeVCF(project,pool,prev_extension) outvcf = homeVCF(project,pool,"_temporary") makeThresholdVCF(in_vcf,outvcf,best_thresh_fields,best_thresh_val,best_thresh_gt) print("dict about to update, size is currently: "+str(len(best_effort_dict))) best_effort_dict = bestEffortMerge(best_effort_dict,outvcf) print("updated size: "+str(len(best_effort_dict))) subSystem("rm "+outvcf) dictToFile(best_effort_dict,best_effort_vcf,"make_best_effort_vcf",project+"_best_effort") if ( snp_select_best_effort_vcf ): snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 20 --plottable -f "+",".join(working_info_fields) for project in working_project: best_effort_vcf = bestEffortVCF(project) snp_recal_vcf = homeVCF(project,project,"_best_effort_selected") log = annotationProjectFile(project,project,"_best_effort_selected.log")