diff --git a/python/collectCalls.py b/python/collectCalls.py new file mode 100755 index 000000000..b58c47328 --- /dev/null +++ b/python/collectCalls.py @@ -0,0 +1,533 @@ +#!/usr/bin/env python +# imports +import subprocess +import os +import sys +import re +import time +import math +import codecs +## script keys and triggers +remove_all_files = False # be very careful with this one +make_directories = True +copy_vcf_files = True +annotate_vcf_files = True +run_snp_selector = False +make_hard_threshold_vcf = False +snp_select_threshold_vcf = False +generate_hapmap_info = False +generate_threshold_hapmap_info = False +false_negatives_by_snp_selector = False +false_negatives_by_snp_selector_on_threshold = False +plot_info_field_metrics = False +plot_info_field_metrics_threshold = False +make_best_effort_vcf = False +snp_select_best_effort_vcf = False +plot_best_effort_metrics = False + + +## global stuff +DEBUG = True +override_pilot_bamfile = True +override_production_bamfile = True +matlab_dir = "/humgen/gsa-scr1/projects/FHS/scripts" +pipeline_samples_info_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/samples.tsv" +samples_pipeline_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/" +home_dir = "/humgen/gsa-scr1/projects/FHS/" +annotations_data_base = home_dir + "oneOffAnalyses/annotationEffectiveness/data/" +figures_base = home_dir + "oneOffAnalyses/annotationEffectiveness/figures/" +production_calls_dir = home_dir + "production/raw_production_calls_12_07/" +pilot_calls_dir = home_dir + "pilot/calls/raw_pilot_calls_12_07/" +hapmap_pool_info_dir = home_dir+"pilot/project_info/pilot_pool_information/" +pilot_standard_project_name = "framinghampilot" +pilot_clipped_project_name = "pilot_clipped" +pilot_no_gatk_project_name = "pilot_no_gatk" +FHS_project_name = "FHS" +FHS_round_2_project_name = "FHS_round_2" +all_projects = [pilot_standard_project_name,pilot_clipped_project_name,pilot_no_gatk_project_name,FHS_project_name,FHS_round_2_project_name] +pilot_projects = [pilot_standard_project_name,pilot_clipped_project_name,pilot_no_gatk_project_name] +production_projects = [FHS_project_name,FHS_round_2_project_name] + +gatk_cmd_base = "java -jar /humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -L /humgen/gsa-scr1/projects/FHS/interval_lists/FHS_exons_only.interval_list" +dbsnp_129_path = "/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod" + +vcf_header = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] + +def subSystem(cmd): + if ( DEBUG ): + print(cmd) + else: + os.system(cmd) + +def safeRemove(cmd): + if ( not ( cmd.startswith("rm -rf "+annotations_data_base) or cmd.startswith("rm -rf "+production_calls_dir) or cmd.startswith("rm -rf "+pilot_calls_dir) or cmd.startswith("rm -rf "+figures_base) ) ): + print("Unsafe removal attempt: "+cmd) + else: + subSystem(cmd) + +def productionBamfilePicard(proj, samp): + for line in open("/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/framingham.flowcell_lanes.tsv").readlines(): + spline = line.strip().split() + if ( spline[1] == proj and spline[2] == samp ): + return spline[5]+spline[3]+"."+spline[4]+".aligned.bam" + print("no return for: "+proj+"_"+samp) + +def annotationProjectPath(proj): + return annotations_data_base+proj+"/" + +def annotationPath(proj,pool): + return annotationProjectPath(proj)+pool+"/" + +def annotationProjectFile(proj, pool, appender): + return annotationPath(proj,pool)+pool+appender + +def figureProjectPath(proj): + return figures_base+proj+"/" + +def figurePath(proj, samp): + return figureProjectPath(proj)+samp+"/" + +def pipelineProjectPath(proj): + return samples_pipeline_path+proj+"/" + +def pipelinePath(proj, samp): + return pipelineProjectPath(proj)+samp+"/" + +def pipelineBam(proj, samp): + if ( proj == "framinghampilot" and override_pilot_bamfile ): + if ( samp == "CEPH1" ): + return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/1/Solexa-10930/302JDAAXX.1.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/5/Solexa-10933/302JDAAXX.5.aligned.bam" + if ( samp == "CEPH2" ): + return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/2/Solexa-10931/302JDAAXX.2.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/8/Solexa-10934/302JDAAXX.8.aligned.bam" + if ( samp == "CEPH3" ): + return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/3/Solexa-10932/302JDAAXX.3.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/6/Solexa-10936/302JDAAXX.6.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/7/Solexa-10935/302JDAAXX.7.aligned.bam" + + if ( proj in production_projects and override_production_bamfile ): + return productionBamfilePicard(proj,samp) + + return pipelinePath(proj,samp)+proj+"."+samp+".bam" + +def pipelineVCF(proj,samp): + return pipelinePath(proj,samp)+samp+".vcf" + +def homePath(proj, samp): + if ( proj in pilot_projects ): + return pilot_calls_dir+proj+"/" + else: + return production_calls_dir+proj+"/" + +def homeProjectFile(proj, samp, appender): + return homePath(proj,samp)+samp+appender + +def homeVCF(proj, samp, extension): + return homeProjectFile(proj,samp,extension)+".vcf" + +def hapmapVCF(proj, samp): + return "/humgen/gsa-scr1/projects/FHS/pilot/analysis/"+samp.lower()+"_hapmap_snp_only.vcf" + +def poolSize(proj): + if ( proj in pilot_projects ): + return 40 + else: + return 28 + +def poolBinding(proj, samp): + if ( proj in production_projects ): + raise Exception, "Production projects do not have hapmap pool bindings" + else: + return hapmap_pool_info_dir+samp+".pool.path" + +def poolSampleNames(proj,samp): + if ( proj in production_projects ): + raise Exception, "Production projects do not have hapmap pool bindings" + else: + return hapmap_pool_info_dir+samp+".pool" + + +## working stuff + +#working_projects = [pilot_standard_project_name] +working_projects = [FHS_round_2_project_name] +working_info_fields = ["syzy_SB","syzy_DP","syzy_NMMR","HRun"] +working_quality_scores = ["0","1","2","3","4","5"] +working_samples = set() +projects_to_samples = [] + +# create the project --> list of samples dictionary + +for project in working_projects: + sample_list = os.listdir(pipelineProjectPath(project)) + projects_to_samples.append([project, sample_list]) + for s in sample_list: + working_samples.add(s) + +projects_to_samples = dict(projects_to_samples) + +# create the sample --> pool name dictionary + +samples_to_pool_name = [] + +for line in open(pipeline_samples_info_path).readlines(): + spline = line.strip().split() + if(spline[0] in working_samples): + keyval = [spline[0], spline[1]] + samples_to_pool_name.append(keyval) + +samples_to_pool_name = dict(samples_to_pool_name) + +# remove everything +if ( remove_all_files ): + for project in working_projects: + safeRemove("rm -rf "+homePath(project,"") ) + safeRemove("rm -rf "+annotationProjectPath(project)) + safeRemove("rm -rf "+figureProjectPath(project)) + +# make directories if necessary +if ( make_directories ): + for project in working_projects: + subSystem("mkdir "+homePath(project,"")) + subSystem("mkdir "+annotationProjectPath(project)) + subSystem("mkdir "+figureProjectPath(project)) + for sample in projects_to_samples[project]: + subSystem("mkdir "+annotationPath(project, samples_to_pool_name[sample])) + subSystem("mkdir "+figurePath(project,samples_to_pool_name[sample])) + +# copy the vcf files from the pipeline into the /FHS/ directory +if ( copy_vcf_files ): + for project in working_projects: + sample_list = projects_to_samples[project] + for sample in sample_list: + pool = samples_to_pool_name[sample] + sample_home_vcf = homeVCF(project,samples_to_pool_name[sample],"_raw") + pipeline_vcf = pipelineVCF(project,sample) + subSystem("cat "+pipeline_vcf+" | sed 's/"+sample+"/"+pool+"/g' > "+sample_home_vcf) + +# annotate the vcf files using VariantAnnotator +if ( annotate_vcf_files ): + prior_step_extension = "_raw" + this_step_extension = "_annotated" + gatk_annotate_base = gatk_cmd_base + " -T VariantAnnotator -exp -D "+dbsnp_129_path + for project in working_projects: + sample_list = projects_to_samples[project] + for sample in sample_list: + pool = samples_to_pool_name[sample] + inputVCF = homeVCF(project, pool, prior_step_extension) + outputVCF = homeVCF(project, pool, this_step_extension) + bamfile = pipelineBam(project, sample) + gatk_args = " -I "+bamfile+" -B variant,VCF,"+inputVCF+" -vcf "+outputVCF + subSystem("bsub -q gsa "+gatk_annotate_base + gatk_args) + +# definition for next section (and another one further down) + +def runSnpSelector(project_list,prior_step_annotation,this_step_annotation,info_field_list): + snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 10 --plottable" + for project in project_list: + sample_list = projects_to_samples[project] + for sample in sample_list: + pool = samples_to_pool_name[sample] + if ( project in pilot_projects ): + truth_arg = " -t "+hapmapVCF(project,sample) + else: + truth_arg = " -titv=3.6" + for info_field in info_field_list: + input_vcf = homeVCF(project,pool,prior_step_extension) + output_vcf = homeVCF(project,pool,this_step_annotation+"_"+info_field) + log_output = annotationProjectFile(project,pool,this_step_annotation+"_"+info_field+".log") + snp_selector_args = truth_arg+" -o "+output_vcf + " -l "+log_output + " -f "+info_field+" "+input_vcf + subSystem(snp_selector_base+snp_selector_args) + +# run snp selector to analyze info fields +if ( run_snp_selector ): + prior_step_extension = "_annotated" + this_step_extension = "_snpsel" + runSnpSelector(working_projects,prior_step_extension, this_step_extension, working_info_fields) + +# definition for the next stage + +def parseInfoField(info): + info_list = info.split(";") + info_dict = [] + for info_field in info_list: + keyval = info_field.split("=") + key = keyval[0] + val = float(keyval[1]) + info_dict.append([key, val]) + return dict(info_dict) + + +def vcfLinePassesThreshold(line, fields, vals,greaterthan): + if( line.startswith('#') ): + return True + else: + # parse the line + general_fields = line.split() + info_dict = parseInfoField(general_fields[vcf_header.index("INFO")]) + header_set = set(vcf_header) + # compare with thresholds + for j in range(len(fields)): + if ( fields[j] in header_set ): + if ( float(general_fields[vcf_header.index(fields[j])]) > vals[j] ): + if ( not greaterthan[j] ): + return False + else: + continue + else: + if ( greaterthan[j] ): + return False + else: + continue + elif ( fields[j] in info_dict ): + if ( float(info_dict[fields[j]]) > vals[j] ): + if ( not greaterthan[j] ): + return False + else: + continue + else: + if ( greaterthan[j] ): + return False + else: + continue + else: + print("Field not found "+fields[j]) + return True + +def makeThresholdVCF(prev_vcf,this_vcf,fields,vals,greater): + if ( DEBUG ): + filtered_lines = 0 + else: + out_vcf = open(this_vcf,'w') + print("open for writing: "+this_vcf) + wrotelines = 0 + for line in open(prev_vcf).readlines(): + if ( vcfLinePassesThreshold(line,fields,vals,greater) ): + if ( DEBUG ): + filtered_lines = filtered_lines + 1 + else: + out_vcf.write(line) + wrotelines=wrotelines+1 + if ( DEBUG ): + print(this_vcf+" would have filtered "+str(filtered_lines)+" variants") + else: + out_vcf.close() + +# make hard-threshold vcf files ( to see if a more strenuous set with less noise helps snp selector determine an ordering ) +if ( make_hard_threshold_vcf ): + prior_step_extension = "_annotated" + this_step_extension = "_hard_threshold" + threshold_fields = ["QUAL", "syzy_NMMR","syzy_DP"] + threshold_values = [15, 0.4,280] + threshold_greaterthan = [True, False,True] + for project in working_projects: + sample_list = projects_to_samples[project] + for sample in sample_list: + pool = samples_to_pool_name[sample] + prev_vcf = homeVCF(project,pool,prior_step_extension) + this_vcf = homeVCF(project,pool,this_step_extension) + makeThresholdVCF(prev_vcf,this_vcf,threshold_fields,threshold_values,greaterthan) + +# run snp selector on hard-threshold vcf files +if ( snp_select_threshold_vcf ): + prior_step_extension = "_hard_threshold" + this_step_extension = "_hard_snpsel" + runSnpSelector(working_projects, prior_step_extension, this_step_extension, working_info_fields) + +# definitions for next few steps + +def hapmapInfoFile(proj,pool,info,qscore): + return annotationProjectFile(project,pool,"_"+info+"_filter_at_q_"+qscore+"_hapmap_info.txt") + +def generateHapmapInfo(extension, qscorelist): + gatk_hapmap_info = gatk_cmd_base+" -T HapmapPoolAllelicInfo" + for project in working_projects: + sample_list = projects_to_samples[project] + for sample in sample_list: + pool = samples_to_pool_name[sample] + for infofield in working_info_fields: + inputVCF = homeVCF(project,pool,prior_step_extension+"_"+infofield) + inputBam = pipelineBam(project, sample) + ps = str(poolSize(project)) + for qscore in qscorelist: + outputFile = hapmapInfoFile(project, pool, infofield, qscore) + gatk_args = " -ps "+ps+" -I "+inputBam+" -of "+outputFile+" -B "+pool+",VCF,"+inputVCF+" -B "+poolBindings(project,sample)+" -samples "+poolSampleNames(project,sample)+" -q "+qscore + subSystem("bsub -q gsa "+gatk_hapmap_info+gatk_args) + +# generate hapmap (false postive/false negative) info files on filtered vcfs +if ( generate_hapmap_info ): + prior_step_extension = "_snpsel" + generateHapmapInfo(prior_step_extension, working_quality_scores) + +if ( generate_threshold_hapmap_info ): + prior_step_extension = "_hard_snpsel" + generateHapmapInfo(prior_step_extension, working_quality_scores) + +# definitions for next section +def falsenegSnpSelect(invcf, false_neg_vcf, qscore): + tmp_vcf = "tmp.vcf" + tmp_recal_vcf = "tmp2.vcf" + tmp_log = "tmp.log" + snpsel_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 30 --plottable -l "+tmp_log+" -o "+tmp_recal_vcf+" --FNOutputVCF="+false_neg_vcf+" "+tmp_vcf + cmd = "cat "+invcf+" | awk '{if ( substr($1,0,1) == '#' || $5 > "+qscore+" ) print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' | sed 's/ /\t/g' > "+tmp.vcf + subSystem(cmd) + subSystem(snpsel_base) + cmd = "rm "+tmp_vcf + subSystem(cmd) + cmd = "rm "+tmp_recal_vcf + subSystem(cmd) + cmd = "rm "+tmp_log + subSystem(cmd) + +def falseNegsBySnpSelector(extension): + for project in working_projects: + sample_list = projects_to_samples[project] + for sample in sample_list: + pool = samples_to_pool_name[sample] + for infofield in working_info_fields: + input_vcf = homeVCF(project,pool,extension+"_"+infofield) + for qscore in working_quality_scores: + false_neg_vcf = annotationProjectFile(project, pool, "_"+infofield+"_q_"+qscore+"_false_negs.vcf") + falsenegSnpSelect(input_vcf,false_neg_vcf,qscore) + +# generate false negative sites in a vcf file via snp selector +if ( false_negatives_by_snp_selector): + prior_step_extension = "_snpsel" + falseNegsBySnpSelector(prior_step_extension) + +# generate false negative sites ina vcf file on the thresholded vcfs +if ( false_negatives_by_snp_selector_on_threshold ): + prior_step_extension = "_hard_snpsel" + falseNegsBySnpSelector(prior_step_extension) + +# definition for matlab section +def makeMatlabMetricPlot(extension): + for project in working_projects: + sample_list = projects_to_samples[project] + for sample in sample_list: + pool = samples_to_pool_name[sample] + for infofield in working_info_fields: + log_output = annotationProjectFile(project,pool,extension+"_"+infofield+".log") + figureName = figurePath(project,pool)+pool+extension + cmd = "matlab -r \"cd "+matlab_dir+" , snpSelectorPlots('"+log_output+"','"+figureName+"') , exit\"" + subSystem(cmd) + +# make matlab plots of the metrics (generated from the log files) +if ( plot_info_field_metrics ): + prior_step_extension = '_snpsel' + makeMatlabMetricPlot(prior_step_extension) + +# make matlab plots of the metrics from the hard-thresholded log files +if ( plot_info_field_metrics_threshold ): + prior_step_extension = "_hard_snpsel" + makeMatlabMetricPlot(prior_step_extension) + +# definitions for best effort section + +def bestEffortVCF(proj): + bevcf_path = homeVCF(proj,proj,"_best_effort") + #bevcf = open(bevcf_path,'w') + #bevcf.write("##source=best_effort_vcf") + #bevcf.write("##format=VCRv3.2") + #bevcf.write("#"+"\t".join(vcf_header)+"\t"+proj+"_combined_calls") + #bevcf.close() + return bevcf_path + +def keyFromFields(fields): + return ":".join([fields[0],fields[1]]) + +def inVCFDict(vdict, fields): + return ( keyFromFields(fields) in vdict ) + +def addToVCFDict(vdict, fields): + vdict[keyFromFields(fields)] = fields + return vdict + +def updateInfoField(key, info1, info2): + if ( key == "HRun" ): # this one doesn't change + return [key, str(info1)] + else: + return [key, str( (float(info1) + float(info2))/2 )] + +def incorporateNewInfo(vdict, infodict, fields): + addNPools = True + poskey = keyFromFields(fields) + vline = vdict[poskey] + vinfodict = parseInfoField(vline[vcf_header.index("INFO")]) + newInfo = [] + all_keys = set(vinfodict.keys()).union(set(infodict.keys())) + for infokey in all_keys: + if ( infokey in vinfodict and infokey in infodict ): + newInfo.append("=".join(updateInfoField(infokey, vinfodict[infokey], infodict[infokey]) ) ) + elif ( infokey in infodict ): + newInfo.append("=".join([infokey, str(infodict[infokey])]) ) + elif ( infokey == "nPools" ): + newPools = str( int(vinfodict[infokey]) + 1 ) + newInfo.append("=".join([infokey,newPools])) + addNPools = False + else: + continue + if ( addNPools ): + newInfo.append("nPools=1") + vline[vcf_header.index("INFO")] = ";".join(newInfo) + vdict[poskey] = vline + return vdict + +def incorporateQuality(vdict, fields): + poskey = keyFromFields(fields) + vfields = vdict[poskey] + qual_index = vcf_header.index("QUAL") + vfields[qual_index] = str( float(vfields[qual_index]) + float(fields[qual_index]) ) + vdict[poskey] = vfields + return vdict + +def bestEffortMerge(call_dict,vcf_to_merge_path): + for line in open(vcf_to_merge_path).readlines(): + if ( line.startswith("#") ): + continue + else: + vcf_fields = line.strip().split() + if ( not inVCFDict(call_dict, vcf_fields) ): + call_dict = addToVCFDict(call_dict,vcf_fields) + else: + vcf_info_dict = parseInfoField(vcf_fields[vcf_header.index("INFO")]) + call_dict = incorporateNewInfo(call_dict,vcf_info_dict,vcf_fields) + call_dict = incorporateQuality(call_dict,vcf_fields) + return call_dict + +def dictToFile(vdict, filepath,source,samplename): + file = open(filepath,'w') + file.write("##source="+source) + file.write("##version=VCRv3.2") + file.write("#"+"\t".join(vcf_header)+"\t"+samplename) + if ( vdict ): + for key in vdict.keys(): + file.write("\t".join(vdict[key])+"\n") + file.close() + else: + print("dict had no keys for "+filepath) + +# make the best effort VCF +if ( make_best_effort_vcf ): + best_thresh_fields = ["QUAL"] + best_thresh_val = [7] + best_thresh_gt = [True] + prev_extension = "_annotated" + for project in working_projects: + best_effort_vcf = bestEffortVCF(project) + best_effort_dict = dict() + sample_list = projects_to_samples[project] + for sample in sample_list: + pool = samples_to_pool_name[sample] + in_vcf = homeVCF(project,pool,prev_extension) + outvcf = homeVCF(project,pool,"_temporary") + makeThresholdVCF(in_vcf,outvcf,best_thresh_fields,best_thresh_val,best_thresh_gt) + print("dict about to update, size is currently: "+str(len(best_effort_dict))) + best_effort_dict = bestEffortMerge(best_effort_dict,outvcf) + print("updated size: "+str(len(best_effort_dict))) + subSystem("rm "+outvcf) + dictToFile(best_effort_dict,best_effort_vcf,"make_best_effort_vcf",project+"_best_effort") + +if ( snp_select_best_effort_vcf ): + snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 20 --plottable -f "+",".join(working_info_fields) + for project in working_project: + best_effort_vcf = bestEffortVCF(project) + snp_recal_vcf = homeVCF(project,project,"_best_effort_selected") + log = annotationProjectFile(project,project,"_best_effort_selected.log")