gatk-3.8/python/collectCalls.py

534 lines
21 KiB
Python
Executable File

#!/usr/bin/env python
# imports
import subprocess
import os
import sys
import re
import time
import math
import codecs
## script keys and triggers
remove_all_files = False # be very careful with this one
make_directories = True
copy_vcf_files = True
annotate_vcf_files = True
run_snp_selector = False
make_hard_threshold_vcf = False
snp_select_threshold_vcf = False
generate_hapmap_info = False
generate_threshold_hapmap_info = False
false_negatives_by_snp_selector = False
false_negatives_by_snp_selector_on_threshold = False
plot_info_field_metrics = False
plot_info_field_metrics_threshold = False
make_best_effort_vcf = False
snp_select_best_effort_vcf = False
plot_best_effort_metrics = False
## global stuff
DEBUG = True
override_pilot_bamfile = True
override_production_bamfile = True
matlab_dir = "/humgen/gsa-scr1/projects/FHS/scripts"
pipeline_samples_info_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/samples.tsv"
samples_pipeline_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/"
home_dir = "/humgen/gsa-scr1/projects/FHS/"
annotations_data_base = home_dir + "oneOffAnalyses/annotationEffectiveness/data/"
figures_base = home_dir + "oneOffAnalyses/annotationEffectiveness/figures/"
production_calls_dir = home_dir + "production/raw_production_calls_12_07/"
pilot_calls_dir = home_dir + "pilot/calls/raw_pilot_calls_12_07/"
hapmap_pool_info_dir = home_dir+"pilot/project_info/pilot_pool_information/"
pilot_standard_project_name = "framinghampilot"
pilot_clipped_project_name = "pilot_clipped"
pilot_no_gatk_project_name = "pilot_no_gatk"
FHS_project_name = "FHS"
FHS_round_2_project_name = "FHS_round_2"
all_projects = [pilot_standard_project_name,pilot_clipped_project_name,pilot_no_gatk_project_name,FHS_project_name,FHS_round_2_project_name]
pilot_projects = [pilot_standard_project_name,pilot_clipped_project_name,pilot_no_gatk_project_name]
production_projects = [FHS_project_name,FHS_round_2_project_name]
gatk_cmd_base = "java -jar /humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -L /humgen/gsa-scr1/projects/FHS/interval_lists/FHS_exons_only.interval_list"
dbsnp_129_path = "/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod"
vcf_header = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
def subSystem(cmd):
if ( DEBUG ):
print(cmd)
else:
os.system(cmd)
def safeRemove(cmd):
if ( not ( cmd.startswith("rm -rf "+annotations_data_base) or cmd.startswith("rm -rf "+production_calls_dir) or cmd.startswith("rm -rf "+pilot_calls_dir) or cmd.startswith("rm -rf "+figures_base) ) ):
print("Unsafe removal attempt: "+cmd)
else:
subSystem(cmd)
def productionBamfilePicard(proj, samp):
for line in open("/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/framingham.flowcell_lanes.tsv").readlines():
spline = line.strip().split()
if ( spline[1] == proj and spline[2] == samp ):
return spline[5]+spline[3]+"."+spline[4]+".aligned.bam"
print("no return for: "+proj+"_"+samp)
def annotationProjectPath(proj):
return annotations_data_base+proj+"/"
def annotationPath(proj,pool):
return annotationProjectPath(proj)+pool+"/"
def annotationProjectFile(proj, pool, appender):
return annotationPath(proj,pool)+pool+appender
def figureProjectPath(proj):
return figures_base+proj+"/"
def figurePath(proj, samp):
return figureProjectPath(proj)+samp+"/"
def pipelineProjectPath(proj):
return samples_pipeline_path+proj+"/"
def pipelinePath(proj, samp):
return pipelineProjectPath(proj)+samp+"/"
def pipelineBam(proj, samp):
if ( proj == "framinghampilot" and override_pilot_bamfile ):
if ( samp == "CEPH1" ):
return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/1/Solexa-10930/302JDAAXX.1.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/5/Solexa-10933/302JDAAXX.5.aligned.bam"
if ( samp == "CEPH2" ):
return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/2/Solexa-10931/302JDAAXX.2.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/8/Solexa-10934/302JDAAXX.8.aligned.bam"
if ( samp == "CEPH3" ):
return "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/3/Solexa-10932/302JDAAXX.3.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/6/Solexa-10936/302JDAAXX.6.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/7/Solexa-10935/302JDAAXX.7.aligned.bam"
if ( proj in production_projects and override_production_bamfile ):
return productionBamfilePicard(proj,samp)
return pipelinePath(proj,samp)+proj+"."+samp+".bam"
def pipelineVCF(proj,samp):
return pipelinePath(proj,samp)+samp+".vcf"
def homePath(proj, samp):
if ( proj in pilot_projects ):
return pilot_calls_dir+proj+"/"
else:
return production_calls_dir+proj+"/"
def homeProjectFile(proj, samp, appender):
return homePath(proj,samp)+samp+appender
def homeVCF(proj, samp, extension):
return homeProjectFile(proj,samp,extension)+".vcf"
def hapmapVCF(proj, samp):
return "/humgen/gsa-scr1/projects/FHS/pilot/analysis/"+samp.lower()+"_hapmap_snp_only.vcf"
def poolSize(proj):
if ( proj in pilot_projects ):
return 40
else:
return 28
def poolBinding(proj, samp):
if ( proj in production_projects ):
raise Exception, "Production projects do not have hapmap pool bindings"
else:
return hapmap_pool_info_dir+samp+".pool.path"
def poolSampleNames(proj,samp):
if ( proj in production_projects ):
raise Exception, "Production projects do not have hapmap pool bindings"
else:
return hapmap_pool_info_dir+samp+".pool"
## working stuff
#working_projects = [pilot_standard_project_name]
working_projects = [FHS_round_2_project_name]
working_info_fields = ["syzy_SB","syzy_DP","syzy_NMMR","HRun"]
working_quality_scores = ["0","1","2","3","4","5"]
working_samples = set()
projects_to_samples = []
# create the project --> list of samples dictionary
for project in working_projects:
sample_list = os.listdir(pipelineProjectPath(project))
projects_to_samples.append([project, sample_list])
for s in sample_list:
working_samples.add(s)
projects_to_samples = dict(projects_to_samples)
# create the sample --> pool name dictionary
samples_to_pool_name = []
for line in open(pipeline_samples_info_path).readlines():
spline = line.strip().split()
if(spline[0] in working_samples):
keyval = [spline[0], spline[1]]
samples_to_pool_name.append(keyval)
samples_to_pool_name = dict(samples_to_pool_name)
# remove everything
if ( remove_all_files ):
for project in working_projects:
safeRemove("rm -rf "+homePath(project,"") )
safeRemove("rm -rf "+annotationProjectPath(project))
safeRemove("rm -rf "+figureProjectPath(project))
# make directories if necessary
if ( make_directories ):
for project in working_projects:
subSystem("mkdir "+homePath(project,""))
subSystem("mkdir "+annotationProjectPath(project))
subSystem("mkdir "+figureProjectPath(project))
for sample in projects_to_samples[project]:
subSystem("mkdir "+annotationPath(project, samples_to_pool_name[sample]))
subSystem("mkdir "+figurePath(project,samples_to_pool_name[sample]))
# copy the vcf files from the pipeline into the /FHS/ directory
if ( copy_vcf_files ):
for project in working_projects:
sample_list = projects_to_samples[project]
for sample in sample_list:
pool = samples_to_pool_name[sample]
sample_home_vcf = homeVCF(project,samples_to_pool_name[sample],"_raw")
pipeline_vcf = pipelineVCF(project,sample)
subSystem("cat "+pipeline_vcf+" | sed 's/"+sample+"/"+pool+"/g' > "+sample_home_vcf)
# annotate the vcf files using VariantAnnotator
if ( annotate_vcf_files ):
prior_step_extension = "_raw"
this_step_extension = "_annotated"
gatk_annotate_base = gatk_cmd_base + " -T VariantAnnotator -exp -D "+dbsnp_129_path
for project in working_projects:
sample_list = projects_to_samples[project]
for sample in sample_list:
pool = samples_to_pool_name[sample]
inputVCF = homeVCF(project, pool, prior_step_extension)
outputVCF = homeVCF(project, pool, this_step_extension)
bamfile = pipelineBam(project, sample)
gatk_args = " -I "+bamfile+" -B variant,VCF,"+inputVCF+" -vcf "+outputVCF
subSystem("bsub -q gsa "+gatk_annotate_base + gatk_args)
# definition for next section (and another one further down)
def runSnpSelector(project_list,prior_step_annotation,this_step_annotation,info_field_list):
snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 10 --plottable"
for project in project_list:
sample_list = projects_to_samples[project]
for sample in sample_list:
pool = samples_to_pool_name[sample]
if ( project in pilot_projects ):
truth_arg = " -t "+hapmapVCF(project,sample)
else:
truth_arg = " -titv=3.6"
for info_field in info_field_list:
input_vcf = homeVCF(project,pool,prior_step_extension)
output_vcf = homeVCF(project,pool,this_step_annotation+"_"+info_field)
log_output = annotationProjectFile(project,pool,this_step_annotation+"_"+info_field+".log")
snp_selector_args = truth_arg+" -o "+output_vcf + " -l "+log_output + " -f "+info_field+" "+input_vcf
subSystem(snp_selector_base+snp_selector_args)
# run snp selector to analyze info fields
if ( run_snp_selector ):
prior_step_extension = "_annotated"
this_step_extension = "_snpsel"
runSnpSelector(working_projects,prior_step_extension, this_step_extension, working_info_fields)
# definition for the next stage
def parseInfoField(info):
info_list = info.split(";")
info_dict = []
for info_field in info_list:
keyval = info_field.split("=")
key = keyval[0]
val = float(keyval[1])
info_dict.append([key, val])
return dict(info_dict)
def vcfLinePassesThreshold(line, fields, vals,greaterthan):
if( line.startswith('#') ):
return True
else:
# parse the line
general_fields = line.split()
info_dict = parseInfoField(general_fields[vcf_header.index("INFO")])
header_set = set(vcf_header)
# compare with thresholds
for j in range(len(fields)):
if ( fields[j] in header_set ):
if ( float(general_fields[vcf_header.index(fields[j])]) > vals[j] ):
if ( not greaterthan[j] ):
return False
else:
continue
else:
if ( greaterthan[j] ):
return False
else:
continue
elif ( fields[j] in info_dict ):
if ( float(info_dict[fields[j]]) > vals[j] ):
if ( not greaterthan[j] ):
return False
else:
continue
else:
if ( greaterthan[j] ):
return False
else:
continue
else:
print("Field not found "+fields[j])
return True
def makeThresholdVCF(prev_vcf,this_vcf,fields,vals,greater):
if ( DEBUG ):
filtered_lines = 0
else:
out_vcf = open(this_vcf,'w')
print("open for writing: "+this_vcf)
wrotelines = 0
for line in open(prev_vcf).readlines():
if ( vcfLinePassesThreshold(line,fields,vals,greater) ):
if ( DEBUG ):
filtered_lines = filtered_lines + 1
else:
out_vcf.write(line)
wrotelines=wrotelines+1
if ( DEBUG ):
print(this_vcf+" would have filtered "+str(filtered_lines)+" variants")
else:
out_vcf.close()
# make hard-threshold vcf files ( to see if a more strenuous set with less noise helps snp selector determine an ordering )
if ( make_hard_threshold_vcf ):
prior_step_extension = "_annotated"
this_step_extension = "_hard_threshold"
threshold_fields = ["QUAL", "syzy_NMMR","syzy_DP"]
threshold_values = [15, 0.4,280]
threshold_greaterthan = [True, False,True]
for project in working_projects:
sample_list = projects_to_samples[project]
for sample in sample_list:
pool = samples_to_pool_name[sample]
prev_vcf = homeVCF(project,pool,prior_step_extension)
this_vcf = homeVCF(project,pool,this_step_extension)
makeThresholdVCF(prev_vcf,this_vcf,threshold_fields,threshold_values,greaterthan)
# run snp selector on hard-threshold vcf files
if ( snp_select_threshold_vcf ):
prior_step_extension = "_hard_threshold"
this_step_extension = "_hard_snpsel"
runSnpSelector(working_projects, prior_step_extension, this_step_extension, working_info_fields)
# definitions for next few steps
def hapmapInfoFile(proj,pool,info,qscore):
return annotationProjectFile(project,pool,"_"+info+"_filter_at_q_"+qscore+"_hapmap_info.txt")
def generateHapmapInfo(extension, qscorelist):
gatk_hapmap_info = gatk_cmd_base+" -T HapmapPoolAllelicInfo"
for project in working_projects:
sample_list = projects_to_samples[project]
for sample in sample_list:
pool = samples_to_pool_name[sample]
for infofield in working_info_fields:
inputVCF = homeVCF(project,pool,prior_step_extension+"_"+infofield)
inputBam = pipelineBam(project, sample)
ps = str(poolSize(project))
for qscore in qscorelist:
outputFile = hapmapInfoFile(project, pool, infofield, qscore)
gatk_args = " -ps "+ps+" -I "+inputBam+" -of "+outputFile+" -B "+pool+",VCF,"+inputVCF+" -B "+poolBindings(project,sample)+" -samples "+poolSampleNames(project,sample)+" -q "+qscore
subSystem("bsub -q gsa "+gatk_hapmap_info+gatk_args)
# generate hapmap (false postive/false negative) info files on filtered vcfs
if ( generate_hapmap_info ):
prior_step_extension = "_snpsel"
generateHapmapInfo(prior_step_extension, working_quality_scores)
if ( generate_threshold_hapmap_info ):
prior_step_extension = "_hard_snpsel"
generateHapmapInfo(prior_step_extension, working_quality_scores)
# definitions for next section
def falsenegSnpSelect(invcf, false_neg_vcf, qscore):
tmp_vcf = "tmp.vcf"
tmp_recal_vcf = "tmp2.vcf"
tmp_log = "tmp.log"
snpsel_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 30 --plottable -l "+tmp_log+" -o "+tmp_recal_vcf+" --FNOutputVCF="+false_neg_vcf+" "+tmp_vcf
cmd = "cat "+invcf+" | awk '{if ( substr($1,0,1) == '#' || $5 > "+qscore+" ) print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' | sed 's/ /\t/g' > "+tmp.vcf
subSystem(cmd)
subSystem(snpsel_base)
cmd = "rm "+tmp_vcf
subSystem(cmd)
cmd = "rm "+tmp_recal_vcf
subSystem(cmd)
cmd = "rm "+tmp_log
subSystem(cmd)
def falseNegsBySnpSelector(extension):
for project in working_projects:
sample_list = projects_to_samples[project]
for sample in sample_list:
pool = samples_to_pool_name[sample]
for infofield in working_info_fields:
input_vcf = homeVCF(project,pool,extension+"_"+infofield)
for qscore in working_quality_scores:
false_neg_vcf = annotationProjectFile(project, pool, "_"+infofield+"_q_"+qscore+"_false_negs.vcf")
falsenegSnpSelect(input_vcf,false_neg_vcf,qscore)
# generate false negative sites in a vcf file via snp selector
if ( false_negatives_by_snp_selector):
prior_step_extension = "_snpsel"
falseNegsBySnpSelector(prior_step_extension)
# generate false negative sites ina vcf file on the thresholded vcfs
if ( false_negatives_by_snp_selector_on_threshold ):
prior_step_extension = "_hard_snpsel"
falseNegsBySnpSelector(prior_step_extension)
# definition for matlab section
def makeMatlabMetricPlot(extension):
for project in working_projects:
sample_list = projects_to_samples[project]
for sample in sample_list:
pool = samples_to_pool_name[sample]
for infofield in working_info_fields:
log_output = annotationProjectFile(project,pool,extension+"_"+infofield+".log")
figureName = figurePath(project,pool)+pool+extension
cmd = "matlab -r \"cd "+matlab_dir+" , snpSelectorPlots('"+log_output+"','"+figureName+"') , exit\""
subSystem(cmd)
# make matlab plots of the metrics (generated from the log files)
if ( plot_info_field_metrics ):
prior_step_extension = '_snpsel'
makeMatlabMetricPlot(prior_step_extension)
# make matlab plots of the metrics from the hard-thresholded log files
if ( plot_info_field_metrics_threshold ):
prior_step_extension = "_hard_snpsel"
makeMatlabMetricPlot(prior_step_extension)
# definitions for best effort section
def bestEffortVCF(proj):
bevcf_path = homeVCF(proj,proj,"_best_effort")
#bevcf = open(bevcf_path,'w')
#bevcf.write("##source=best_effort_vcf")
#bevcf.write("##format=VCRv3.2")
#bevcf.write("#"+"\t".join(vcf_header)+"\t"+proj+"_combined_calls")
#bevcf.close()
return bevcf_path
def keyFromFields(fields):
return ":".join([fields[0],fields[1]])
def inVCFDict(vdict, fields):
return ( keyFromFields(fields) in vdict )
def addToVCFDict(vdict, fields):
vdict[keyFromFields(fields)] = fields
return vdict
def updateInfoField(key, info1, info2):
if ( key == "HRun" ): # this one doesn't change
return [key, str(info1)]
else:
return [key, str( (float(info1) + float(info2))/2 )]
def incorporateNewInfo(vdict, infodict, fields):
addNPools = True
poskey = keyFromFields(fields)
vline = vdict[poskey]
vinfodict = parseInfoField(vline[vcf_header.index("INFO")])
newInfo = []
all_keys = set(vinfodict.keys()).union(set(infodict.keys()))
for infokey in all_keys:
if ( infokey in vinfodict and infokey in infodict ):
newInfo.append("=".join(updateInfoField(infokey, vinfodict[infokey], infodict[infokey]) ) )
elif ( infokey in infodict ):
newInfo.append("=".join([infokey, str(infodict[infokey])]) )
elif ( infokey == "nPools" ):
newPools = str( int(vinfodict[infokey]) + 1 )
newInfo.append("=".join([infokey,newPools]))
addNPools = False
else:
continue
if ( addNPools ):
newInfo.append("nPools=1")
vline[vcf_header.index("INFO")] = ";".join(newInfo)
vdict[poskey] = vline
return vdict
def incorporateQuality(vdict, fields):
poskey = keyFromFields(fields)
vfields = vdict[poskey]
qual_index = vcf_header.index("QUAL")
vfields[qual_index] = str( float(vfields[qual_index]) + float(fields[qual_index]) )
vdict[poskey] = vfields
return vdict
def bestEffortMerge(call_dict,vcf_to_merge_path):
for line in open(vcf_to_merge_path).readlines():
if ( line.startswith("#") ):
continue
else:
vcf_fields = line.strip().split()
if ( not inVCFDict(call_dict, vcf_fields) ):
call_dict = addToVCFDict(call_dict,vcf_fields)
else:
vcf_info_dict = parseInfoField(vcf_fields[vcf_header.index("INFO")])
call_dict = incorporateNewInfo(call_dict,vcf_info_dict,vcf_fields)
call_dict = incorporateQuality(call_dict,vcf_fields)
return call_dict
def dictToFile(vdict, filepath,source,samplename):
file = open(filepath,'w')
file.write("##source="+source)
file.write("##version=VCRv3.2")
file.write("#"+"\t".join(vcf_header)+"\t"+samplename)
if ( vdict ):
for key in vdict.keys():
file.write("\t".join(vdict[key])+"\n")
file.close()
else:
print("dict had no keys for "+filepath)
# make the best effort VCF
if ( make_best_effort_vcf ):
best_thresh_fields = ["QUAL"]
best_thresh_val = [7]
best_thresh_gt = [True]
prev_extension = "_annotated"
for project in working_projects:
best_effort_vcf = bestEffortVCF(project)
best_effort_dict = dict()
sample_list = projects_to_samples[project]
for sample in sample_list:
pool = samples_to_pool_name[sample]
in_vcf = homeVCF(project,pool,prev_extension)
outvcf = homeVCF(project,pool,"_temporary")
makeThresholdVCF(in_vcf,outvcf,best_thresh_fields,best_thresh_val,best_thresh_gt)
print("dict about to update, size is currently: "+str(len(best_effort_dict)))
best_effort_dict = bestEffortMerge(best_effort_dict,outvcf)
print("updated size: "+str(len(best_effort_dict)))
subSystem("rm "+outvcf)
dictToFile(best_effort_dict,best_effort_vcf,"make_best_effort_vcf",project+"_best_effort")
if ( snp_select_best_effort_vcf ):
snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 20 --plottable -f "+",".join(working_info_fields)
for project in working_project:
best_effort_vcf = bestEffortVCF(project)
snp_recal_vcf = homeVCF(project,project,"_best_effort_selected")
log = annotationProjectFile(project,project,"_best_effort_selected.log")