534 lines
21 KiB
Python
534 lines
21 KiB
Python
|
|
#!/usr/bin/env python
|
||
|
|
# imports
|
||
|
|
import subprocess
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import re
|
||
|
|
import time
|
||
|
|
import math
|
||
|
|
import codecs
|
||
|
|
## script keys and triggers
#
# Each flag enables one stage of the pipeline below; stages appear in the
# order they would normally run.

# Danger: wipes previously generated data directories.
remove_all_files = False  # be very careful with this one

# Setup / input staging.
make_directories = True
copy_vcf_files = True
annotate_vcf_files = True

# snpSelector analysis of the annotated calls.
run_snp_selector = False

# Hard-threshold filtering and re-analysis.
make_hard_threshold_vcf = False
snp_select_threshold_vcf = False

# Hapmap (false positive / false negative) info generation.
generate_hapmap_info = False
generate_threshold_hapmap_info = False

# False-negative extraction via snpSelector.
false_negatives_by_snp_selector = False
false_negatives_by_snp_selector_on_threshold = False

# Matlab metric plotting.
plot_info_field_metrics = False
plot_info_field_metrics_threshold = False

# "Best effort" combined VCF generation and analysis.
make_best_effort_vcf = False
snp_select_best_effort_vcf = False
plot_best_effort_metrics = False
|
||
|
|
|
||
|
|
|
||
|
|
## global stuff

# When True, commands are printed instead of executed (see subSystem).
DEBUG = True

# Use Picard-produced bam paths instead of the pipeline's own bams.
override_pilot_bamfile = True
override_production_bamfile = True

# Tool / metadata locations.
matlab_dir = "/humgen/gsa-scr1/projects/FHS/scripts"
pipeline_samples_info_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/samples.tsv"
samples_pipeline_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/"

# Project directory layout.
home_dir = "/humgen/gsa-scr1/projects/FHS/"
annotations_data_base = home_dir + "oneOffAnalyses/annotationEffectiveness/data/"
figures_base = home_dir + "oneOffAnalyses/annotationEffectiveness/figures/"
production_calls_dir = home_dir + "production/raw_production_calls_12_07/"
pilot_calls_dir = home_dir + "pilot/calls/raw_pilot_calls_12_07/"
hapmap_pool_info_dir = home_dir + "pilot/project_info/pilot_pool_information/"

# Project names.
pilot_standard_project_name = "framinghampilot"
pilot_clipped_project_name = "pilot_clipped"
pilot_no_gatk_project_name = "pilot_no_gatk"
FHS_project_name = "FHS"
FHS_round_2_project_name = "FHS_round_2"

pilot_projects = [pilot_standard_project_name, pilot_clipped_project_name, pilot_no_gatk_project_name]
production_projects = [FHS_project_name, FHS_round_2_project_name]
# Same membership and order as the original hand-written list.
all_projects = pilot_projects + production_projects

# Base GATK invocation (reference + exon interval list) shared by every GATK step.
gatk_cmd_base = "java -jar /humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -L /humgen/gsa-scr1/projects/FHS/interval_lists/FHS_exons_only.interval_list"
dbsnp_129_path = "/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod"

# Column names of the fixed VCF fields, in file order.
vcf_header = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
|
||
|
|
|
||
|
|
def subSystem(cmd):
    """Run *cmd* through the shell, or just echo it when DEBUG is set."""
    if DEBUG:
        print(cmd)
    else:
        os.system(cmd)
|
||
|
|
|
||
|
|
def safeRemove(cmd):
    """Execute an "rm -rf" only when it targets one of this script's own
    output trees; refuse (and report) anything else."""
    allowed_prefixes = (
        "rm -rf " + annotations_data_base,
        "rm -rf " + production_calls_dir,
        "rm -rf " + pilot_calls_dir,
        "rm -rf " + figures_base,
    )
    if cmd.startswith(allowed_prefixes):
        subSystem(cmd)
    else:
        print("Unsafe removal attempt: "+cmd)
|
||
|
|
|
||
|
|
def productionBamfilePicard(proj, samp):
    """Look up the Picard-aligned bam path for (proj, samp) in the
    flowcell-lane manifest; prints a warning and returns None when no row
    matches."""
    manifest = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/framingham.flowcell_lanes.tsv"
    for row in open(manifest).readlines():
        cols = row.strip().split()
        if cols[1] == proj and cols[2] == samp:
            # cols[5]+cols[3]+"."+cols[4] — presumably dir + flowcell + lane;
            # verify against the manifest columns.
            return cols[5] + cols[3] + "." + cols[4] + ".aligned.bam"
    print("no return for: "+proj+"_"+samp)
|
||
|
|
|
||
|
|
def annotationProjectPath(proj):
    """Annotation-data directory for *proj*."""
    return annotations_data_base + proj + "/"


def annotationPath(proj, pool):
    """Annotation-data directory for one pool of *proj*."""
    return annotationProjectPath(proj) + pool + "/"


def annotationProjectFile(proj, pool, appender):
    """Annotation-data file "<pool><appender>" inside the pool directory."""
    return annotationPath(proj, pool) + pool + appender


def figureProjectPath(proj):
    """Figure directory for *proj*."""
    return figures_base + proj + "/"


def figurePath(proj, samp):
    """Figure directory for one sample/pool of *proj*."""
    return figureProjectPath(proj) + samp + "/"


def pipelineProjectPath(proj):
    """Pipeline output directory for *proj*."""
    return samples_pipeline_path + proj + "/"


def pipelinePath(proj, samp):
    """Pipeline output directory for one sample of *proj*."""
    return pipelineProjectPath(proj) + samp + "/"
|
||
|
|
|
||
|
|
def pipelineBam(proj, samp):
    """Return the bam path (or a pre-joined "-I"-separated list of bam paths)
    to use for (proj, samp), honouring the override flags."""
    if proj == "framinghampilot" and override_pilot_bamfile:
        # Hand-picked Picard lane bams for the three CEPH pilot pools; the
        # embedded " -I " segments splice extra inputs into the GATK command.
        ceph_lanes = {
            "CEPH1": "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/1/Solexa-10930/302JDAAXX.1.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/5/Solexa-10933/302JDAAXX.5.aligned.bam",
            "CEPH2": "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/2/Solexa-10931/302JDAAXX.2.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/8/Solexa-10934/302JDAAXX.8.aligned.bam",
            "CEPH3": "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/3/Solexa-10932/302JDAAXX.3.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/6/Solexa-10936/302JDAAXX.6.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/7/Solexa-10935/302JDAAXX.7.aligned.bam",
        }
        if samp in ceph_lanes:
            return ceph_lanes[samp]
    if proj in production_projects and override_production_bamfile:
        return productionBamfilePicard(proj, samp)
    return pipelinePath(proj, samp) + proj + "." + samp + ".bam"
|
||
|
|
|
||
|
|
def pipelineVCF(proj, samp):
    """Raw VCF produced by the pipeline for (proj, samp)."""
    return pipelinePath(proj, samp) + samp + ".vcf"
|
||
|
|
|
||
|
|
def homePath(proj, samp):
    """Local calls directory for *proj* (samp is unused, kept for the
    callers that pass it)."""
    base = pilot_calls_dir if proj in pilot_projects else production_calls_dir
    return base + proj + "/"


def homeProjectFile(proj, samp, appender):
    """File "<samp><appender>" inside the project's local calls directory."""
    return homePath(proj, samp) + samp + appender


def homeVCF(proj, samp, extension):
    """VCF "<samp><extension>.vcf" inside the project's local calls directory."""
    return homeProjectFile(proj, samp, extension) + ".vcf"
|
||
|
|
|
||
|
|
def hapmapVCF(proj, samp):
    """Hapmap truth VCF for a pilot sample (proj is unused; the path is
    keyed on the lower-cased sample name only)."""
    return "/humgen/gsa-scr1/projects/FHS/pilot/analysis/%s_hapmap_snp_only.vcf" % samp.lower()
|
||
|
|
|
||
|
|
def poolSize(proj):
    """Individuals per pool: 40 for pilot projects, 28 for production."""
    return 40 if proj in pilot_projects else 28
|
||
|
|
|
||
|
|
def poolBinding(proj, samp):
    """Path of the hapmap pool-binding file for a pilot sample.

    Raises Exception for production projects, which have no hapmap pools.
    Fix: was the Python-2-only "raise Exception, msg" statement form; the
    call form is equivalent and valid on both Python 2 and 3.
    """
    if proj in production_projects:
        raise Exception("Production projects do not have hapmap pool bindings")
    return hapmap_pool_info_dir + samp + ".pool.path"


def poolSampleNames(proj, samp):
    """Path of the hapmap pool sample-name list for a pilot sample.

    Raises Exception for production projects, which have no hapmap pools.
    """
    if proj in production_projects:
        raise Exception("Production projects do not have hapmap pool bindings")
    return hapmap_pool_info_dir + samp + ".pool"
|
||
|
|
|
||
|
|
|
||
|
|
## working stuff

#working_projects = [pilot_standard_project_name]
working_projects = [FHS_round_2_project_name]
working_info_fields = ["syzy_SB","syzy_DP","syzy_NMMR","HRun"]
working_quality_scores = ["0","1","2","3","4","5"]

# project name -> list of sample names found in the pipeline output tree
projects_to_samples = {}
# every sample name seen across all working projects
working_samples = set()

for project in working_projects:
    samples_here = os.listdir(pipelineProjectPath(project))
    projects_to_samples[project] = samples_here
    working_samples.update(samples_here)

# sample name -> pool name, read from the pipeline's samples.tsv
# (column 0 is the sample; column 1 is used as its pool name downstream)
samples_to_pool_name = {}
for line in open(pipeline_samples_info_path).readlines():
    cols = line.strip().split()
    if cols[0] in working_samples:
        samples_to_pool_name[cols[0]] = cols[1]
|
||
|
|
|
||
|
|
# remove everything (guarded by safeRemove so only our own trees can go)
if remove_all_files:
    for project in working_projects:
        safeRemove("rm -rf "+homePath(project, ""))
        safeRemove("rm -rf "+annotationProjectPath(project))
        safeRemove("rm -rf "+figureProjectPath(project))

# make directories if necessary
if make_directories:
    for project in working_projects:
        subSystem("mkdir "+homePath(project, ""))
        subSystem("mkdir "+annotationProjectPath(project))
        subSystem("mkdir "+figureProjectPath(project))
        # one annotation and one figure subdirectory per pool
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            subSystem("mkdir "+annotationPath(project, pool))
            subSystem("mkdir "+figurePath(project, pool))
|
||
|
|
|
||
|
|
# copy the vcf files from the pipeline into the /FHS/ directory, rewriting
# every occurrence of the sample name to its pool name on the way through
if copy_vcf_files:
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            sample_home_vcf = homeVCF(project, pool, "_raw")
            pipeline_vcf = pipelineVCF(project, sample)
            subSystem("cat "+pipeline_vcf+" | sed 's/"+sample+"/"+pool+"/g' > "+sample_home_vcf)
|
||
|
|
|
||
|
|
# annotate the vcf files using VariantAnnotator (dbSNP + expression annotations),
# one LSF job per pool
if annotate_vcf_files:
    prior_step_extension = "_raw"
    this_step_extension = "_annotated"
    gatk_annotate_base = gatk_cmd_base + " -T VariantAnnotator -exp -D "+dbsnp_129_path
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            inputVCF = homeVCF(project, pool, prior_step_extension)
            outputVCF = homeVCF(project, pool, this_step_extension)
            bamfile = pipelineBam(project, sample)
            gatk_args = " -I "+bamfile+" -B variant,VCF,"+inputVCF+" -vcf "+outputVCF
            subSystem("bsub -q gsa "+gatk_annotate_base + gatk_args)
|
||
|
|
|
||
|
|
# definition for next section (and another one further down)

def runSnpSelector(project_list, prior_step_annotation, this_step_annotation, info_field_list):
    """Run snpSelector.py once per (pool, info field) over *project_list*.

    Reads  <pool><prior_step_annotation>.vcf
    Writes <pool><this_step_annotation>_<field>.vcf plus a matching .log.
    Pilot projects are scored against their hapmap truth VCF; production
    projects fall back to an expected Ti/Tv ratio of 3.6.

    Bug fix: the input VCF was previously built from the module-level
    prior_step_extension global instead of the prior_step_annotation
    parameter (the two only happened to coincide at the existing call
    sites, so the parameter was silently ignored).
    """
    snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 10 --plottable"
    for project in project_list:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            if project in pilot_projects:
                truth_arg = " -t "+hapmapVCF(project, sample)
            else:
                truth_arg = " -titv=3.6"
            for info_field in info_field_list:
                input_vcf = homeVCF(project, pool, prior_step_annotation)
                output_vcf = homeVCF(project, pool, this_step_annotation+"_"+info_field)
                log_output = annotationProjectFile(project, pool, this_step_annotation+"_"+info_field+".log")
                snp_selector_args = truth_arg+" -o "+output_vcf + " -l "+log_output + " -f "+info_field+" "+input_vcf
                subSystem(snp_selector_base+snp_selector_args)
|
||
|
|
|
||
|
|
# run snp selector to analyze info fields
if run_snp_selector:
    prior_step_extension = "_annotated"
    this_step_extension = "_snpsel"
    runSnpSelector(working_projects, prior_step_extension, this_step_extension, working_info_fields)
|
||
|
|
|
||
|
|
# definition for the next stage

def parseInfoField(info):
    """Parse a VCF INFO string ("k1=v1;k2=v2;...") into {key: float(value)}.

    Every entry must be key=value with a numeric value; anything else
    raises, exactly as the original loop-based version did.
    """
    parsed = {}
    for pair in info.split(";"):
        kv = pair.split("=")
        parsed[kv[0]] = float(kv[1])
    return parsed
|
||
|
|
|
||
|
|
|
||
|
|
def vcfLinePassesThreshold(line, fields, vals, greaterthan):
    """Check one VCF line against a set of per-field thresholds.

    For each fields[j], the value is taken from the fixed VCF columns when
    the name is one of vcf_header, otherwise from the parsed INFO dict.
    The line passes field j when (value > vals[j]) matches greaterthan[j];
    unknown field names are reported and skipped.  Header lines ('#...')
    always pass.
    """
    if line.startswith('#'):
        return True
    columns = line.split()
    info_dict = parseInfoField(columns[vcf_header.index("INFO")])
    fixed_columns = set(vcf_header)
    for j in range(len(fields)):
        if fields[j] in fixed_columns:
            value = float(columns[vcf_header.index(fields[j])])
        elif fields[j] in info_dict:
            value = float(info_dict[fields[j]])
        else:
            print("Field not found "+fields[j])
            continue
        # fail as soon as the value is on the wrong side of the threshold
        if (value > vals[j]) != greaterthan[j]:
            return False
    return True
|
||
|
|
|
||
|
|
def makeThresholdVCF(prev_vcf, this_vcf, fields, vals, greater):
    """Copy the lines of prev_vcf that pass vcfLinePassesThreshold into
    this_vcf.

    In DEBUG mode nothing is written; the number of passing lines is
    printed instead (note: the message says "filtered", but the count is
    of lines that PASS the thresholds — kept as in the original).
    """
    if DEBUG:
        kept = 0
    else:
        out_vcf = open(this_vcf, 'w')
        print("open for writing: "+this_vcf)
        written = 0
    for line in open(prev_vcf).readlines():
        if vcfLinePassesThreshold(line, fields, vals, greater):
            if DEBUG:
                kept = kept + 1
            else:
                out_vcf.write(line)
                written = written + 1
    if DEBUG:
        print(this_vcf+" would have filtered "+str(kept)+" variants")
    else:
        out_vcf.close()
|
||
|
|
|
||
|
|
# make hard-threshold vcf files ( to see if a more strenuous set with less
# noise helps snp selector determine an ordering )
if make_hard_threshold_vcf:
    prior_step_extension = "_annotated"
    this_step_extension = "_hard_threshold"
    threshold_fields = ["QUAL", "syzy_NMMR", "syzy_DP"]
    threshold_values = [15, 0.4, 280]
    threshold_greaterthan = [True, False, True]
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            prev_vcf = homeVCF(project, pool, prior_step_extension)
            this_vcf = homeVCF(project, pool, this_step_extension)
            # Bug fix: previously passed the undefined name "greaterthan",
            # which raised NameError as soon as this stage was enabled.
            makeThresholdVCF(prev_vcf, this_vcf, threshold_fields,
                             threshold_values, threshold_greaterthan)
|
||
|
|
|
||
|
|
# run snp selector on hard-threshold vcf files
if snp_select_threshold_vcf:
    prior_step_extension = "_hard_threshold"
    this_step_extension = "_hard_snpsel"
    runSnpSelector(working_projects, prior_step_extension, this_step_extension, working_info_fields)
|
||
|
|
|
||
|
|
# definitions for next few steps

def hapmapInfoFile(proj, pool, info, qscore):
    """Annotation-directory path of the hapmap-info output for one
    (pool, info field, quality score) combination.

    Bug fix: the path was previously built from the module-level loop
    variable "project" instead of the proj parameter, so the first
    argument was silently ignored.
    """
    return annotationProjectFile(proj, pool, "_"+info+"_filter_at_q_"+qscore+"_hapmap_info.txt")
|
||
|
|
|
||
|
|
def generateHapmapInfo(extension, qscorelist):
    """Submit a GATK HapmapPoolAllelicInfo job for every
    (pool, info field, quality score) combination in the working projects.

    *extension* names the VCF step to read (e.g. "_snpsel"); *qscorelist*
    lists the quality-score cutoffs to evaluate.

    Bug fixes: the input VCF was built from the module-level
    prior_step_extension global instead of the extension parameter, and
    the pool-binding argument called the non-existent poolBindings (the
    helper defined above is named poolBinding), which raised NameError.
    """
    gatk_hapmap_info = gatk_cmd_base+" -T HapmapPoolAllelicInfo"
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            for infofield in working_info_fields:
                inputVCF = homeVCF(project, pool, extension+"_"+infofield)
                inputBam = pipelineBam(project, sample)
                ps = str(poolSize(project))
                for qscore in qscorelist:
                    outputFile = hapmapInfoFile(project, pool, infofield, qscore)
                    gatk_args = " -ps "+ps+" -I "+inputBam+" -of "+outputFile+" -B "+pool+",VCF,"+inputVCF+" -B "+poolBinding(project,sample)+" -samples "+poolSampleNames(project,sample)+" -q "+qscore
                    subSystem("bsub -q gsa "+gatk_hapmap_info+gatk_args)
|
||
|
|
|
||
|
|
# generate hapmap (false positive/false negative) info files on filtered vcfs
if generate_hapmap_info:
    prior_step_extension = "_snpsel"
    generateHapmapInfo(prior_step_extension, working_quality_scores)

# same, for the hard-thresholded then re-selected vcfs
if generate_threshold_hapmap_info:
    prior_step_extension = "_hard_snpsel"
    generateHapmapInfo(prior_step_extension, working_quality_scores)
|
||
|
|
|
||
|
|
# definitions for next section

def falsenegSnpSelect(invcf, false_neg_vcf, qscore):
    """Extract snpSelector false negatives from *invcf* at quality *qscore*.

    The input is first reduced (via awk) to header lines plus records whose
    5th column exceeds *qscore*, then snpSelector.py is run over the reduced
    file with --FNOutputVCF so that only the false-negative VCF survives;
    all intermediate files are removed afterwards.

    NOTE(review): the filter reads column $5 (ALT in a standard VCF) —
    QUAL would be $6; confirm against the files this actually consumes.

    Bug fixes versus the original: the redirection target referenced the
    undefined name tmp.vcf (NameError); '#' was single-quoted inside an
    already single-quoted awk program, yielding an awk syntax error; and
    substr($1,0,1) used 0-based indexing, which in awk returns "" and so
    never matched header lines.
    """
    tmp_vcf = "tmp.vcf"
    tmp_recal_vcf = "tmp2.vcf"
    tmp_log = "tmp.log"
    snpsel_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 30 --plottable -l "+tmp_log+" -o "+tmp_recal_vcf+" --FNOutputVCF="+false_neg_vcf+" "+tmp_vcf
    cmd = "cat "+invcf+" | awk '{if ( substr($1,1,1) == \"#\" || $5 > "+qscore+" ) print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' | sed 's/ /\t/g' > "+tmp_vcf
    subSystem(cmd)
    subSystem(snpsel_base)
    for scratch in (tmp_vcf, tmp_recal_vcf, tmp_log):
        subSystem("rm "+scratch)
|
||
|
|
|
||
|
|
def falseNegsBySnpSelector(extension):
    """Run falsenegSnpSelect for every (pool, info field, quality score)
    combination in the working projects, reading the "<extension>_<field>"
    VCFs and writing per-qscore false-negative VCFs into the annotation
    directory."""
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            for infofield in working_info_fields:
                input_vcf = homeVCF(project, pool, extension+"_"+infofield)
                for qscore in working_quality_scores:
                    false_neg_vcf = annotationProjectFile(project, pool, "_"+infofield+"_q_"+qscore+"_false_negs.vcf")
                    falsenegSnpSelect(input_vcf, false_neg_vcf, qscore)
|
||
|
|
|
||
|
|
# generate false negative sites in a vcf file via snp selector
if false_negatives_by_snp_selector:
    prior_step_extension = "_snpsel"
    falseNegsBySnpSelector(prior_step_extension)

# generate false negative sites in a vcf file on the thresholded vcfs
if false_negatives_by_snp_selector_on_threshold:
    prior_step_extension = "_hard_snpsel"
    falseNegsBySnpSelector(prior_step_extension)
|
||
|
|
|
||
|
|
# definition for matlab section

def makeMatlabMetricPlot(extension):
    """For every (pool, info field), plot the snpSelector log written under
    *extension* by running snpSelectorPlots in a batch-mode matlab session."""
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            for infofield in working_info_fields:
                log_output = annotationProjectFile(project, pool, extension+"_"+infofield+".log")
                figureName = figurePath(project, pool)+pool+extension
                cmd = "matlab -r \"cd "+matlab_dir+" , snpSelectorPlots('"+log_output+"','"+figureName+"') , exit\""
                subSystem(cmd)
|
||
|
|
|
||
|
|
# make matlab plots of the metrics (generated from the log files)
if plot_info_field_metrics:
    prior_step_extension = '_snpsel'
    makeMatlabMetricPlot(prior_step_extension)

# make matlab plots of the metrics from the hard-thresholded log files
if plot_info_field_metrics_threshold:
    prior_step_extension = "_hard_snpsel"
    makeMatlabMetricPlot(prior_step_extension)
|
||
|
|
|
||
|
|
# definitions for best effort section

def bestEffortVCF(proj):
    """Path of the project-wide "best effort" combined VCF.

    Header writing was already commented out in the original (dictToFile
    writes the header instead), so this simply builds the path.
    """
    return homeVCF(proj, proj, "_best_effort")
|
||
|
|
|
||
|
|
def keyFromFields(fields):
    """Dictionary key for a split VCF record: "CHROM:POS"."""
    return fields[0] + ":" + fields[1]


def inVCFDict(vdict, fields):
    """True when a record at this CHROM:POS is already stored in *vdict*."""
    return keyFromFields(fields) in vdict


def addToVCFDict(vdict, fields):
    """Store the split record under its CHROM:POS key; returns the dict."""
    vdict[keyFromFields(fields)] = fields
    return vdict
|
||
|
|
|
||
|
|
def updateInfoField(key, info1, info2):
    """Merge two values of one INFO key.

    HRun is carried over unchanged; every other key becomes the mean of
    the two values.  Returns [key, stringified_value].
    """
    if key == "HRun":  # this one doesn't change
        merged = str(info1)
    else:
        merged = str((float(info1) + float(info2)) / 2)
    return [key, merged]
|
||
|
|
|
||
|
|
def incorporateNewInfo(vdict, infodict, fields):
    """Merge the INFO values of a newly seen record into the stored record
    at the same CHROM:POS; returns the updated dict.

    Keys present in both records are combined with updateInfoField; keys
    only in the new record are copied over; a stored nPools counter is
    incremented (or initialised to 1 when absent).  Keys present only in
    the stored record — other than nPools — are dropped, as in the
    original implementation.
    """
    poskey = keyFromFields(fields)
    stored = vdict[poskey]
    stored_info = parseInfoField(stored[vcf_header.index("INFO")])
    merged = []
    needs_npools = True
    for key in set(stored_info.keys()).union(set(infodict.keys())):
        if key in stored_info and key in infodict:
            merged.append("=".join(updateInfoField(key, stored_info[key], infodict[key])))
        elif key in infodict:
            merged.append("=".join([key, str(infodict[key])]))
        elif key == "nPools":
            merged.append("=".join([key, str(int(stored_info[key]) + 1)]))
            needs_npools = False
        # stored-only keys other than nPools fall through and are dropped
    if needs_npools:
        merged.append("nPools=1")
    stored[vcf_header.index("INFO")] = ";".join(merged)
    vdict[poskey] = stored
    return vdict
|
||
|
|
|
||
|
|
def incorporateQuality(vdict, fields):
    """Add the new record's QUAL onto the stored record's QUAL (stringified
    float sum) at the CHROM:POS of *fields*; returns the updated dict."""
    poskey = keyFromFields(fields)
    stored = vdict[poskey]
    qi = vcf_header.index("QUAL")
    stored[qi] = str(float(stored[qi]) + float(fields[qi]))
    vdict[poskey] = stored
    return vdict
|
||
|
|
|
||
|
|
def bestEffortMerge(call_dict, vcf_to_merge_path):
    """Fold one pool's VCF into the accumulated best-effort call dict.

    Unseen CHROM:POS records are inserted as-is; records already present
    get their INFO fields merged and their QUAL values summed.  Header
    lines are skipped.  Returns the updated dict.
    """
    for line in open(vcf_to_merge_path).readlines():
        if line.startswith("#"):
            continue
        vcf_fields = line.strip().split()
        if not inVCFDict(call_dict, vcf_fields):
            call_dict = addToVCFDict(call_dict, vcf_fields)
        else:
            vcf_info_dict = parseInfoField(vcf_fields[vcf_header.index("INFO")])
            call_dict = incorporateNewInfo(call_dict, vcf_info_dict, vcf_fields)
            call_dict = incorporateQuality(call_dict, vcf_fields)
    return call_dict
|
||
|
|
|
||
|
|
def dictToFile(vdict, filepath, source, samplename):
    """Write the merged call dict as a VCF-style file at *filepath*.

    Emits ##source/##version meta lines and the #-prefixed column header
    (vcf_header plus *samplename*), then one tab-joined line per stored
    record.  An empty dict is reported but still produces the header.

    Bug fixes: the three header writes lacked trailing newlines, so the
    headers and the first record all landed on one line; the file was
    never closed when the dict was empty; and the handle no longer
    shadows the builtin name "file".
    """
    out = open(filepath, 'w')
    out.write("##source="+source+"\n")
    out.write("##version=VCRv3.2\n")
    out.write("#"+"\t".join(vcf_header)+"\t"+samplename+"\n")
    if vdict:
        for key in vdict.keys():
            out.write("\t".join(vdict[key])+"\n")
    else:
        print("dict had no keys for "+filepath)
    out.close()
|
||
|
|
|
||
|
|
# make the best effort VCF: QUAL > 7 pre-filter each pool's annotated calls
# into a temporary vcf, merge it into one project-wide dict, then dump the
# dict as a single combined vcf
if make_best_effort_vcf:
    best_thresh_fields = ["QUAL"]
    best_thresh_val = [7]
    best_thresh_gt = [True]
    prev_extension = "_annotated"
    for project in working_projects:
        best_effort_vcf = bestEffortVCF(project)
        best_effort_dict = dict()
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            in_vcf = homeVCF(project, pool, prev_extension)
            outvcf = homeVCF(project, pool, "_temporary")
            makeThresholdVCF(in_vcf, outvcf, best_thresh_fields, best_thresh_val, best_thresh_gt)
            print("dict about to update, size is currently: "+str(len(best_effort_dict)))
            best_effort_dict = bestEffortMerge(best_effort_dict, outvcf)
            print("updated size: "+str(len(best_effort_dict)))
            subSystem("rm "+outvcf)
        dictToFile(best_effort_dict, best_effort_vcf, "make_best_effort_vcf", project+"_best_effort")
|
||
|
|
|
||
|
|
if ( snp_select_best_effort_vcf ):
|
||
|
|
snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 20 --plottable -f "+",".join(working_info_fields)
|
||
|
|
for project in working_project:
|
||
|
|
best_effort_vcf = bestEffortVCF(project)
|
||
|
|
snp_recal_vcf = homeVCF(project,project,"_best_effort_selected")
|
||
|
|
log = annotationProjectFile(project,project,"_best_effort_selected.log")
|