534 lines
21 KiB
Python
534 lines
21 KiB
Python
|
|
#!/usr/bin/env python
|
||
|
|
# imports
|
||
|
|
import subprocess
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import re
|
||
|
|
import time
|
||
|
|
import math
|
||
|
|
import codecs
|
||
|
|
## script keys and triggers
#
# Each flag enables one stage of the pipeline below; stages appear in the
# order they would normally run.

# Danger: wipes previously generated data directories.
remove_all_files = False  # be very careful with this one

# Setup / input staging.
make_directories = True
copy_vcf_files = True
annotate_vcf_files = True

# snpSelector analysis of the annotated calls.
run_snp_selector = False

# Hard-threshold filtering and re-analysis.
make_hard_threshold_vcf = False
snp_select_threshold_vcf = False

# Hapmap (false positive / false negative) info generation.
generate_hapmap_info = False
generate_threshold_hapmap_info = False

# False-negative extraction via snpSelector.
false_negatives_by_snp_selector = False
false_negatives_by_snp_selector_on_threshold = False

# Matlab metric plotting.
plot_info_field_metrics = False
plot_info_field_metrics_threshold = False

# "Best effort" combined VCF generation and analysis.
make_best_effort_vcf = False
snp_select_best_effort_vcf = False
plot_best_effort_metrics = False
|
||
|
|
|
||
|
|
|
||
|
|
## global stuff

# When True, commands are printed instead of executed (see subSystem).
DEBUG = True

# Use Picard-produced bam paths instead of the pipeline's own bams.
override_pilot_bamfile = True
override_production_bamfile = True

# Tool / metadata locations.
matlab_dir = "/humgen/gsa-scr1/projects/FHS/scripts"
pipeline_samples_info_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/samples.tsv"
samples_pipeline_path = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/"

# Project directory layout.
home_dir = "/humgen/gsa-scr1/projects/FHS/"
annotations_data_base = home_dir + "oneOffAnalyses/annotationEffectiveness/data/"
figures_base = home_dir + "oneOffAnalyses/annotationEffectiveness/figures/"
production_calls_dir = home_dir + "production/raw_production_calls_12_07/"
pilot_calls_dir = home_dir + "pilot/calls/raw_pilot_calls_12_07/"
hapmap_pool_info_dir = home_dir + "pilot/project_info/pilot_pool_information/"

# Project names.
pilot_standard_project_name = "framinghampilot"
pilot_clipped_project_name = "pilot_clipped"
pilot_no_gatk_project_name = "pilot_no_gatk"
FHS_project_name = "FHS"
FHS_round_2_project_name = "FHS_round_2"

pilot_projects = [pilot_standard_project_name, pilot_clipped_project_name, pilot_no_gatk_project_name]
production_projects = [FHS_project_name, FHS_round_2_project_name]
# Same membership and order as the original hand-written list.
all_projects = pilot_projects + production_projects

# Base GATK invocation (reference + exon interval list) shared by every GATK step.
gatk_cmd_base = "java -jar /humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -L /humgen/gsa-scr1/projects/FHS/interval_lists/FHS_exons_only.interval_list"
dbsnp_129_path = "/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod"

# Column names of the fixed VCF fields, in file order.
vcf_header = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
|
||
|
|
|
||
|
|
def subSystem(cmd):
    """Run *cmd* through the shell, or just echo it when DEBUG is set."""
    if DEBUG:
        print(cmd)
    else:
        os.system(cmd)
|
||
|
|
|
||
|
|
def safeRemove(cmd):
    """Execute an "rm -rf" only when it targets one of this script's own
    output trees; refuse (and report) anything else."""
    allowed_prefixes = (
        "rm -rf " + annotations_data_base,
        "rm -rf " + production_calls_dir,
        "rm -rf " + pilot_calls_dir,
        "rm -rf " + figures_base,
    )
    if cmd.startswith(allowed_prefixes):
        subSystem(cmd)
    else:
        print("Unsafe removal attempt: "+cmd)
|
||
|
|
|
||
|
|
def productionBamfilePicard(proj, samp):
    """Look up the Picard-aligned bam path for (proj, samp) in the
    flowcell-lane manifest; prints a warning and returns None when no row
    matches."""
    manifest = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/meta/framingham.flowcell_lanes.tsv"
    for row in open(manifest).readlines():
        cols = row.strip().split()
        if cols[1] == proj and cols[2] == samp:
            # cols[5]+cols[3]+"."+cols[4] — presumably dir + flowcell + lane;
            # verify against the manifest columns.
            return cols[5] + cols[3] + "." + cols[4] + ".aligned.bam"
    print("no return for: "+proj+"_"+samp)
|
||
|
|
|
||
|
|
def annotationProjectPath(proj):
    """Annotation-data directory for *proj*."""
    return annotations_data_base + proj + "/"


def annotationPath(proj, pool):
    """Annotation-data directory for one pool of *proj*."""
    return annotationProjectPath(proj) + pool + "/"


def annotationProjectFile(proj, pool, appender):
    """Annotation-data file "<pool><appender>" inside the pool directory."""
    return annotationPath(proj, pool) + pool + appender


def figureProjectPath(proj):
    """Figure directory for *proj*."""
    return figures_base + proj + "/"


def figurePath(proj, samp):
    """Figure directory for one sample/pool of *proj*."""
    return figureProjectPath(proj) + samp + "/"


def pipelineProjectPath(proj):
    """Pipeline output directory for *proj*."""
    return samples_pipeline_path + proj + "/"


def pipelinePath(proj, samp):
    """Pipeline output directory for one sample of *proj*."""
    return pipelineProjectPath(proj) + samp + "/"
|
||
|
|
|
||
|
|
def pipelineBam(proj, samp):
    """Return the bam path (or a pre-joined "-I"-separated list of bam paths)
    to use for (proj, samp), honouring the override flags."""
    if proj == "framinghampilot" and override_pilot_bamfile:
        # Hand-picked Picard lane bams for the three CEPH pilot pools; the
        # embedded " -I " segments splice extra inputs into the GATK command.
        ceph_lanes = {
            "CEPH1": "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/1/Solexa-10930/302JDAAXX.1.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/5/Solexa-10933/302JDAAXX.5.aligned.bam",
            "CEPH2": "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/2/Solexa-10931/302JDAAXX.2.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/8/Solexa-10934/302JDAAXX.8.aligned.bam",
            "CEPH3": "/seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/3/Solexa-10932/302JDAAXX.3.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/6/Solexa-10936/302JDAAXX.6.aligned.bam -I /seq/picard/302JDAAXX/C1-252_2009-04-30_2009-11-13/7/Solexa-10935/302JDAAXX.7.aligned.bam",
        }
        if samp in ceph_lanes:
            return ceph_lanes[samp]
    if proj in production_projects and override_production_bamfile:
        return productionBamfilePicard(proj, samp)
    return pipelinePath(proj, samp) + proj + "." + samp + ".bam"
|
||
|
|
|
||
|
|
def pipelineVCF(proj, samp):
    """Raw VCF produced by the pipeline for (proj, samp)."""
    return pipelinePath(proj, samp) + samp + ".vcf"
|
||
|
|
|
||
|
|
def homePath(proj, samp):
    """Local calls directory for *proj* (samp is unused, kept for the
    callers that pass it)."""
    base = pilot_calls_dir if proj in pilot_projects else production_calls_dir
    return base + proj + "/"


def homeProjectFile(proj, samp, appender):
    """File "<samp><appender>" inside the project's local calls directory."""
    return homePath(proj, samp) + samp + appender


def homeVCF(proj, samp, extension):
    """VCF "<samp><extension>.vcf" inside the project's local calls directory."""
    return homeProjectFile(proj, samp, extension) + ".vcf"
|
||
|
|
|
||
|
|
def hapmapVCF(proj, samp):
    """Hapmap truth VCF for a pilot sample (proj is unused; the path is
    keyed on the lower-cased sample name only)."""
    return "/humgen/gsa-scr1/projects/FHS/pilot/analysis/%s_hapmap_snp_only.vcf" % samp.lower()
|
||
|
|
|
||
|
|
def poolSize(proj):
    """Individuals per pool: 40 for pilot projects, 28 for production."""
    return 40 if proj in pilot_projects else 28
|
||
|
|
|
||
|
|
def poolBinding(proj, samp):
    """Path of the hapmap pool-binding file for a pilot sample.

    Raises Exception for production projects, which have no hapmap pools.
    Fix: was the Python-2-only "raise Exception, msg" statement form; the
    call form is equivalent and valid on both Python 2 and 3.
    """
    if proj in production_projects:
        raise Exception("Production projects do not have hapmap pool bindings")
    return hapmap_pool_info_dir + samp + ".pool.path"


def poolSampleNames(proj, samp):
    """Path of the hapmap pool sample-name list for a pilot sample.

    Raises Exception for production projects, which have no hapmap pools.
    """
    if proj in production_projects:
        raise Exception("Production projects do not have hapmap pool bindings")
    return hapmap_pool_info_dir + samp + ".pool"
|
||
|
|
|
||
|
|
|
||
|
|
## working stuff

#working_projects = [pilot_standard_project_name]
working_projects = [FHS_round_2_project_name]
working_info_fields = ["syzy_SB","syzy_DP","syzy_NMMR","HRun"]
working_quality_scores = ["0","1","2","3","4","5"]

# project name -> list of sample names found in the pipeline output tree
projects_to_samples = {}
# every sample name seen across all working projects
working_samples = set()

for project in working_projects:
    samples_here = os.listdir(pipelineProjectPath(project))
    projects_to_samples[project] = samples_here
    working_samples.update(samples_here)

# sample name -> pool name, read from the pipeline's samples.tsv
# (column 0 is the sample; column 1 is used as its pool name downstream)
samples_to_pool_name = {}
for line in open(pipeline_samples_info_path).readlines():
    cols = line.strip().split()
    if cols[0] in working_samples:
        samples_to_pool_name[cols[0]] = cols[1]
|
||
|
|
|
||
|
|
# remove everything (guarded by safeRemove so only our own trees can go)
if remove_all_files:
    for project in working_projects:
        safeRemove("rm -rf "+homePath(project, ""))
        safeRemove("rm -rf "+annotationProjectPath(project))
        safeRemove("rm -rf "+figureProjectPath(project))

# make directories if necessary
if make_directories:
    for project in working_projects:
        subSystem("mkdir "+homePath(project, ""))
        subSystem("mkdir "+annotationProjectPath(project))
        subSystem("mkdir "+figureProjectPath(project))
        # one annotation and one figure subdirectory per pool
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            subSystem("mkdir "+annotationPath(project, pool))
            subSystem("mkdir "+figurePath(project, pool))
|
||
|
|
|
||
|
|
# copy the vcf files from the pipeline into the /FHS/ directory, rewriting
# every occurrence of the sample name to its pool name on the way through
if copy_vcf_files:
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            sample_home_vcf = homeVCF(project, pool, "_raw")
            pipeline_vcf = pipelineVCF(project, sample)
            subSystem("cat "+pipeline_vcf+" | sed 's/"+sample+"/"+pool+"/g' > "+sample_home_vcf)
|
||
|
|
|
||
|
|
# annotate the vcf files using VariantAnnotator (dbSNP + expression annotations),
# one LSF job per pool
if annotate_vcf_files:
    prior_step_extension = "_raw"
    this_step_extension = "_annotated"
    gatk_annotate_base = gatk_cmd_base + " -T VariantAnnotator -exp -D "+dbsnp_129_path
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            inputVCF = homeVCF(project, pool, prior_step_extension)
            outputVCF = homeVCF(project, pool, this_step_extension)
            bamfile = pipelineBam(project, sample)
            gatk_args = " -I "+bamfile+" -B variant,VCF,"+inputVCF+" -vcf "+outputVCF
            subSystem("bsub -q gsa "+gatk_annotate_base + gatk_args)
|
||
|
|
|
||
|
|
# definition for next section (and another one further down)

def runSnpSelector(project_list, prior_step_annotation, this_step_annotation, info_field_list):
    """Run snpSelector.py once per (pool, info field) over *project_list*.

    Reads  <pool><prior_step_annotation>.vcf
    Writes <pool><this_step_annotation>_<field>.vcf plus a matching .log.
    Pilot projects are scored against their hapmap truth VCF; production
    projects fall back to an expected Ti/Tv ratio of 3.6.

    Bug fix: the input VCF was previously built from the module-level
    prior_step_extension global instead of the prior_step_annotation
    parameter (the two only happened to coincide at the existing call
    sites, so the parameter was silently ignored).
    """
    snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 10 --plottable"
    for project in project_list:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            if project in pilot_projects:
                truth_arg = " -t "+hapmapVCF(project, sample)
            else:
                truth_arg = " -titv=3.6"
            for info_field in info_field_list:
                input_vcf = homeVCF(project, pool, prior_step_annotation)
                output_vcf = homeVCF(project, pool, this_step_annotation+"_"+info_field)
                log_output = annotationProjectFile(project, pool, this_step_annotation+"_"+info_field+".log")
                snp_selector_args = truth_arg+" -o "+output_vcf + " -l "+log_output + " -f "+info_field+" "+input_vcf
                subSystem(snp_selector_base+snp_selector_args)
|
||
|
|
|
||
|
|
# run snp selector to analyze info fields
if run_snp_selector:
    prior_step_extension = "_annotated"
    this_step_extension = "_snpsel"
    runSnpSelector(working_projects, prior_step_extension, this_step_extension, working_info_fields)
|
||
|
|
|
||
|
|
# definition for the next stage

def parseInfoField(info):
    """Parse a VCF INFO string ("k1=v1;k2=v2;...") into {key: float(value)}.

    Every entry must be key=value with a numeric value; anything else
    raises, exactly as the original loop-based version did.
    """
    parsed = {}
    for pair in info.split(";"):
        kv = pair.split("=")
        parsed[kv[0]] = float(kv[1])
    return parsed
|
||
|
|
|
||
|
|
|
||
|
|
def vcfLinePassesThreshold(line, fields, vals, greaterthan):
    """Check one VCF line against a set of per-field thresholds.

    For each fields[j], the value is taken from the fixed VCF columns when
    the name is one of vcf_header, otherwise from the parsed INFO dict.
    The line passes field j when (value > vals[j]) matches greaterthan[j];
    unknown field names are reported and skipped.  Header lines ('#...')
    always pass.
    """
    if line.startswith('#'):
        return True
    columns = line.split()
    info_dict = parseInfoField(columns[vcf_header.index("INFO")])
    fixed_columns = set(vcf_header)
    for j in range(len(fields)):
        if fields[j] in fixed_columns:
            value = float(columns[vcf_header.index(fields[j])])
        elif fields[j] in info_dict:
            value = float(info_dict[fields[j]])
        else:
            print("Field not found "+fields[j])
            continue
        # fail as soon as the value is on the wrong side of the threshold
        if (value > vals[j]) != greaterthan[j]:
            return False
    return True
|
||
|
|
|
||
|
|
def makeThresholdVCF(prev_vcf, this_vcf, fields, vals, greater):
    """Copy the lines of prev_vcf that pass vcfLinePassesThreshold into
    this_vcf.

    In DEBUG mode nothing is written; the number of passing lines is
    printed instead (note: the message says "filtered", but the count is
    of lines that PASS the thresholds — kept as in the original).
    """
    if DEBUG:
        kept = 0
    else:
        out_vcf = open(this_vcf, 'w')
        print("open for writing: "+this_vcf)
        written = 0
    for line in open(prev_vcf).readlines():
        if vcfLinePassesThreshold(line, fields, vals, greater):
            if DEBUG:
                kept = kept + 1
            else:
                out_vcf.write(line)
                written = written + 1
    if DEBUG:
        print(this_vcf+" would have filtered "+str(kept)+" variants")
    else:
        out_vcf.close()
|
||
|
|
|
||
|
|
# make hard-threshold vcf files ( to see if a more strenuous set with less
# noise helps snp selector determine an ordering )
if make_hard_threshold_vcf:
    prior_step_extension = "_annotated"
    this_step_extension = "_hard_threshold"
    threshold_fields = ["QUAL", "syzy_NMMR", "syzy_DP"]
    threshold_values = [15, 0.4, 280]
    threshold_greaterthan = [True, False, True]
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            prev_vcf = homeVCF(project, pool, prior_step_extension)
            this_vcf = homeVCF(project, pool, this_step_extension)
            # Bug fix: previously passed the undefined name "greaterthan",
            # which raised NameError as soon as this stage was enabled.
            makeThresholdVCF(prev_vcf, this_vcf, threshold_fields,
                             threshold_values, threshold_greaterthan)
|
||
|
|
|
||
|
|
# run snp selector on hard-threshold vcf files
if snp_select_threshold_vcf:
    prior_step_extension = "_hard_threshold"
    this_step_extension = "_hard_snpsel"
    runSnpSelector(working_projects, prior_step_extension, this_step_extension, working_info_fields)
|
||
|
|
|
||
|
|
# definitions for next few steps

def hapmapInfoFile(proj, pool, info, qscore):
    """Annotation-directory path of the hapmap-info output for one
    (pool, info field, quality score) combination.

    Bug fix: the path was previously built from the module-level loop
    variable "project" instead of the proj parameter, so the first
    argument was silently ignored.
    """
    return annotationProjectFile(proj, pool, "_"+info+"_filter_at_q_"+qscore+"_hapmap_info.txt")
|
||
|
|
|
||
|
|
def generateHapmapInfo(extension, qscorelist):
    """Submit a GATK HapmapPoolAllelicInfo job for every
    (pool, info field, quality score) combination in the working projects.

    *extension* names the VCF step to read (e.g. "_snpsel"); *qscorelist*
    lists the quality-score cutoffs to evaluate.

    Bug fixes: the input VCF was built from the module-level
    prior_step_extension global instead of the extension parameter, and
    the pool-binding argument called the non-existent poolBindings (the
    helper defined above is named poolBinding), which raised NameError.
    """
    gatk_hapmap_info = gatk_cmd_base+" -T HapmapPoolAllelicInfo"
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            for infofield in working_info_fields:
                inputVCF = homeVCF(project, pool, extension+"_"+infofield)
                inputBam = pipelineBam(project, sample)
                ps = str(poolSize(project))
                for qscore in qscorelist:
                    outputFile = hapmapInfoFile(project, pool, infofield, qscore)
                    gatk_args = " -ps "+ps+" -I "+inputBam+" -of "+outputFile+" -B "+pool+",VCF,"+inputVCF+" -B "+poolBinding(project,sample)+" -samples "+poolSampleNames(project,sample)+" -q "+qscore
                    subSystem("bsub -q gsa "+gatk_hapmap_info+gatk_args)
|
||
|
|
|
||
|
|
# generate hapmap (false positive/false negative) info files on filtered vcfs
if generate_hapmap_info:
    prior_step_extension = "_snpsel"
    generateHapmapInfo(prior_step_extension, working_quality_scores)

# same, for the hard-thresholded then re-selected vcfs
if generate_threshold_hapmap_info:
    prior_step_extension = "_hard_snpsel"
    generateHapmapInfo(prior_step_extension, working_quality_scores)
|
||
|
|
|
||
|
|
# definitions for next section

def falsenegSnpSelect(invcf, false_neg_vcf, qscore):
    """Extract snpSelector false negatives from *invcf* at quality *qscore*.

    The input is first reduced (via awk) to header lines plus records whose
    5th column exceeds *qscore*, then snpSelector.py is run over the reduced
    file with --FNOutputVCF so that only the false-negative VCF survives;
    all intermediate files are removed afterwards.

    NOTE(review): the filter reads column $5 (ALT in a standard VCF) —
    QUAL would be $6; confirm against the files this actually consumes.

    Bug fixes versus the original: the redirection target referenced the
    undefined name tmp.vcf (NameError); '#' was single-quoted inside an
    already single-quoted awk program, yielding an awk syntax error; and
    substr($1,0,1) used 0-based indexing, which in awk returns "" and so
    never matched header lines.
    """
    tmp_vcf = "tmp.vcf"
    tmp_recal_vcf = "tmp2.vcf"
    tmp_log = "tmp.log"
    snpsel_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 30 --plottable -l "+tmp_log+" -o "+tmp_recal_vcf+" --FNOutputVCF="+false_neg_vcf+" "+tmp_vcf
    cmd = "cat "+invcf+" | awk '{if ( substr($1,1,1) == \"#\" || $5 > "+qscore+" ) print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' | sed 's/ /\t/g' > "+tmp_vcf
    subSystem(cmd)
    subSystem(snpsel_base)
    for scratch in (tmp_vcf, tmp_recal_vcf, tmp_log):
        subSystem("rm "+scratch)
|
||
|
|
|
||
|
|
def falseNegsBySnpSelector(extension):
    """Run falsenegSnpSelect for every (pool, info field, quality score)
    combination in the working projects, reading the "<extension>_<field>"
    VCFs and writing per-qscore false-negative VCFs into the annotation
    directory."""
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            for infofield in working_info_fields:
                input_vcf = homeVCF(project, pool, extension+"_"+infofield)
                for qscore in working_quality_scores:
                    false_neg_vcf = annotationProjectFile(project, pool, "_"+infofield+"_q_"+qscore+"_false_negs.vcf")
                    falsenegSnpSelect(input_vcf, false_neg_vcf, qscore)
|
||
|
|
|
||
|
|
# generate false negative sites in a vcf file via snp selector
if false_negatives_by_snp_selector:
    prior_step_extension = "_snpsel"
    falseNegsBySnpSelector(prior_step_extension)

# generate false negative sites in a vcf file on the thresholded vcfs
if false_negatives_by_snp_selector_on_threshold:
    prior_step_extension = "_hard_snpsel"
    falseNegsBySnpSelector(prior_step_extension)
|
||
|
|
|
||
|
|
# definition for matlab section

def makeMatlabMetricPlot(extension):
    """For every (pool, info field), plot the snpSelector log written under
    *extension* by running snpSelectorPlots in a batch-mode matlab session."""
    for project in working_projects:
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            for infofield in working_info_fields:
                log_output = annotationProjectFile(project, pool, extension+"_"+infofield+".log")
                figureName = figurePath(project, pool)+pool+extension
                cmd = "matlab -r \"cd "+matlab_dir+" , snpSelectorPlots('"+log_output+"','"+figureName+"') , exit\""
                subSystem(cmd)
|
||
|
|
|
||
|
|
# make matlab plots of the metrics (generated from the log files)
if plot_info_field_metrics:
    prior_step_extension = '_snpsel'
    makeMatlabMetricPlot(prior_step_extension)

# make matlab plots of the metrics from the hard-thresholded log files
if plot_info_field_metrics_threshold:
    prior_step_extension = "_hard_snpsel"
    makeMatlabMetricPlot(prior_step_extension)
|
||
|
|
|
||
|
|
# definitions for best effort section

def bestEffortVCF(proj):
    """Path of the project-wide "best effort" combined VCF.

    Header writing was already commented out in the original (dictToFile
    writes the header instead), so this simply builds the path.
    """
    return homeVCF(proj, proj, "_best_effort")
|
||
|
|
|
||
|
|
def keyFromFields(fields):
    """Dictionary key for a split VCF record: "CHROM:POS"."""
    return fields[0] + ":" + fields[1]


def inVCFDict(vdict, fields):
    """True when a record at this CHROM:POS is already stored in *vdict*."""
    return keyFromFields(fields) in vdict


def addToVCFDict(vdict, fields):
    """Store the split record under its CHROM:POS key; returns the dict."""
    vdict[keyFromFields(fields)] = fields
    return vdict
|
||
|
|
|
||
|
|
def updateInfoField(key, info1, info2):
    """Merge two values of one INFO key.

    HRun is carried over unchanged; every other key becomes the mean of
    the two values.  Returns [key, stringified_value].
    """
    if key == "HRun":  # this one doesn't change
        merged = str(info1)
    else:
        merged = str((float(info1) + float(info2)) / 2)
    return [key, merged]
|
||
|
|
|
||
|
|
def incorporateNewInfo(vdict, infodict, fields):
    """Merge the INFO values of a newly seen record into the stored record
    at the same CHROM:POS; returns the updated dict.

    Keys present in both records are combined with updateInfoField; keys
    only in the new record are copied over; a stored nPools counter is
    incremented (or initialised to 1 when absent).  Keys present only in
    the stored record — other than nPools — are dropped, as in the
    original implementation.
    """
    poskey = keyFromFields(fields)
    stored = vdict[poskey]
    stored_info = parseInfoField(stored[vcf_header.index("INFO")])
    merged = []
    needs_npools = True
    for key in set(stored_info.keys()).union(set(infodict.keys())):
        if key in stored_info and key in infodict:
            merged.append("=".join(updateInfoField(key, stored_info[key], infodict[key])))
        elif key in infodict:
            merged.append("=".join([key, str(infodict[key])]))
        elif key == "nPools":
            merged.append("=".join([key, str(int(stored_info[key]) + 1)]))
            needs_npools = False
        # stored-only keys other than nPools fall through and are dropped
    if needs_npools:
        merged.append("nPools=1")
    stored[vcf_header.index("INFO")] = ";".join(merged)
    vdict[poskey] = stored
    return vdict
|
||
|
|
|
||
|
|
def incorporateQuality(vdict, fields):
    """Add the new record's QUAL onto the stored record's QUAL (stringified
    float sum) at the CHROM:POS of *fields*; returns the updated dict."""
    poskey = keyFromFields(fields)
    stored = vdict[poskey]
    qi = vcf_header.index("QUAL")
    stored[qi] = str(float(stored[qi]) + float(fields[qi]))
    vdict[poskey] = stored
    return vdict
|
||
|
|
|
||
|
|
def bestEffortMerge(call_dict, vcf_to_merge_path):
    """Fold one pool's VCF into the accumulated best-effort call dict.

    Unseen CHROM:POS records are inserted as-is; records already present
    get their INFO fields merged and their QUAL values summed.  Header
    lines are skipped.  Returns the updated dict.
    """
    for line in open(vcf_to_merge_path).readlines():
        if line.startswith("#"):
            continue
        vcf_fields = line.strip().split()
        if not inVCFDict(call_dict, vcf_fields):
            call_dict = addToVCFDict(call_dict, vcf_fields)
        else:
            vcf_info_dict = parseInfoField(vcf_fields[vcf_header.index("INFO")])
            call_dict = incorporateNewInfo(call_dict, vcf_info_dict, vcf_fields)
            call_dict = incorporateQuality(call_dict, vcf_fields)
    return call_dict
|
||
|
|
|
||
|
|
def dictToFile(vdict, filepath, source, samplename):
    """Write the merged call dict as a VCF-style file at *filepath*.

    Emits ##source/##version meta lines and the #-prefixed column header
    (vcf_header plus *samplename*), then one tab-joined line per stored
    record.  An empty dict is reported but still produces the header.

    Bug fixes: the three header writes lacked trailing newlines, so the
    headers and the first record all landed on one line; the file was
    never closed when the dict was empty; and the handle no longer
    shadows the builtin name "file".
    """
    out = open(filepath, 'w')
    out.write("##source="+source+"\n")
    out.write("##version=VCRv3.2\n")
    out.write("#"+"\t".join(vcf_header)+"\t"+samplename+"\n")
    if vdict:
        for key in vdict.keys():
            out.write("\t".join(vdict[key])+"\n")
    else:
        print("dict had no keys for "+filepath)
    out.close()
|
||
|
|
|
||
|
|
# make the best effort VCF: QUAL > 7 pre-filter each pool's annotated calls
# into a temporary vcf, merge it into one project-wide dict, then dump the
# dict as a single combined vcf
if make_best_effort_vcf:
    best_thresh_fields = ["QUAL"]
    best_thresh_val = [7]
    best_thresh_gt = [True]
    prev_extension = "_annotated"
    for project in working_projects:
        best_effort_vcf = bestEffortVCF(project)
        best_effort_dict = dict()
        for sample in projects_to_samples[project]:
            pool = samples_to_pool_name[sample]
            in_vcf = homeVCF(project, pool, prev_extension)
            outvcf = homeVCF(project, pool, "_temporary")
            makeThresholdVCF(in_vcf, outvcf, best_thresh_fields, best_thresh_val, best_thresh_gt)
            print("dict about to update, size is currently: "+str(len(best_effort_dict)))
            best_effort_dict = bestEffortMerge(best_effort_dict, outvcf)
            print("updated size: "+str(len(best_effort_dict)))
            subSystem("rm "+outvcf)
        dictToFile(best_effort_dict, best_effort_vcf, "make_best_effort_vcf", project+"_best_effort")
|
||
|
|
|
||
|
|
if ( snp_select_best_effort_vcf ):
|
||
|
|
snp_selector_base = "python /humgen/gsa-scr1/chartl/sting/python/snpSelector.py -p 20 --plottable -f "+",".join(working_info_fields)
|
||
|
|
for project in working_project:
|
||
|
|
best_effort_vcf = bestEffortVCF(project)
|
||
|
|
snp_recal_vcf = homeVCF(project,project,"_best_effort_selected")
|
||
|
|
log = annotationProjectFile(project,project,"_best_effort_selected.log")
|