gatk-3.8/python/expandedSummaryToVCF.py

#!/usr/bin/env python
#import farm_commands
import os.path
import sys
import re
import time
import math
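
# Usage: expandedSummaryToVCF.py <expanded summary csv> <pipeline samples file>
# Converts an expanded-summary call file from the Syzygy pooled pipeline into
# one VCF per pool plus a combined, multi-sample VCF for the whole project.
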
def grepFile(string, file):
    return grep(string, file.readlines())

def grep(string, list):
    expr = re.compile(string)
    return [elem for elem in list if expr.match(elem)]
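
# For example (illustrative values), grep("NA128", ["NA12878", "HG00096"])
# returns ["NA12878"]; re.match anchors the pattern at the start of each element.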

def dictFromCombinedErrorCoverageFile(f):
    dictList = []
    for line in f:
        ln = line.split()
        key = ln[0]
        item = ln
        dictList.append([key, item])
    return dict(dictList)
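
# The resulting dict maps the first whitespace-delimited token of each line
# (a chr:position key such as "chr1:12345", illustrative) to the full token
# list for that line, so sites can be looked up in O(1) later.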

def qualFromLod(L):
    # convert a LOD score into a phred-scaled quality via the error
    # probability X/(1+X), where X = exp(-L)
    try:
        X = math.exp(-L)
    except OverflowError:
        return 0
    try:
        return math.floor(-10 * math.log10(X / (1 + X)))
    except (OverflowError, ValueError):
        # X underflows to 0 for very large LODs; log10(0) raises ValueError
        return 1000
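
# Worked example (illustrative): for L = 3.0, X = exp(-3) ~ 0.0498,
# X/(1+X) ~ 0.0474, and floor(-10*log10(0.0474)) = 13, so an error
# probability of about 5% maps to a phred-scaled quality of 13.
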
callFileStr = sys.argv[1]
callFile = open(callFileStr)
pipelineSamplesFile = open(sys.argv[2])
directory = os.getcwd()
syzygyPathSamples = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/"
sortByRefPath = "/humgen/gsa-hphome1/chartl/sting/perl/sortByRef.pl"
poolNames = []
poolInternalIDs = []
poolCombinedErrorCoverageCalls = []
proj = "" # required for an output file
for line in pipelineSamplesFile:
    ln = line.strip('\n')
    ln = ln.split(";")
    #ln = line.split("\t") # -depends on format
    poolNames.append(ln.pop(2))
    piid = ln.pop(0)
    poolInternalIDs.append(piid)
    proj = ln.pop(0)
    ceccPath = syzygyPathSamples + proj + "/" + piid + "/" + proj + "." + piid + ".bam.combined.error.coverage.calls"
    print("reading: " + ceccPath)
    poolCombinedErrorCoverageCalls.append(dictFromCombinedErrorCoverageFile(open(ceccPath)))

pooledOutputFiles = []
for pool in poolNames:
    pooledOutputFiles.append(open(directory + "/" + pool + "_calls.vcf", 'w'))
pooledCallsFile = open(directory + "/" + proj + "_combined_calls.vcf", 'w')

header1 = "##format=PCFv1\n"
header2 = "##filedate="+time.strftime("%Y%m%d")+"\n"
header3 = "##source=expandedSummaryToVCF:"+callFileStr+"\n"
header4 = "##reference=Homo_sapiens_assembly18\n"
header5 = "##phasing=pooled\n"
header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"+"\t"+"\t".join(poolNames)+"\n" #note: really cool method
# print header
pooledCallsFile.write(header1)
pooledCallsFile.write(header2)
pooledCallsFile.write(header3)
pooledCallsFile.write(header4)
pooledCallsFile.write(header5)
pooledCallsFile.write(header6)
# write the per-pool headers (these files have no FORMAT or sample columns)
header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
for i in range(len(poolNames)):
    pooledOutputFiles[i].write(header1)
    pooledOutputFiles[i].write(header2)
    pooledOutputFiles[i].write(header3)
    pooledOutputFiles[i].write(header4)
    pooledOutputFiles[i].write(header5)
    pooledOutputFiles[i].write(header6)

for line in callFile:
    # note: file is a .csv; so comma delimited with headers
    # chr:position,alleles,gene,type,rs#,base_chg,annot,dist,pop_nr_freq,f_nr_case,f_nr_con,chisqstat_assoc,lrt_stat,f+,f-,SLOD,p_val,min_snp_dist,min_amp_dist,num_sig_pools,mean_worst_dir_nonref, ...
    # ..., max_worst_dir_nonref,mean_diff_dir_nonref,max_diff_dir_nonref,mean_other/minor,max_other/minor,mean_frac_non_concord,max_frac_non_concord,mean_combined_lod,max_combined_lod,mean_cov_tot, ...
    # ..., max_cov_tot,mean_cov_diff,max_cov_diff,mean_lod,max_lod,mean_lod_diff,max_lod_diff,mean_lod_diff_norm,max_lod_diff_norm,target_name,sig_pools
    # call file assumed already to have been split by pool
    # pileupFile is a .bam.coverage file from the pooled pipeline
    # call file isn't sorted by chr:position
    # skip the header row and any non-call lines
    if line.startswith("chr:pos"):
        continue
    elif not line.startswith("chr"):
        continue
    else:
        #print(line)
        entries = line.split(",")
        chrompos = entries[0]
        alleles = entries[1]
        variant = alleles[1]  # alleles is assumed to be a ref+alt string, e.g. "AG"
        ref = alleles[0]
        dbSNP = entries[4]
        if not dbSNP:
            dbSNP = "."
        supportingPools = entries.pop(62).rstrip(']').lstrip('[')
        supportingPools = supportingPools.split(";")
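        # supportingPools is now a list of pool names; the sig_pools column is
        # assumed to look like "[pool1;pool2]" (illustrative), hence the
        # bracket stripping above.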
        total_quality = 0
        total_slod = 0
        total_depth = 0
        quality_by_pool = []
        slod_by_pool = []
        depth_by_pool = []
        #sys.exit()
        for i in range(len(poolNames)):
            # grab this site's line from the correct pool's dict; columns 18,
            # 21, and 22/23 are assumed to hold depth, combined LOD, and the
            # strand-bias ratio in the .combined.error.coverage.calls layout
            depth = 0
            slod = 0
            qual = 0
            try:
                ceccLine = poolCombinedErrorCoverageCalls[i][chrompos]
                depth = ceccLine[18]
                qual = qualFromLod(float(ceccLine[21]))
                if ceccLine[22] == "NA":
                    slod = 0
                else:
                    try:
                        slod = math.log10(float(ceccLine[23]))
                    except (OverflowError, ValueError):
                        # math.log10 raises ValueError for nonpositive input
                        slod = -1000
            except KeyError:
                # site absent from this pool's calls; keep the zero defaults
                pass
            # write this pool's record to its per-pool VCF
            chromsplit = chrompos.split(":")
            outstr = chromsplit[0] + "\t" + chromsplit[1] + "\t" + dbSNP + "\t" + ref + "\t" + variant + "\t" + str(qual) + "\t0\t" + "DP=" + str(depth) + ";SB=" + str(slod) + "\n"
            pooledOutputFiles[i].write(outstr)
            # now update the running totals
            total_slod = total_slod + float(slod)
            total_depth = total_depth + int(depth)
            total_quality = total_quality + qual
            depth_by_pool.append(depth)
            slod_by_pool.append(slod)
            quality_by_pool.append(qual)
        # now for the pooled file
        chromsplit = chrompos.split(":")
        outstr = chromsplit[0] + "\t" + chromsplit[1] + "\t" + dbSNP + "\t" + ref + "\t" + variant + "\t" + str(total_quality) + "\t0\t" + "DP=" + str(total_depth) + ";SB=" + str(total_slod) + ";NP=" + str(len(supportingPools)) + "\tGT:GQ:DP:SB"
        pooledCallsFile.write(outstr)
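        # NP in the INFO field is the number of pools supporting the call;
        # per-pool GT:GQ:DP:SB sample columns are appended next.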
        # propagate individual pool information as sample columns
        for i in range(len(poolNames)):
            if grep(poolNames[i], supportingPools):
                phase = "0/1"
            else:
                phase = "0/0"
            pooledOut = "\t" + phase + ":" + str(quality_by_pool[i]) + ":" + str(depth_by_pool[i]) + ":" + str(slod_by_pool[i])
            pooledCallsFile.write(pooledOut)
        pooledCallsFile.write("\n")

## close all files ##
pooledCallsFile.close()
for i in range(len(poolNames)):
    pooledOutputFiles[i].close()