Added - converter from expanded summary to VCF (beautiful thing, really)
Removed - the ugly hackjob that was expanded summary to Geli git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1970 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a545859c62
commit
3d9195f8b6
|
|
@ -1,73 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
import farm_commands
|
|
||||||
import os.path
|
|
||||||
import sys
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
def grepFile(string, file):
    """Read *file* fully and return its lines whose start matches regex *string*."""
    contents = file.readlines()
    return grep(string, contents)
|
|
||||||
|
|
||||||
def grep(string, list):
    """Return the items of *list* whose beginning matches the regex *string*.

    Uses re.match semantics: the pattern is anchored at the start of each
    element but need not consume the whole element.
    """
    pattern = re.compile(string)
    hits = []
    for item in list:
        if pattern.match(item):
            hits.append(item)
    return hits
|
|
||||||
|
|
||||||
# Flat driver: convert an expanded-summary .csv of pooled calls into a
# Geli-style line per variant, joining against a pileup coverage file and a
# LOD-score file.  Usage: script.py <calls.csv> <pileup coverage> <lod file>
# NOTE(review): indentation reconstructed from a diff view; structure inferred
# from the continue/else flow.
callFile = open(sys.argv[1])               # pooled call file (.csv, has header)
pileupFileLines = open(sys.argv[2])        # per-base coverage file
callFileWithLodScores = open(sys.argv[3])  # same calls, with LOD score columns
pileupFileLines = pileupFileLines.readlines()   # rebind: file object -> list of lines
lodFileLines = callFileWithLodScores.readlines()

for line in callFile:
    # note: file is a .csv; so comma delimited with headers
    # chr:position,alleles,gene,type,rs#,base_chg,annot,dist,pop_nr_freq,f_nr_case,f_nr_con,chisqstat_assoc,lrt_stat,f+,f-,SLOD,p_val,min_snp_dist,min_amp_dist,num_sig_pools,mean_worst_dir_nonref, ...
    # ..., max_worst_dir_nonref,mean_diff_dir_nonref,max_diff_dir_nonref,mean_other/minor,max_other/minor,mean_frac_non_concord,max_frac_non_concord,mean_combined_lod,max_combined_lod,mean_cov_tot, ...
    # ..., max_cov_tot,mean_cov_diff,max_cov_diff,mean_lod,max_lod,mean_lod_diff,max_lod_diff,mean_lod_diff_norm,max_lod_diff_norm,target_name,sig_pools

    # call file assumed already to have been split by pool

    # pileupFile is a .bam.coverage file from the pooled pipeline

    # call file isn't sorted by chr:position
    if line.startswith("chr:pos"):
        # skip the header row
        continue
    elif not line.startswith("chr"):
        # skip anything that is not a chrN:pos data row
        continue
    else:
        #print(line)
        entries = line.split(",")
        chrompos = entries.pop(0) # (now 'alleles' is 0-element)
        variant = entries.pop(0) # (now 'gene' is 0-element)
        #print(chrompos)
        #print(variant)
        #change format of "chr:pos" to "chr pos" for lookup
        #somehow this next line don't work
        #chrompos.replace(":"," ")
        # (str.replace returns a new string; the result was never assigned,
        # hence the split/join workaround below)
        g = chrompos.split(":");
        chromposspc = g.pop(0)+" "+g.pop(0)
        #print(chrompos)
        # NOTE(review): grep uses regex-prefix matching, so "chr1 100" also
        # matches "chr1 1000" lines — only the first hit is used below.
        pileupLine = grep(chromposspc,pileupFileLines)
        lodLine = grep(chrompos,lodFileLines)
        #print(pileupLine)
        # line is
        # chr pos ref num_A num_C num_G num_T
        if not pileupLine:
            # no coverage record for this site: drop the call
            continue
        else:
            pileupLine = pileupLine.pop(0)   # first matching coverage line
            #print(pileupLine)
            pileupList = pileupLine.split(" ")
            ref = pileupList.pop(2) # now num_A is 2
            # successive pop(2) consumes the four base counts in order
            num_A = int(pileupList.pop(2))
            num_C = int(pileupList.pop(2))
            num_G = int(pileupList.pop(2))
            num_T = int(pileupList.pop(2))
            depth = num_A+num_C+num_G+num_T
            # NOTE(review): lodLine may be empty here (no LOD record) and
            # pop(0) would raise IndexError — unguarded, unlike pileupLine.
            lodLine = lodLine.pop(0)
            lodLine = lodLine.split(" ")
            # print(lodLine)
            # pop(21) first, then pop(19): index 19 is unaffected by the
            # removal at 21, so both original columns 21 and 19 are read.
            lod = max(float(lodLine.pop(21)),float(lodLine.pop(19)))
            # output is
            # chr pos ref depth mapping call btr btnb AA AC AG AT CC CG CT GG GT TT
            # ("5" and the trailing negative numbers are placeholder fields)
            outStr = chrompos+" "+ref+" "+str(depth)+" 5 "+variant+" "+str(lod)+" -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12"
            print(outStr)
|
|
||||||
|
|
@ -0,0 +1,191 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
#import farm_commands
|
||||||
|
import os.path
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
|
||||||
|
def grepFile(string, file):
    """Slurp *file* and hand its lines to grep(), returning the matches."""
    return grep(string, file.readlines())
|
||||||
|
|
||||||
|
def grep(string, list):
    """Filter *list* down to the elements whose start matches regex *string*."""
    compiled = re.compile(string)
    return [entry for entry in list if compiled.match(entry)]
|
||||||
|
|
||||||
|
def dictFromCombinedErrorCoverageFile(f):
    """Index a combined.error.coverage.calls file by its first column.

    Each line of *f* (a file object or any iterable of strings) is
    whitespace-split; the first token becomes the key and the full token
    list the value.  Later duplicate keys overwrite earlier ones, matching
    the original list-of-pairs -> dict() construction.

    Improvements over the original: builds the dict directly instead of a
    temporary list of [key, item] pairs, and skips blank/whitespace-only
    lines (the original raised IndexError on them).
    """
    table = {}
    for line in f:
        tokens = line.split()
        if not tokens:
            # tolerate blank lines instead of crashing on tokens[0]
            continue
        table[tokens[0]] = tokens
    return table
|
||||||
|
|
||||||
|
def qualFromLod(L):
    """Convert a LOD score *L* to a Phred-scaled integer quality.

    With X = exp(-L), the implied error probability is X / (1 + X), so the
    quality is floor(-10 * log10(X / (1 + X))).

    Returns 1000 (capped quality) when the value cannot be represented in
    floats: for large L, exp(-L) underflows to 0.0 and log10(0) raises
    ValueError; the original OverflowError cap is kept as well.
    """
    X = math.exp(-L)
    try:
        # Bug fix: the original computed log10(X/1+X) == log10(2*X) because
        # of missing parentheses; the error probability is X / (1 + X).
        return math.floor(-10 * math.log10(X / (1 + X)))
    except (OverflowError, ValueError):
        # X underflowed to 0.0 (extremely confident call) -> cap at 1000.
        return 1000
|
||||||
|
|
||||||
|
# Flat driver: convert an expanded-summary .csv of pooled calls into VCF —
# one per-pool VCF plus one combined VCF with per-pool genotype columns —
# then sort the outputs by reference order via shell commands.
# Usage: script.py <expanded summary .csv> <pipeline samples file>
# NOTE(review): indentation reconstructed from a diff view; structure inferred
# from the continue/else flow.
callFileStr = sys.argv[1]                  # kept for the ##source header line
callFile = open(callFileStr)
pipelineSamplesFile = open(sys.argv[2])    # ';'-separated: internalID;project;poolName;...
directory = os.getcwd()                    # all outputs land in the cwd
syzygyPathSamples = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/"
sortByRefPath = "/humgen/gsa-hphome1/chartl/sting/perl/sortByRef.pl"

poolNames = []
poolInternalIDs = []
poolCombinedErrorCoverageCalls = []        # one chr:pos-keyed dict per pool
proj = "" # required for an output file
for line in pipelineSamplesFile:
    ln = line.strip('\n')
    ln = ln.split(";")
    #ln = line.split("\t") # -depends on format
    # original columns: [0]=internal ID, [1]=project, [2]=pool name;
    # the successive pops below consume them in the order 2, 0, then 0.
    poolNames.append(ln.pop(2))
    piid = ln.pop(0)
    poolInternalIDs.append(piid)
    proj = ln.pop(0)
    ceccPath = syzygyPathSamples+proj+"/"+piid+"/"+proj+"."+piid+".bam.combined.error.coverage.calls"
    print("reading: "+ceccPath)
    # NOTE(review): the file handle from open() is never explicitly closed.
    poolCombinedErrorCoverageCalls.append(dictFromCombinedErrorCoverageFile(open(ceccPath)))

# one per-pool VCF output, parallel to poolNames
pooledOutputFiles = []
for pool in poolNames:
    pooledOutputFiles.append(open(directory+"/"+pool+"_calls.vcf",'w'))

# combined multi-sample VCF (proj is whatever the last samples line set it to)
pooledCallsFile = open(directory+"/"+proj+"_combined_calls.vcf",'w')

# VCF-style meta headers
header1 = "##format=PCFv1\n"
header2 = "##filedate="+time.strftime("%Y%m%d")+"\n"
header3 = "##source=expandedSummaryToVCF:"+callFileStr+"\n"
header4 = "##reference=Homo_sapiens_assembly18\n"
header5 = "##phasing=pooled\n"
header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"+"\t"+"\t".join(poolNames)+"\n" #note: really cool method

# print header
pooledCallsFile.write(header1)
pooledCallsFile.write(header2)
pooledCallsFile.write(header3)
pooledCallsFile.write(header4)
pooledCallsFile.write(header5)
pooledCallsFile.write(header6)

# print rest of headers

for i in range(len(poolNames)):
    # per-pool files have no FORMAT/sample columns; header6 is rebound here
    header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
    pooledOutputFiles[i].write(header1)
    pooledOutputFiles[i].write(header2)
    pooledOutputFiles[i].write(header3)
    pooledOutputFiles[i].write(header4)
    pooledOutputFiles[i].write(header5)
    pooledOutputFiles[i].write(header6)

for line in callFile:
    # note: file is a .csv; so comma delimited with headers
    # chr:position,alleles,gene,type,rs#,base_chg,annot,dist,pop_nr_freq,f_nr_case,f_nr_con,chisqstat_assoc,lrt_stat,f+,f-,SLOD,p_val,min_snp_dist,min_amp_dist,num_sig_pools,mean_worst_dir_nonref, ...
    # ..., max_worst_dir_nonref,mean_diff_dir_nonref,max_diff_dir_nonref,mean_other/minor,max_other/minor,mean_frac_non_concord,max_frac_non_concord,mean_combined_lod,max_combined_lod,mean_cov_tot, ...
    # ..., max_cov_tot,mean_cov_diff,max_cov_diff,mean_lod,max_lod,mean_lod_diff,max_lod_diff,mean_lod_diff_norm,max_lod_diff_norm,target_name,sig_pools

    # call file assumed already to have been split by pool

    # pileupFile is a .bam.coverage file from the pooled pipeline

    # call file isn't sorted by chr:position
    # make the header strings

    if line.startswith("chr:pos"):
        # skip the header row
        continue
    elif not line.startswith("chr"):
        # skip anything that is not a chrN:pos data row
        continue
    else:
        #print(line)
        entries = line.split(",")
        chrompos = entries[0]
        alleles = entries[1]
        # NOTE(review): assumes 'alleles' is a two-character ref+alt string
        # (e.g. "AG"); confirm against the summary-file format.
        variant = alleles[1]
        ref = alleles[0]
        dbSNP = entries[4]
        if dbSNP:
            pass
        else:
            # VCF missing-value marker when there is no rs# annotation
            dbSNP="."

        # sig_pools column, e.g. "[poolA;poolB]" -> ["poolA", "poolB"]
        supportingPools = entries.pop(62).rstrip(']').lstrip('[')
        supportingPools = supportingPools.split(";")
        total_quality = 0
        total_slod = 0
        total_depth = 0
        quality_by_pool = []
        slod_by_pool = []
        depth_by_pool = []
        #sys.exit()
        for i in range(len(poolNames)):
            #grab line from the correct dict
            depth = 0;
            slod = 0;
            qual = 0;
            try:
                ceccLine = poolCombinedErrorCoverageCalls[i][chrompos]
                depth = ceccLine[18]
                qual=qualFromLod(float(ceccLine[21]))
                # NOTE(review): the "NA" test reads column 22 but the SLOD
                # value is taken from column 23 — confirm this off-by-one is
                # intentional for the cecc file layout.
                if ceccLine[22] == "NA":
                    slod = 0;
                else:
                    try:
                        slod = math.log10(float(ceccLine[23]))
                    except OverflowError:
                        slod = -1000;
            except KeyError:
                # site absent for this pool: keep the zero defaults above
                # do nothing
                pass
            #print this out to the file
            chromsplit = chrompos.split(":")
            # per-pool VCF row; FILTER is written as literal "0"
            outstr=chromsplit[0]+"\t"+chromsplit[1]+"\t"+dbSNP+"\t"+ref+"\t"+variant+"\t"+str(qual)+"\t0\t"+"DP="+str(depth)+";SB="+str(slod)+"\n"
            pooledOutputFiles[i].write(outstr)
            #now update data
            total_slod = total_slod + float(slod)
            total_depth = total_depth + int(depth)
            total_quality = total_quality + qual
            depth_by_pool.append(depth)
            slod_by_pool.append(slod)
            quality_by_pool.append(qual)
        #now for the pooled file
        chromsplit=chrompos.split(":")
        # combined row: aggregated QUAL/DP/SB plus NP (number of supporting
        # pools) and a GT:GQ:DP:SB FORMAT for the per-pool columns below
        outstr = chromsplit[0]+"\t"+chromsplit[1]+"\t"+dbSNP+"\t"+ref+"\t"+variant+"\t"+str(total_quality)+"\t0\t"+"DP="+str(total_depth)+";SB="+str(total_slod)+";NP="+str(len(supportingPools))+"\tGT:GQ:DP:SB"
        pooledCallsFile.write(outstr)
        #propagate individual pool information
        for i in range(len(poolNames)):
            phase = "0/0"
            # NOTE(review): grep() is regex-prefix matching, so a pool name
            # that is a prefix of another pool's name would also "support".
            if grep(poolNames[i],supportingPools):
                phase = "0/1"
            else:
                phase = "0/0"
            pooledOut="\t"+phase+":"+str(quality_by_pool[i])+":"+str(depth_by_pool[i])+":"+str(slod_by_pool[i])
            pooledCallsFile.write(pooledOut)
        pooledCallsFile.write("\n")

## close all files ##

pooledCallsFile.close()

for i in range(len(poolNames)):
    pooledOutputFiles[i].close()

## sort the files -- system commands ##

# sort each per-pool VCF numerically by POS, then by reference contig order;
# written through a shared tmp.vcf in the working directory
for pool in poolNames:
    cmd1 = "cat "+directory+"/"+pool+"_calls.vcf | sort -n -k2,2 | perl "+sortByRefPath+" - /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai > "+directory+"/tmp.vcf"
    cmd2 = "cp "+directory+"/tmp.vcf "+directory+"/"+pool+"_calls.vcf"
    os.system(cmd1)
    os.system(cmd2)

# same sort for the combined file, then remove the temp file
cmd = "cat "+directory+"/"+proj+"_combined_calls.vcf | sort -n -k2,2 | perl "+sortByRefPath+" - /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai > "+directory+"/tmp.vcf"
os.system(cmd)
cmd = "cp "+directory+"/tmp.vcf "+directory+"/"+proj+"_combined_calls.vcf"
os.system(cmd)
cmd = "rm "+directory+"/tmp.vcf"
os.system(cmd)
|
||||||
Loading…
Reference in New Issue