From 3d9195f8b6c91f35f284b7457fb7ebc190fdb20d Mon Sep 17 00:00:00 2001 From: chartl Date: Tue, 3 Nov 2009 22:20:47 +0000 Subject: [PATCH] Added - converter from expanded summary to VCF (beautiful thing, really) Removed - the ugly hackjob that was expanded summary to Geli git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1970 348d0f76-0448-11de-a6fe-93d51630548a --- python/expandedSummaryToGeli.py | 73 ------------ python/expandedSummaryToVCF.py | 191 ++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 73 deletions(-) delete mode 100755 python/expandedSummaryToGeli.py create mode 100755 python/expandedSummaryToVCF.py diff --git a/python/expandedSummaryToGeli.py b/python/expandedSummaryToGeli.py deleted file mode 100755 index 92a03ab99..000000000 --- a/python/expandedSummaryToGeli.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python -import farm_commands -import os.path -import sys -import re - - -def grepFile(string, file): - return grep(string, file.readlines()) - -def grep(string, list): - expr = re.compile(string) - return [elem for elem in list if expr.match(elem)] - -callFile = open(sys.argv[1]) -pileupFileLines = open(sys.argv[2]) -callFileWithLodScores = open(sys.argv[3]) -pileupFileLines = pileupFileLines.readlines() -lodFileLines = callFileWithLodScores.readlines() - -for line in callFile: -# note: file is a .csv; so comma delimited with headers -# chr:position,alleles,gene,type,rs#,base_chg,annot,dist,pop_nr_freq,f_nr_case,f_nr_con,chisqstat_assoc,lrt_stat,f+,f-,SLOD,p_val,min_snp_dist,min_amp_dist,num_sig_pools,mean_worst_dir_nonref, ... -# ..., max_worst_dir_nonref,mean_diff_dir_nonref,max_diff_dir_nonref,mean_other/minor,max_other/minor,mean_frac_non_concord,max_frac_non_concord,mean_combined_lod,max_combined_lod,mean_cov_tot, ... 
-# ..., max_cov_tot,mean_cov_diff,max_cov_diff,mean_lod,max_lod,mean_lod_diff,max_lod_diff,mean_lod_diff_norm,max_lod_diff_norm,target_name,sig_pools - -# call file assumed already to have been split by pool - -# pileupFile is a .bam.coverage file from the pooled pipeline - -# call file isn't sorted by chr:position - if line.startswith("chr:pos"): - continue - elif not line.startswith("chr"): - continue - else: - #print(line) - entries = line.split(",") - chrompos = entries.pop(0) # (now 'alleles' is 0-element) - variant = entries.pop(0) # (now 'gene' is 0-element) - #print(chrompos) - #print(variant) - #change format of "chr:pos" to "chr pos" for lookup - #somehow this next line don't work - #chrompos.replace(":"," ") - g = chrompos.split(":"); - chromposspc = g.pop(0)+" "+g.pop(0) - #print(chrompos) - pileupLine = grep(chromposspc,pileupFileLines) - lodLine = grep(chrompos,lodFileLines) - #print(pileupLine) - # line is - # chr pos ref num_A num_C num_G num_T - if not pileupLine: - continue - else: - pileupLine = pileupLine.pop(0) - #print(pileupLine) - pileupList = pileupLine.split(" ") - ref = pileupList.pop(2) # now num_A is 2 - num_A = int(pileupList.pop(2)) - num_C = int(pileupList.pop(2)) - num_G = int(pileupList.pop(2)) - num_T = int(pileupList.pop(2)) - depth = num_A+num_C+num_G+num_T - lodLine = lodLine.pop(0) - lodLine = lodLine.split(" ") - # print(lodLine) - lod = max(float(lodLine.pop(21)),float(lodLine.pop(19))) - # output is - # chr pos ref depth mapping call btr btnb AA AC AG AT CC CG CT GG GT TT - outStr = chrompos+" "+ref+" "+str(depth)+" 5 "+variant+" "+str(lod)+" -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12" - print(outStr) diff --git a/python/expandedSummaryToVCF.py b/python/expandedSummaryToVCF.py new file mode 100755 index 000000000..37fc65fbf --- /dev/null +++ b/python/expandedSummaryToVCF.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +#import farm_commands +import os.path +import sys +import re +import time +import math + +def grepFile(string, file): + return grep(string, file.readlines()) + +def grep(string, list): + expr = re.compile(string) + return [elem for elem in list if expr.match(elem)] + +def dictFromCombinedErrorCoverageFile(f): + dictList = [] + for line in f: + ln = line.split() + key = ln[0] + item = ln + dictList.append([key,item]) + return dict(dictList) + +def qualFromLod(L): + X = math.exp(-L) + try: + return math.floor(-10*math.log10(X/1+X)) + except OverflowError: + return 1000 + +callFileStr = sys.argv[1] +callFile = open(callFileStr) +pipelineSamplesFile = open(sys.argv[2]) +directory = os.getcwd() +syzygyPathSamples = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/" +sortByRefPath = "/humgen/gsa-hphome1/chartl/sting/perl/sortByRef.pl" + +poolNames = [] +poolInternalIDs = [] +poolCombinedErrorCoverageCalls = [] +proj = "" # required for an output file +for line in pipelineSamplesFile: + ln = line.strip('\n') + ln = ln.split(";") + #ln = line.split("\t") # -depends on format + poolNames.append(ln.pop(2)) + piid = ln.pop(0) + poolInternalIDs.append(piid) + proj = ln.pop(0) + ceccPath = syzygyPathSamples+proj+"/"+piid+"/"+proj+"."+piid+".bam.combined.error.coverage.calls" + print("reading: "+ceccPath) + poolCombinedErrorCoverageCalls.append(dictFromCombinedErrorCoverageFile(open(ceccPath))) + +pooledOutputFiles = [] +for pool in poolNames: + pooledOutputFiles.append(open(directory+"/"+pool+"_calls.vcf",'w')) + +pooledCallsFile = open(directory+"/"+proj+"_combined_calls.vcf",'w') + +header1 = "##format=PCFv1\n" +header2 = 
"##filedate="+time.strftime("%Y%m%d")+"\n" +header3 = "##source=expandedSummaryToVCF:"+callFileStr+"\n" +header4 = "##reference=Homo_sapiens_assembly18\n" +header5 = "##phasing=pooled\n" +header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"+"\t"+"\t".join(poolNames)+"\n" #note: really cool method + +# print header +pooledCallsFile.write(header1) +pooledCallsFile.write(header2) +pooledCallsFile.write(header3) +pooledCallsFile.write(header4) +pooledCallsFile.write(header5) +pooledCallsFile.write(header6) + +# print rest of headers + +for i in range(len(poolNames)): + header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" + pooledOutputFiles[i].write(header1) + pooledOutputFiles[i].write(header2) + pooledOutputFiles[i].write(header3) + pooledOutputFiles[i].write(header4) + pooledOutputFiles[i].write(header5) + pooledOutputFiles[i].write(header6) + +for line in callFile: +# note: file is a .csv; so comma delimited with headers +# chr:position,alleles,gene,type,rs#,base_chg,annot,dist,pop_nr_freq,f_nr_case,f_nr_con,chisqstat_assoc,lrt_stat,f+,f-,SLOD,p_val,min_snp_dist,min_amp_dist,num_sig_pools,mean_worst_dir_nonref, ... +# ..., max_worst_dir_nonref,mean_diff_dir_nonref,max_diff_dir_nonref,mean_other/minor,max_other/minor,mean_frac_non_concord,max_frac_non_concord,mean_combined_lod,max_combined_lod,mean_cov_tot, ... +# ..., max_cov_tot,mean_cov_diff,max_cov_diff,mean_lod,max_lod,mean_lod_diff,max_lod_diff,mean_lod_diff_norm,max_lod_diff_norm,target_name,sig_pools + +# call file assumed already to have been split by pool + +# pileupFile is a .bam.coverage file from the pooled pipeline + +# call file isn't sorted by chr:position +# make the header strings + + if line.startswith("chr:pos"): + continue + elif not line.startswith("chr"): + continue + else: + #print(line) + entries = line.split(",") + chrompos = entries[0] + alleles = entries[1] + variant = alleles[1] + ref = alleles[0] + dbSNP = entries[4] + if dbSNP: + pass + else: + dbSNP="." 
+ + supportingPools = entries.pop(62).rstrip(']').lstrip('[') + supportingPools = supportingPools.split(";") + total_quality = 0 + total_slod = 0 + total_depth = 0 + quality_by_pool = [] + slod_by_pool = [] + depth_by_pool = [] + #sys.exit() + for i in range(len(poolNames)): + #grab line from the correct dict + depth = 0; + slod = 0; + qual = 0; + try: + ceccLine = poolCombinedErrorCoverageCalls[i][chrompos] + depth = ceccLine[18] + qual=qualFromLod(float(ceccLine[21])) + if ceccLine[22] == "NA": + slod = 0; + else: + try: + slod = math.log10(float(ceccLine[23])) + except OverflowError: + slod = -1000; + except KeyError: + # do nothing + pass + #print this out to the file + chromsplit = chrompos.split(":") + outstr=chromsplit[0]+"\t"+chromsplit[1]+"\t"+dbSNP+"\t"+ref+"\t"+variant+"\t"+str(qual)+"\t0\t"+"DP="+str(depth)+";SB="+str(slod)+"\n" + pooledOutputFiles[i].write(outstr) + #now update data + total_slod = total_slod + float(slod) + total_depth = total_depth + int(depth) + total_quality = total_quality + qual + depth_by_pool.append(depth) + slod_by_pool.append(slod) + quality_by_pool.append(qual) + #now for the pooled file + chromsplit=chrompos.split(":") + outstr = chromsplit[0]+"\t"+chromsplit[1]+"\t"+dbSNP+"\t"+ref+"\t"+variant+"\t"+str(total_quality)+"\t0\t"+"DP="+str(total_depth)+";SB="+str(total_slod)+";NP="+str(len(supportingPools))+"\tGT:GQ:DP:SB" + pooledCallsFile.write(outstr) + #propagate individual pool information + for i in range(len(poolNames)): + phase = "0/0" + if grep(poolNames[i],supportingPools): + phase = "0/1" + else: + phase = "0/0" + pooledOut="\t"+phase+":"+str(quality_by_pool[i])+":"+str(depth_by_pool[i])+":"+str(slod_by_pool[i]) + pooledCallsFile.write(pooledOut) + pooledCallsFile.write("\n") + +## close all files ## + +pooledCallsFile.close() + +for i in range(len(poolNames)): + pooledOutputFiles[i].close() + +## sort the files -- system commands ## + +for pool in poolNames: + cmd1 = "cat "+directory+"/"+pool+"_calls.vcf | sort -n -k2,2 | perl "+sortByRefPath+" - /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai > "+directory+"/tmp.vcf" + cmd2 = "cp "+directory+"/tmp.vcf "+directory+"/"+pool+"_calls.vcf" + os.system(cmd1) + os.system(cmd2) + +cmd = "cat "+directory+"/"+proj+"_combined_calls.vcf | sort -n -k2,2 | perl "+sortByRefPath+" - /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai > "+directory+"/tmp.vcf" +os.system(cmd) +cmd = "cp "+directory+"/tmp.vcf "+directory+"/"+proj+"_combined_calls.vcf" +os.system(cmd) +cmd = "rm "+directory+"/tmp.vcf" +os.system(cmd)
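
Note on the added converter: per site, expandedSummaryToVCF.py does two core things. It turns a pooled LOD score into a phred-scaled quality (qualFromLod), and it writes one combined VCF data line whose INFO carries DP/SB/NP totals and whose per-pool sample fields follow the GT:GQ:DP:SB format declared in the header. Below is a minimal, self-contained sketch of just those two steps; it is not the committed script, and the site, pool stats, and values are hypothetical. The sketch also assumes the error probability intended in qualFromLod is X/(1+X); as committed, math.log10(X/1+X) parses as math.log10(2*X) because of Python operator precedence.

#!/usr/bin/env python
# Minimal sketch (not the committed expandedSummaryToVCF.py): shows the
# LOD -> phred-quality conversion and the assembly of one combined VCF
# record with per-pool GT:GQ:DP:SB fields. All inputs are hypothetical.
import math

def qual_from_lod(lod):
    """Phred-scaled quality from a LOD score.

    Assumes the intended error probability is X/(1+X) with X = exp(-lod);
    the committed qualFromLod writes X/1+X, which evaluates to 2*X.
    """
    x = math.exp(-lod)
    try:
        return int(math.floor(-10 * math.log10(x / (1 + x))))
    except (OverflowError, ValueError):
        # cap extremely confident calls, as the committed code caps at 1000
        return 1000

def combined_vcf_record(chrompos, dbsnp, ref, alt, pool_stats):
    """Build one tab-delimited VCF data line from per-pool
    (qual, depth, slod, supports_variant) tuples."""
    chrom, pos = chrompos.split(":")
    total_qual = sum(q for q, d, s, _ in pool_stats)
    total_depth = sum(d for q, d, s, _ in pool_stats)
    total_slod = sum(s for q, d, s, _ in pool_stats)
    n_pools = sum(1 for q, d, s, supported in pool_stats if supported)
    fields = [chrom, pos, dbsnp, ref, alt, str(total_qual), "0",
              "DP=%d;SB=%.3f;NP=%d" % (total_depth, total_slod, n_pools),
              "GT:GQ:DP:SB"]
    for qual, depth, slod, supported in pool_stats:
        # pools listed among the supporting pools get a het genotype, as in
        # the committed script's grep(poolNames[i], supportingPools) check
        gt = "0/1" if supported else "0/0"
        fields.append("%s:%d:%d:%.3f" % (gt, qual, depth, slod))
    return "\t".join(fields)

if __name__ == "__main__":
    # Hypothetical site with two pools, one supporting the variant call.
    stats = [(qual_from_lod(5.2), 480, 1.7, True),
             (qual_from_lod(0.3), 512, 0.0, False)]
    print(combined_vcf_record("chr1:1105366", ".", "A", "G", stats))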