Added - converter from expanded summary to VCF (beautiful thing, really)
Removed - the ugly hackjob that was expanded summary to Geli git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1970 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a545859c62
commit
3d9195f8b6
|
|
@ -1,73 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
import farm_commands
|
|
||||||
import os.path
|
|
||||||
import sys
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
def grepFile(string, file):
    """Read *file* fully and return its lines whose start matches regex *string*."""
    contents = file.readlines()
    return grep(string, contents)
|
|
||||||
|
|
||||||
def grep(string, list):
    """Return the items of *list* whose beginning matches the regex *string*.

    Uses re.match semantics: the pattern is anchored at the start of each
    element but need not consume the whole element.
    """
    pattern = re.compile(string)
    hits = []
    for item in list:
        if pattern.match(item):
            hits.append(item)
    return hits
|
|
||||||
|
|
||||||
# Flat driver: convert an expanded-summary .csv of pooled calls into a
# Geli-style line per variant, joining against a pileup coverage file and a
# LOD-score file.  Usage: script.py <calls.csv> <pileup coverage> <lod file>
# NOTE(review): indentation reconstructed from a diff view; structure inferred
# from the continue/else flow.
callFile = open(sys.argv[1])               # pooled call file (.csv, has header)
pileupFileLines = open(sys.argv[2])        # per-base coverage file
callFileWithLodScores = open(sys.argv[3])  # same calls, with LOD score columns
pileupFileLines = pileupFileLines.readlines()   # rebind: file object -> list of lines
lodFileLines = callFileWithLodScores.readlines()

for line in callFile:
    # note: file is a .csv; so comma delimited with headers
    # chr:position,alleles,gene,type,rs#,base_chg,annot,dist,pop_nr_freq,f_nr_case,f_nr_con,chisqstat_assoc,lrt_stat,f+,f-,SLOD,p_val,min_snp_dist,min_amp_dist,num_sig_pools,mean_worst_dir_nonref, ...
    # ..., max_worst_dir_nonref,mean_diff_dir_nonref,max_diff_dir_nonref,mean_other/minor,max_other/minor,mean_frac_non_concord,max_frac_non_concord,mean_combined_lod,max_combined_lod,mean_cov_tot, ...
    # ..., max_cov_tot,mean_cov_diff,max_cov_diff,mean_lod,max_lod,mean_lod_diff,max_lod_diff,mean_lod_diff_norm,max_lod_diff_norm,target_name,sig_pools

    # call file assumed already to have been split by pool

    # pileupFile is a .bam.coverage file from the pooled pipeline

    # call file isn't sorted by chr:position
    if line.startswith("chr:pos"):
        # skip the header row
        continue
    elif not line.startswith("chr"):
        # skip anything that is not a chrN:pos data row
        continue
    else:
        #print(line)
        entries = line.split(",")
        chrompos = entries.pop(0) # (now 'alleles' is 0-element)
        variant = entries.pop(0) # (now 'gene' is 0-element)
        #print(chrompos)
        #print(variant)
        #change format of "chr:pos" to "chr pos" for lookup
        #somehow this next line don't work
        #chrompos.replace(":"," ")
        # (str.replace returns a new string; the result was never assigned,
        # hence the split/join workaround below)
        g = chrompos.split(":");
        chromposspc = g.pop(0)+" "+g.pop(0)
        #print(chrompos)
        # NOTE(review): grep uses regex-prefix matching, so "chr1 100" also
        # matches "chr1 1000" lines — only the first hit is used below.
        pileupLine = grep(chromposspc,pileupFileLines)
        lodLine = grep(chrompos,lodFileLines)
        #print(pileupLine)
        # line is
        # chr pos ref num_A num_C num_G num_T
        if not pileupLine:
            # no coverage record for this site: drop the call
            continue
        else:
            pileupLine = pileupLine.pop(0)   # first matching coverage line
            #print(pileupLine)
            pileupList = pileupLine.split(" ")
            ref = pileupList.pop(2) # now num_A is 2
            # successive pop(2) consumes the four base counts in order
            num_A = int(pileupList.pop(2))
            num_C = int(pileupList.pop(2))
            num_G = int(pileupList.pop(2))
            num_T = int(pileupList.pop(2))
            depth = num_A+num_C+num_G+num_T
            # NOTE(review): lodLine may be empty here (no LOD record) and
            # pop(0) would raise IndexError — unguarded, unlike pileupLine.
            lodLine = lodLine.pop(0)
            lodLine = lodLine.split(" ")
            # print(lodLine)
            # pop(21) first, then pop(19): index 19 is unaffected by the
            # removal at 21, so both original columns 21 and 19 are read.
            lod = max(float(lodLine.pop(21)),float(lodLine.pop(19)))
            # output is
            # chr pos ref depth mapping call btr btnb AA AC AG AT CC CG CT GG GT TT
            # ("5" and the trailing negative numbers are placeholder fields)
            outStr = chrompos+" "+ref+" "+str(depth)+" 5 "+variant+" "+str(lod)+" -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12"
            print(outStr)
|
|
||||||
|
|
@ -0,0 +1,191 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
#import farm_commands
|
||||||
|
import os.path
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
|
||||||
|
def grepFile(string, file):
    """Slurp *file* and hand its lines to grep(), returning the matches."""
    return grep(string, file.readlines())
|
||||||
|
|
||||||
|
def grep(string, list):
    """Filter *list* down to the elements whose start matches regex *string*."""
    compiled = re.compile(string)
    return [entry for entry in list if compiled.match(entry)]
|
||||||
|
|
||||||
|
def dictFromCombinedErrorCoverageFile(f):
    """Index a combined.error.coverage.calls file by its first column.

    Each line of *f* (a file object or any iterable of strings) is
    whitespace-split; the first token becomes the key and the full token
    list the value.  Later duplicate keys overwrite earlier ones, matching
    the original list-of-pairs -> dict() construction.

    Improvements over the original: builds the dict directly instead of a
    temporary list of [key, item] pairs, and skips blank/whitespace-only
    lines (the original raised IndexError on them).
    """
    table = {}
    for line in f:
        tokens = line.split()
        if not tokens:
            # tolerate blank lines instead of crashing on tokens[0]
            continue
        table[tokens[0]] = tokens
    return table
|
||||||
|
|
||||||
|
def qualFromLod(L):
    """Convert a LOD score *L* to a Phred-scaled integer quality.

    With X = exp(-L), the implied error probability is X / (1 + X), so the
    quality is floor(-10 * log10(X / (1 + X))).

    Returns 1000 (capped quality) when the value cannot be represented in
    floats: for large L, exp(-L) underflows to 0.0 and log10(0) raises
    ValueError; the original OverflowError cap is kept as well.
    """
    X = math.exp(-L)
    try:
        # Bug fix: the original computed log10(X/1+X) == log10(2*X) because
        # of missing parentheses; the error probability is X / (1 + X).
        return math.floor(-10 * math.log10(X / (1 + X)))
    except (OverflowError, ValueError):
        # X underflowed to 0.0 (extremely confident call) -> cap at 1000.
        return 1000
|
||||||
|
|
||||||
|
# Flat driver: convert an expanded-summary .csv of pooled calls into VCF —
# one per-pool VCF plus one combined VCF with per-pool genotype columns —
# then sort the outputs by reference order via shell commands.
# Usage: script.py <expanded summary .csv> <pipeline samples file>
# NOTE(review): indentation reconstructed from a diff view; structure inferred
# from the continue/else flow.
callFileStr = sys.argv[1]                  # kept for the ##source header line
callFile = open(callFileStr)
pipelineSamplesFile = open(sys.argv[2])    # ';'-separated: internalID;project;poolName;...
directory = os.getcwd()                    # all outputs land in the cwd
syzygyPathSamples = "/humgen/gsa-hphome1/flannick/pfizer/pspipeline/output/samples/"
sortByRefPath = "/humgen/gsa-hphome1/chartl/sting/perl/sortByRef.pl"

poolNames = []
poolInternalIDs = []
poolCombinedErrorCoverageCalls = []        # one chr:pos-keyed dict per pool
proj = "" # required for an output file
for line in pipelineSamplesFile:
    ln = line.strip('\n')
    ln = ln.split(";")
    #ln = line.split("\t") # -depends on format
    # original columns: [0]=internal ID, [1]=project, [2]=pool name;
    # the successive pops below consume them in the order 2, 0, then 0.
    poolNames.append(ln.pop(2))
    piid = ln.pop(0)
    poolInternalIDs.append(piid)
    proj = ln.pop(0)
    ceccPath = syzygyPathSamples+proj+"/"+piid+"/"+proj+"."+piid+".bam.combined.error.coverage.calls"
    print("reading: "+ceccPath)
    # NOTE(review): the file handle from open() is never explicitly closed.
    poolCombinedErrorCoverageCalls.append(dictFromCombinedErrorCoverageFile(open(ceccPath)))

# one per-pool VCF output, parallel to poolNames
pooledOutputFiles = []
for pool in poolNames:
    pooledOutputFiles.append(open(directory+"/"+pool+"_calls.vcf",'w'))

# combined multi-sample VCF (proj is whatever the last samples line set it to)
pooledCallsFile = open(directory+"/"+proj+"_combined_calls.vcf",'w')

# VCF-style meta headers
header1 = "##format=PCFv1\n"
header2 = "##filedate="+time.strftime("%Y%m%d")+"\n"
header3 = "##source=expandedSummaryToVCF:"+callFileStr+"\n"
header4 = "##reference=Homo_sapiens_assembly18\n"
header5 = "##phasing=pooled\n"
header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"+"\t"+"\t".join(poolNames)+"\n" #note: really cool method

# print header
pooledCallsFile.write(header1)
pooledCallsFile.write(header2)
pooledCallsFile.write(header3)
pooledCallsFile.write(header4)
pooledCallsFile.write(header5)
pooledCallsFile.write(header6)

# print rest of headers

for i in range(len(poolNames)):
    # per-pool files have no FORMAT/sample columns; header6 is rebound here
    header6 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
    pooledOutputFiles[i].write(header1)
    pooledOutputFiles[i].write(header2)
    pooledOutputFiles[i].write(header3)
    pooledOutputFiles[i].write(header4)
    pooledOutputFiles[i].write(header5)
    pooledOutputFiles[i].write(header6)

for line in callFile:
    # note: file is a .csv; so comma delimited with headers
    # chr:position,alleles,gene,type,rs#,base_chg,annot,dist,pop_nr_freq,f_nr_case,f_nr_con,chisqstat_assoc,lrt_stat,f+,f-,SLOD,p_val,min_snp_dist,min_amp_dist,num_sig_pools,mean_worst_dir_nonref, ...
    # ..., max_worst_dir_nonref,mean_diff_dir_nonref,max_diff_dir_nonref,mean_other/minor,max_other/minor,mean_frac_non_concord,max_frac_non_concord,mean_combined_lod,max_combined_lod,mean_cov_tot, ...
    # ..., max_cov_tot,mean_cov_diff,max_cov_diff,mean_lod,max_lod,mean_lod_diff,max_lod_diff,mean_lod_diff_norm,max_lod_diff_norm,target_name,sig_pools

    # call file assumed already to have been split by pool

    # pileupFile is a .bam.coverage file from the pooled pipeline

    # call file isn't sorted by chr:position
    # make the header strings

    if line.startswith("chr:pos"):
        # skip the header row
        continue
    elif not line.startswith("chr"):
        # skip anything that is not a chrN:pos data row
        continue
    else:
        #print(line)
        entries = line.split(",")
        chrompos = entries[0]
        alleles = entries[1]
        # NOTE(review): assumes 'alleles' is a two-character ref+alt string
        # (e.g. "AG"); confirm against the summary-file format.
        variant = alleles[1]
        ref = alleles[0]
        dbSNP = entries[4]
        if dbSNP:
            pass
        else:
            # VCF missing-value marker when there is no rs# annotation
            dbSNP="."

        # sig_pools column, e.g. "[poolA;poolB]" -> ["poolA", "poolB"]
        supportingPools = entries.pop(62).rstrip(']').lstrip('[')
        supportingPools = supportingPools.split(";")
        total_quality = 0
        total_slod = 0
        total_depth = 0
        quality_by_pool = []
        slod_by_pool = []
        depth_by_pool = []
        #sys.exit()
        for i in range(len(poolNames)):
            #grab line from the correct dict
            depth = 0;
            slod = 0;
            qual = 0;
            try:
                ceccLine = poolCombinedErrorCoverageCalls[i][chrompos]
                depth = ceccLine[18]
                qual=qualFromLod(float(ceccLine[21]))
                # NOTE(review): the "NA" test reads column 22 but the SLOD
                # value is taken from column 23 — confirm this off-by-one is
                # intentional for the cecc file layout.
                if ceccLine[22] == "NA":
                    slod = 0;
                else:
                    try:
                        slod = math.log10(float(ceccLine[23]))
                    except OverflowError:
                        slod = -1000;
            except KeyError:
                # site absent for this pool: keep the zero defaults above
                # do nothing
                pass
            #print this out to the file
            chromsplit = chrompos.split(":")
            # per-pool VCF row; FILTER is written as literal "0"
            outstr=chromsplit[0]+"\t"+chromsplit[1]+"\t"+dbSNP+"\t"+ref+"\t"+variant+"\t"+str(qual)+"\t0\t"+"DP="+str(depth)+";SB="+str(slod)+"\n"
            pooledOutputFiles[i].write(outstr)
            #now update data
            total_slod = total_slod + float(slod)
            total_depth = total_depth + int(depth)
            total_quality = total_quality + qual
            depth_by_pool.append(depth)
            slod_by_pool.append(slod)
            quality_by_pool.append(qual)
        #now for the pooled file
        chromsplit=chrompos.split(":")
        # combined row: aggregated QUAL/DP/SB plus NP (number of supporting
        # pools) and a GT:GQ:DP:SB FORMAT for the per-pool columns below
        outstr = chromsplit[0]+"\t"+chromsplit[1]+"\t"+dbSNP+"\t"+ref+"\t"+variant+"\t"+str(total_quality)+"\t0\t"+"DP="+str(total_depth)+";SB="+str(total_slod)+";NP="+str(len(supportingPools))+"\tGT:GQ:DP:SB"
        pooledCallsFile.write(outstr)
        #propagate individual pool information
        for i in range(len(poolNames)):
            phase = "0/0"
            # NOTE(review): grep() is regex-prefix matching, so a pool name
            # that is a prefix of another pool's name would also "support".
            if grep(poolNames[i],supportingPools):
                phase = "0/1"
            else:
                phase = "0/0"
            pooledOut="\t"+phase+":"+str(quality_by_pool[i])+":"+str(depth_by_pool[i])+":"+str(slod_by_pool[i])
            pooledCallsFile.write(pooledOut)
        pooledCallsFile.write("\n")

## close all files ##

pooledCallsFile.close()

for i in range(len(poolNames)):
    pooledOutputFiles[i].close()

## sort the files -- system commands ##

# sort each per-pool VCF numerically by POS, then by reference contig order;
# written through a shared tmp.vcf in the working directory
for pool in poolNames:
    cmd1 = "cat "+directory+"/"+pool+"_calls.vcf | sort -n -k2,2 | perl "+sortByRefPath+" - /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai > "+directory+"/tmp.vcf"
    cmd2 = "cp "+directory+"/tmp.vcf "+directory+"/"+pool+"_calls.vcf"
    os.system(cmd1)
    os.system(cmd2)

# same sort for the combined file, then remove the temp file
cmd = "cat "+directory+"/"+proj+"_combined_calls.vcf | sort -n -k2,2 | perl "+sortByRefPath+" - /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai > "+directory+"/tmp.vcf"
os.system(cmd)
cmd = "cp "+directory+"/tmp.vcf "+directory+"/"+proj+"_combined_calls.vcf"
os.system(cmd)
cmd = "rm "+directory+"/tmp.vcf"
os.system(cmd)
|
||||||
Loading…
Reference in New Issue