232 lines
8.0 KiB
Python
Executable File
232 lines
8.0 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
import os
|
|
import sys
|
|
|
|
input_verbose_beds = sys.argv[1].split(",")
|
|
output_vcf_name = sys.argv[2]
|
|
|
|
vcf_header = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
|
|
format = "GT:FT"
|
|
class Indel:
|
|
def __init__(self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding,
|
|
consF,consR,refF,refR,consMM,refMM):
|
|
self.chrom = chrom
|
|
self.start = start
|
|
self.stop = stop
|
|
self.bases = bases
|
|
self.isDeletion = isDeletion
|
|
self.isFiltered = isFiltered
|
|
self.isHet = isHet
|
|
self.sample = sample
|
|
self.isCoding = isCoding
|
|
self.consForward = int(consF)
|
|
self.consReverse = int(consR)
|
|
self.refForward = int(refF)
|
|
self.refReverse = int(refR)
|
|
self.consMM = float(consMM)
|
|
self.refMM = float(refMM)
|
|
|
|
def getAnnotations__DO_NOT_USE(self):
|
|
## note: log(1/2) ~~ -0.3
|
|
forwardLod = -0.3*(self.consForward+self.refForward) + 2*self.consForward
|
|
reverseLod = -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse
|
|
totalLod = -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 2*(self.consForward+self.consReverse)
|
|
strand_score = max(forwardLod-totalLod,reverseLod-totalLod)
|
|
SB = "SB="+str(strand_score)
|
|
mismatch = "MMR="+str(self.consMM)
|
|
ref_mismatch = "refMMR="+str(self.refMM)
|
|
return [SB,mismatch,ref_mismatch]
|
|
|
|
def getTotalLod(self):
|
|
return -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 2*(self.consForward+self.consReverse)
|
|
|
|
def getFwdLod(self):
|
|
return -0.3*(self.consForward+self.refForward) + 2*self.consForward
|
|
|
|
def getRevLod(self):
|
|
return -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse
|
|
|
|
def getAnnotations(indelList):
|
|
total_lod = 0.0
|
|
total_fwd_lod = 0.0
|
|
total_rev_lod = 0.0
|
|
avg_mmr = 0.0
|
|
avg_ref_mmr = 0.0
|
|
for indel in indelList:
|
|
total_lod += indel.getTotalLod()
|
|
total_fwd_lod += indel.getFwdLod()
|
|
total_rev_lod += indel.getRevLod()
|
|
avg_mmr += indel.consMM
|
|
avg_ref_mmr += indel.refMM
|
|
avg_mmr = avg_mmr/len(indelList)
|
|
avg_ref_mmr = avg_ref_mmr/len(indelList)
|
|
strand_score = max(total_fwd_lod-total_lod,total_rev_lod-total_lod)
|
|
SB = "SB="+str(strand_score)
|
|
mismatch = "MMR="+str(avg_mmr)
|
|
ref_mismatch = "refMMR="+str(avg_ref_mmr)
|
|
return [SB,mismatch,ref_mismatch]
|
|
|
|
def getGenotypes(samples,indels,alts):
|
|
s2i = dict()
|
|
alts = alts.split(",")
|
|
genotypes = list()
|
|
for indel in indels:
|
|
s2i[indel.sample] = indel
|
|
for sample in samples:
|
|
if ( sample in s2i.keys() ):
|
|
if ( s2i[sample].isDeletion ):
|
|
bases = "D"+str(s2i[sample].bases)
|
|
else:
|
|
bases = "I"+s2i[sample].bases
|
|
gt_index = str(1+alts.index(bases))
|
|
if ( s2i[sample].isHet ):
|
|
gtstr = "0/"+gt_index
|
|
else:
|
|
gtstr = gt_index+"/"+gt_index
|
|
if ( s2i[sample].isFiltered ):
|
|
gtstr += ":1"
|
|
else:
|
|
gtstr += ":0"
|
|
else:
|
|
gtstr = "0/0:0"
|
|
genotypes.append(gtstr)
|
|
return genotypes
|
|
|
|
def getAlts(indel_list):
|
|
alts = list()
|
|
for indel in indel_list:
|
|
if ( indel.isDeletion ):
|
|
if ( not "D"+indel.bases in alts ):
|
|
alts.append("D"+indel.bases)
|
|
else:
|
|
if ( not "I"+indel.bases in alts ):
|
|
alts.append("I"+indel.bases)
|
|
|
|
alt = ",".join(alts)
|
|
|
|
return alt
|
|
|
|
def fixAlts(alts_in):
|
|
alts = alts_in.split(",")
|
|
fixed_alts = list()
|
|
for alt in alts:
|
|
if ( alt.startswith("D") ):
|
|
fixed_alts.append("D"+str(len(alt)-1))
|
|
else:
|
|
fixed_alts.append(alt)
|
|
return ",".join(fixed_alts)
|
|
|
|
def fixChrom(chrom):
|
|
if ( chrom == 0 ):
|
|
return "chrM")
|
|
if ( chrom == "23" ):
|
|
return "chrX"
|
|
if ( chrom == "24" ):
|
|
return "chrY"
|
|
return "chr"+chrom
|
|
|
|
def writeVCFLine(out_stream,indel_list,sample_list):
|
|
alts = getAlts(indel_list)
|
|
ID = "." ## ignore dbsnp annotation for now
|
|
chrom = str(indel_list[0].chrom)
|
|
start = str(indel_list[0].start)
|
|
if ( indel_list[0].isCoding ):
|
|
info = "type=Codon;"
|
|
else:
|
|
info = "type=Intron;"
|
|
info += "count="+str(len(indel_list))
|
|
# format is global
|
|
def inSampleOrdering(indel1,indel2):
|
|
return sample_list.index(indel1.sample)-sample_list.index(indel2.sample)
|
|
indel_list.sort(inSampleOrdering)
|
|
fixed_alts = fixAlts(alts)
|
|
if ( True or not fixed_alts.find(",") > -1 ):
|
|
info += ";"+";".join(getAnnotations(indel_list))
|
|
entries = [fixChrom(chrom),start,ID,"A",fixed_alts,"50","0",info,format]
|
|
for e in getGenotypes(sample_list,indel_list,alts):
|
|
entries.append(e)
|
|
out_stream.write("\t".join(entries)+"\n")
|
|
|
|
def startCompare(ind1,ind2):
|
|
if ( ind1.chrom != ind2.chrom ):
|
|
return ind1.chrom - ind2.chrom
|
|
else:
|
|
return ind1.start - ind2.start
|
|
|
|
def output(indels,popname,samples):
|
|
outfile = open(output_vcf_name+"_BAD_REFERENCE.vcf",'w')
|
|
outfile.write("##VCFv3.2\n##Created by gatherIndelsToVCF.py\n")
|
|
outfile.write("\t".join(vcf_header)+"\t"+"\t".join(samples)+"\n")
|
|
indels_for_line = list()
|
|
for indel in indels:
|
|
if ( len(indels_for_line) == 0 or startCompare(indel,indels_for_line[len(indels_for_line)-1]) == 0 ):
|
|
indels_for_line.append(indel)
|
|
else:
|
|
writeVCFLine(outfile,indels_for_line,samples)
|
|
indels_for_line = list()
|
|
outfile.close()
|
|
|
|
def parseSample(filename):
|
|
return filename.split(".",1)[0]
|
|
|
|
def parseIndels(filePath,sampleName):
|
|
f = open(filePath)
|
|
indels = list()
|
|
for line in f.readlines():
|
|
if ( not line.startswith("[") ):
|
|
spline = line.split("\t")
|
|
spline[0] = spline[0].split("chr")[1]
|
|
if ( spline[0] != "X" and spline[0] != "Y" and spline[0] != "M"):
|
|
chrom = int(spline[0])
|
|
else:
|
|
if ( spline[0] == "M" ):
|
|
chrom = 0
|
|
elif ( spline[0] == "X"):
|
|
chrom = 23
|
|
else:
|
|
chrom = 24
|
|
start = int(spline[1])
|
|
stop = int(spline[2])
|
|
rawbase = spline[3]
|
|
isDeletion = rawbase.startswith("-")
|
|
if ( isDeletion ):
|
|
bases = spline[3].split("-")[1]
|
|
else:
|
|
bases = spline[3].split("+")[1]
|
|
sample = sampleName
|
|
isFiltered = False
|
|
if ( line.find("AUTOFILTER") > -1 ):
|
|
if ( line.find("CONS_AV_MM") > -1 ):
|
|
mm = float(line.split("AV_MM[C/R]:")[1].split("/")[0])
|
|
if ( mm >= 3.5 ):
|
|
isFiltered = True
|
|
else:
|
|
isFiltered = True
|
|
isCoding = line.find("CODING") > -1
|
|
isHet = True ## haven't seen a hom yet---todo --- fix this
|
|
# STRAND_COUNTS[C/C/R/R]:
|
|
strand_counts_field = line.split("STRAND_COUNTS[C/C/R/R]:")[1].split()[0]
|
|
strand_counts = strand_counts_field.split("/")
|
|
consF = strand_counts[0]
|
|
consR = strand_counts[1]
|
|
refF = strand_counts[2]
|
|
refR = strand_counts[3]
|
|
# AV_MM[C/R]:
|
|
mismatch_field = line.split("AV_MM[C/R]:")[1].split()[0]
|
|
consMM = mismatch_field.split("/")[0]
|
|
refMM = mismatch_field.split("/")[1]
|
|
# recall order: self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding
|
|
indels.append(Indel(chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding,consF,consR,refF,refR,consMM,refMM))
|
|
return indels
|
|
|
|
def writeVCF(verbose_bed_list):
|
|
for file in verbose_bed_list:
|
|
samples.append(parseSample(file))
|
|
indels.extend(parseIndels(indel_dir+file,parseSample(file)))
|
|
indels.sort(startCompare)
|
|
output(indels,popname,samples)
|
|
|
|
writeVCF(input_verbose_beds)
|
|
|