#!/usr/bin/env python import os import sys input_verbose_beds = sys.argv[1].split(",") output_vcf_name = sys.argv[2] vcf_header = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] format = "GT:FT" class Indel: def __init__(self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding, consF,consR,refF,refR,consMM,refMM): self.chrom = chrom self.start = start self.stop = stop self.bases = bases self.isDeletion = isDeletion self.isFiltered = isFiltered self.isHet = isHet self.sample = sample self.isCoding = isCoding self.consForward = int(consF) self.consReverse = int(consR) self.refForward = int(refF) self.refReverse = int(refR) self.consMM = float(consMM) self.refMM = float(refMM) def getAnnotations__DO_NOT_USE(self): ## note: log(1/2) ~~ -0.3 forwardLod = -0.3*(self.consForward+self.refForward) + 2*self.consForward reverseLod = -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse totalLod = -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 2*(self.consForward+self.consReverse) strand_score = max(forwardLod-totalLod,reverseLod-totalLod) SB = "SB="+str(strand_score) mismatch = "MMR="+str(self.consMM) ref_mismatch = "refMMR="+str(self.refMM) return [SB,mismatch,ref_mismatch] def getTotalLod(self): return -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 2*(self.consForward+self.consReverse) def getFwdLod(self): return -0.3*(self.consForward+self.refForward) + 2*self.consForward def getRevLod(self): return -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse def getAnnotations(indelList): total_lod = 0.0 total_fwd_lod = 0.0 total_rev_lod = 0.0 avg_mmr = 0.0 avg_ref_mmr = 0.0 for indel in indelList: total_lod += indel.getTotalLod() total_fwd_lod += indel.getFwdLod() total_rev_lod += indel.getRevLod() avg_mmr += indel.consMM avg_ref_mmr += indel.refMM avg_mmr = avg_mmr/len(indelList) avg_ref_mmr = avg_ref_mmr/len(indelList) strand_score = max(total_fwd_lod-total_lod,total_rev_lod-total_lod) SB = "SB="+str(strand_score) mismatch = "MMR="+str(avg_mmr) ref_mismatch = "refMMR="+str(avg_ref_mmr) return [SB,mismatch,ref_mismatch] def getGenotypes(samples,indels,alts): s2i = dict() alts = alts.split(",") genotypes = list() for indel in indels: s2i[indel.sample] = indel for sample in samples: if ( sample in s2i.keys() ): if ( s2i[sample].isDeletion ): bases = "D"+str(s2i[sample].bases) else: bases = "I"+s2i[sample].bases gt_index = str(1+alts.index(bases)) if ( s2i[sample].isHet ): gtstr = "0/"+gt_index else: gtstr = gt_index+"/"+gt_index if ( s2i[sample].isFiltered ): gtstr += ":1" else: gtstr += ":0" else: gtstr = "0/0:0" genotypes.append(gtstr) return genotypes def getAlts(indel_list): alts = list() for indel in indel_list: if ( indel.isDeletion ): if ( not "D"+indel.bases in alts ): alts.append("D"+indel.bases) else: if ( not "I"+indel.bases in alts ): alts.append("I"+indel.bases) alt = ",".join(alts) return alt def fixAlts(alts_in): alts = alts_in.split(",") fixed_alts = list() for alt in alts: if ( alt.startswith("D") ): fixed_alts.append("D"+str(len(alt)-1)) else: fixed_alts.append(alt) return ",".join(fixed_alts) def fixChrom(chrom): if ( chrom == 0 ): return "chrM") if ( chrom == "23" ): return "chrX" if ( chrom == "24" ): return "chrY" return "chr"+chrom def writeVCFLine(out_stream,indel_list,sample_list): alts = getAlts(indel_list) ID = "." ## ignore dbsnp annotation for now chrom = str(indel_list[0].chrom) start = str(indel_list[0].start) if ( indel_list[0].isCoding ): info = "type=Codon;" else: info = "type=Intron;" info += "count="+str(len(indel_list)) # format is global def inSampleOrdering(indel1,indel2): return sample_list.index(indel1.sample)-sample_list.index(indel2.sample) indel_list.sort(inSampleOrdering) fixed_alts = fixAlts(alts) if ( True or not fixed_alts.find(",") > -1 ): info += ";"+";".join(getAnnotations(indel_list)) entries = [fixChrom(chrom),start,ID,"A",fixed_alts,"50","0",info,format] for e in getGenotypes(sample_list,indel_list,alts): entries.append(e) out_stream.write("\t".join(entries)+"\n") def startCompare(ind1,ind2): if ( ind1.chrom != ind2.chrom ): return ind1.chrom - ind2.chrom else: return ind1.start - ind2.start def output(indels,popname,samples): outfile = open(output_vcf_name+"_BAD_REFERENCE.vcf",'w') outfile.write("##VCFv3.2\n##Created by gatherIndelsToVCF.py\n") outfile.write("\t".join(vcf_header)+"\t"+"\t".join(samples)+"\n") indels_for_line = list() for indel in indels: if ( len(indels_for_line) == 0 or startCompare(indel,indels_for_line[len(indels_for_line)-1]) == 0 ): indels_for_line.append(indel) else: writeVCFLine(outfile,indels_for_line,samples) indels_for_line = list() outfile.close() def parseSample(filename): return filename.split(".",1)[0] def parseIndels(filePath,sampleName): f = open(filePath) indels = list() for line in f.readlines(): if ( not line.startswith("[") ): spline = line.split("\t") spline[0] = spline[0].split("chr")[1] if ( spline[0] != "X" and spline[0] != "Y" and spline[0] != "M"): chrom = int(spline[0]) else: if ( spline[0] == "M" ): chrom = 0 elif ( spline[0] == "X"): chrom = 23 else: chrom = 24 start = int(spline[1]) stop = int(spline[2]) rawbase = spline[3] isDeletion = rawbase.startswith("-") if ( isDeletion ): bases = spline[3].split("-")[1] else: bases = spline[3].split("+")[1] sample = sampleName isFiltered = False if ( line.find("AUTOFILTER") > -1 ): if ( line.find("CONS_AV_MM") > -1 ): mm = float(line.split("AV_MM[C/R]:")[1].split("/")[0]) if ( mm >= 3.5 ): isFiltered = True else: isFiltered = True isCoding = line.find("CODING") > -1 isHet = True ## haven't seen a hom yet---todo --- fix this # STRAND_COUNTS[C/C/R/R]: strand_counts_field = line.split("STRAND_COUNTS[C/C/R/R]:")[1].split()[0] strand_counts = strand_counts_field.split("/") consF = strand_counts[0] consR = strand_counts[1] refF = strand_counts[2] refR = strand_counts[3] # AV_MM[C/R]: mismatch_field = line.split("AV_MM[C/R]:")[1].split()[0] consMM = mismatch_field.split("/")[0] refMM = mismatch_field.split("/")[1] # recall order: self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding indels.append(Indel(chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding,consF,consR,refF,refR,consMM,refMM)) return indels def writeVCF(verbose_bed_list): for file in verbose_bed_list: samples.append(parseSample(file)) indels.extend(parseIndels(indel_dir+file,parseSample(file))) indels.sort(startCompare) output(indels,popname,samples) writeVCF(input_verbose_beds)