diff --git a/python/gatherIndelsToVCF.py b/python/gatherIndelsToVCF.py new file mode 100755 index 000000000..3a2a45a19 --- /dev/null +++ b/python/gatherIndelsToVCF.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python + +import os +import sys + +input_verbose_beds = sys.argv[1].split(",") +output_vcf_name = sys.argv[2] +reference_file = sys.argv[3] + +vcf_header = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] +format = "GT:FT" +class Indel: + def __init__(self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding, + consF,consR,refF,refR,consMM,refMM): + self.chrom = chrom + self.start = start + self.stop = stop + self.bases = bases + self.isDeletion = isDeletion + self.isFiltered = isFiltered + self.isHet = isHet + self.sample = sample + self.isCoding = isCoding + self.consForward = int(consF) + self.consReverse = int(consR) + self.refForward = int(refF) + self.refReverse = int(refR) + self.consMM = float(consMM) + self.refMM = float(refMM) + + def getAnnotations__DO_NOT_USE(self): + ## note: log(1/2) ~~ -0.3 + forwardLod = -0.3*(self.consForward+self.refForward) + 2*self.consForward + reverseLod = -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse + totalLod = -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 2*(self.consForward+self.consReverse) + strand_score = max(forwardLod-totalLod,reverseLod-totalLod) + SB = "SB="+str(strand_score) + mismatch = "MMR="+str(self.consMM) + ref_mismatch = "refMMR="+str(self.refMM) + return [SB,mismatch,ref_mismatch] + + def getTotalLod(self): + return -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 2*(self.consForward+self.consReverse) + + def getFwdLod(self): + return -0.3*(self.consForward+self.refForward) + 2*self.consForward + + def getRevLod(self): + return -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse + +def getAnnotations(indelList): + total_lod = 0.0 + total_fwd_lod = 0.0 + total_rev_lod = 0.0 + avg_mmr = 0.0 + avg_ref_mmr = 0.0 + for indel in indelList: + total_lod += indel.getTotalLod() + total_fwd_lod += indel.getFwdLod() + total_rev_lod += indel.getRevLod() + avg_mmr += indel.consMM + avg_ref_mmr += indel.refMM + avg_mmr = avg_mmr/len(indelList) + avg_ref_mmr = avg_ref_mmr/len(indelList) + strand_score = max(total_fwd_lod-total_lod,total_rev_lod-total_lod) + SB = "SB="+str(strand_score) + mismatch = "MMR="+str(avg_mmr) + ref_mismatch = "refMMR="+str(avg_ref_mmr) + return [SB,mismatch,ref_mismatch] + +def getGenotypes(samples,indels,alts): + s2i = dict() + alts = alts.split(",") + genotypes = list() + for indel in indels: + s2i[indel.sample] = indel + for sample in samples: + if ( sample in s2i.keys() ): + if ( s2i[sample].isDeletion ): + bases = "D"+str(s2i[sample].bases) + else: + bases = "I"+s2i[sample].bases + gt_index = str(1+alts.index(bases)) + if ( s2i[sample].isHet ): + gtstr = "0/"+gt_index + else: + gtstr = gt_index+"/"+gt_index + if ( s2i[sample].isFiltered ): + gtstr += ":1" + else: + gtstr += ":0" + else: + gtstr = "0/0:0" + genotypes.append(gtstr) + return genotypes + +def getAlts(indel_list): + alts = list() + for indel in indel_list: + if ( indel.isDeletion ): + if ( not "D"+indel.bases in alts ): + alts.append("D"+indel.bases) + else: + if ( not "I"+indel.bases in alts ): + alts.append("I"+indel.bases) + + alt = ",".join(alts) + + return alt + +def fixAlts(alts_in): + alts = alts_in.split(",") + fixed_alts = list() + for alt in alts: + if ( alt.startswith("D") ): + fixed_alts.append("D"+str(len(alt)-1)) + else: + fixed_alts.append(alt) + return ",".join(fixed_alts) + +def fixChrom(chrom): + if ( chrom == "23" ): + return "X" + if ( chrom == "24" ): + return "Y" + return chrom + +def writeVCFLine(out_stream,indel_list,sample_list): + alts = getAlts(indel_list) + ID = "." ## ignore dbsnp annotation for now + chrom = str(indel_list[0].chrom) + start = str(indel_list[0].start) + if ( indel_list[0].isCoding ): + info = "type=Codon;" + else: + info = "type=Intron;" + info += "count="+str(len(indel_list)) + # format is global + def inSampleOrdering(indel1,indel2): + return sample_list.index(indel1.sample)-sample_list.index(indel2.sample) + indel_list.sort(inSampleOrdering) + fixed_alts = fixAlts(alts) + if ( True or not fixed_alts.find(",") > -1 ): + info += ";"+";".join(getAnnotations(indel_list)) + entries = [fixChrom(chrom),start,ID,"A",fixed_alts,"50","0",info,format] + for e in getGenotypes(sample_list,indel_list,alts): + entries.append(e) + out_stream.write("\t".join(entries)+"\n") + +def startCompare(ind1,ind2): + if ( ind1.chrom != ind2.chrom ): + return ind1.chrom - ind2.chrom + else: + return ind1.start - ind2.start + +def output(indels,popname,samples): + outfile = open(output_vcf_name+"_BAD_REFERENCE.vcf",'w') + outfile.write("##VCFv3.2\n##Created by gatherIndelsToVCF.py\n") + outfile.write("\t".join(vcf_header)+"\t"+"\t".join(samples)+"\n") + indels_for_line = list() + for indel in indels: + if ( len(indels_for_line) == 0 or startCompare(indel,indels_for_line[len(indels_for_line)-1]) == 0 ): + indels_for_line.append(indel) + else: + writeVCFLine(outfile,indels_for_line,samples) + indels_for_line = list() + outfile.close() + +def parseSample(filename): + return filename.split(".",1)[0] + +def parseIndels(filePath,sampleName): + f = open(filePath) + indels = list() + for line in f.readlines(): + if ( not line.startswith("[") ): + spline = line.split("\t") + if ( spline[0] != "X" and spline[0] != "Y" ): + chrom = int(spline[0]) + else: + if ( spline[0] == "X"): + chrom = 23 + else: + chrom = 24 + start = int(spline[1]) + stop = int(spline[2]) + rawbase = spline[3] + isDeletion = rawbase.startswith("-") + if ( isDeletion ): + bases = spline[3].split("-")[1] + else: + bases = spline[3].split("+")[1] + sample = sampleName + isFiltered = False + if ( line.find("AUTOFILTER") > -1 ): + if ( line.find("CONS_AV_MM") > -1 ): + mm = float(line.split("AV_MM[C/R]:")[1].split("/")[0]) + if ( mm >= 3.5 ): + isFiltered = True + else: + isFiltered = True + isCoding = line.find("CODING") > -1 + isHet = True ## haven't seen a hom yet---todo --- fix this + # STRAND_COUNTS[C/C/R/R]: + strand_counts_field = line.split("STRAND_COUNTS[C/C/R/R]:")[1].split()[0] + strand_counts = strand_counts_field.split("/") + consF = strand_counts[0] + consR = strand_counts[1] + refF = strand_counts[2] + refR = strand_counts[3] + # AV_MM[C/R]: + mismatch_field = line.split("AV_MM[C/R]:")[1].split()[0] + consMM = mismatch_field.split("/")[0] + refMM = mismatch_field.split("/")[1] + # recall order: self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding + indels.append(Indel(chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding,consF,consR,refF,refR,consMM,refMM)) + return indels + +def writeVCF(verbose_bed_list): + for file in verbose_bed_list: + samples.append(parseSample(file)) + indels.extend(parseIndels(indel_dir+file,parseSample(file))) + indels.sort(startCompare) + output(indels,popname,samples) + +writeVCF(input_verbose_beds) + +## now dumb hacky thing + +cmd = "java -jar /humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar -T VCFReferenceFixer -B fixme,VCF,"+output_vcf_name+"_BAD_REFERENCE.vcf -o "+output_vcf_name+" -R "+reference_file+" -l INFO" +os.system(cmd)