gatk-3.8/python/gatherIndelsToVCF.py

232 lines
8.1 KiB
Python
Executable File

#!/usr/bin/env python
import os
import sys
input_verbose_beds = sys.argv[1].split(",")
output_vcf_name = sys.argv[2]
reference_file = sys.argv[3]
vcf_header = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
format = "GT:FT"
class Indel:
def __init__(self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding,
consF,consR,refF,refR,consMM,refMM):
self.chrom = chrom
self.start = start
self.stop = stop
self.bases = bases
self.isDeletion = isDeletion
self.isFiltered = isFiltered
self.isHet = isHet
self.sample = sample
self.isCoding = isCoding
self.consForward = int(consF)
self.consReverse = int(consR)
self.refForward = int(refF)
self.refReverse = int(refR)
self.consMM = float(consMM)
self.refMM = float(refMM)
def getAnnotations__DO_NOT_USE(self):
## note: log(1/2) ~~ -0.3
forwardLod = -0.3*(self.consForward+self.refForward) + 2*self.consForward
reverseLod = -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse
totalLod = -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 2*(self.consForward+self.consReverse)
strand_score = max(forwardLod-totalLod,reverseLod-totalLod)
SB = "SB="+str(strand_score)
mismatch = "MMR="+str(self.consMM)
ref_mismatch = "refMMR="+str(self.refMM)
return [SB,mismatch,ref_mismatch]
def getTotalLod(self):
return -0.3*(self.consForward+self.consReverse+self.refForward+self.refReverse) + 2*(self.consForward+self.consReverse)
def getFwdLod(self):
return -0.3*(self.consForward+self.refForward) + 2*self.consForward
def getRevLod(self):
return -0.3*(self.consReverse+self.refReverse) + 2*self.consReverse
def getAnnotations(indelList):
total_lod = 0.0
total_fwd_lod = 0.0
total_rev_lod = 0.0
avg_mmr = 0.0
avg_ref_mmr = 0.0
for indel in indelList:
total_lod += indel.getTotalLod()
total_fwd_lod += indel.getFwdLod()
total_rev_lod += indel.getRevLod()
avg_mmr += indel.consMM
avg_ref_mmr += indel.refMM
avg_mmr = avg_mmr/len(indelList)
avg_ref_mmr = avg_ref_mmr/len(indelList)
strand_score = max(total_fwd_lod-total_lod,total_rev_lod-total_lod)
SB = "SB="+str(strand_score)
mismatch = "MMR="+str(avg_mmr)
ref_mismatch = "refMMR="+str(avg_ref_mmr)
return [SB,mismatch,ref_mismatch]
def getGenotypes(samples,indels,alts):
s2i = dict()
alts = alts.split(",")
genotypes = list()
for indel in indels:
s2i[indel.sample] = indel
for sample in samples:
if ( sample in s2i.keys() ):
if ( s2i[sample].isDeletion ):
bases = "D"+str(s2i[sample].bases)
else:
bases = "I"+s2i[sample].bases
gt_index = str(1+alts.index(bases))
if ( s2i[sample].isHet ):
gtstr = "0/"+gt_index
else:
gtstr = gt_index+"/"+gt_index
if ( s2i[sample].isFiltered ):
gtstr += ":1"
else:
gtstr += ":0"
else:
gtstr = "0/0:0"
genotypes.append(gtstr)
return genotypes
def getAlts(indel_list):
alts = list()
for indel in indel_list:
if ( indel.isDeletion ):
if ( not "D"+indel.bases in alts ):
alts.append("D"+indel.bases)
else:
if ( not "I"+indel.bases in alts ):
alts.append("I"+indel.bases)
alt = ",".join(alts)
return alt
def fixAlts(alts_in):
alts = alts_in.split(",")
fixed_alts = list()
for alt in alts:
if ( alt.startswith("D") ):
fixed_alts.append("D"+str(len(alt)-1))
else:
fixed_alts.append(alt)
return ",".join(fixed_alts)
def fixChrom(chrom):
if ( chrom == "23" ):
return "X"
if ( chrom == "24" ):
return "Y"
return chrom
def writeVCFLine(out_stream,indel_list,sample_list):
alts = getAlts(indel_list)
ID = "." ## ignore dbsnp annotation for now
chrom = str(indel_list[0].chrom)
start = str(indel_list[0].start)
if ( indel_list[0].isCoding ):
info = "type=Codon;"
else:
info = "type=Intron;"
info += "count="+str(len(indel_list))
# format is global
def inSampleOrdering(indel1,indel2):
return sample_list.index(indel1.sample)-sample_list.index(indel2.sample)
indel_list.sort(inSampleOrdering)
fixed_alts = fixAlts(alts)
if ( True or not fixed_alts.find(",") > -1 ):
info += ";"+";".join(getAnnotations(indel_list))
entries = [fixChrom(chrom),start,ID,"A",fixed_alts,"50","0",info,format]
for e in getGenotypes(sample_list,indel_list,alts):
entries.append(e)
out_stream.write("\t".join(entries)+"\n")
def startCompare(ind1,ind2):
if ( ind1.chrom != ind2.chrom ):
return ind1.chrom - ind2.chrom
else:
return ind1.start - ind2.start
def output(indels,popname,samples):
outfile = open(output_vcf_name+"_BAD_REFERENCE.vcf",'w')
outfile.write("##VCFv3.2\n##Created by gatherIndelsToVCF.py\n")
outfile.write("\t".join(vcf_header)+"\t"+"\t".join(samples)+"\n")
indels_for_line = list()
for indel in indels:
if ( len(indels_for_line) == 0 or startCompare(indel,indels_for_line[len(indels_for_line)-1]) == 0 ):
indels_for_line.append(indel)
else:
writeVCFLine(outfile,indels_for_line,samples)
indels_for_line = list()
outfile.close()
def parseSample(filename):
return filename.split(".",1)[0]
def parseIndels(filePath,sampleName):
f = open(filePath)
indels = list()
for line in f.readlines():
if ( not line.startswith("[") ):
spline = line.split("\t")
if ( spline[0] != "X" and spline[0] != "Y" ):
chrom = int(spline[0])
else:
if ( spline[0] == "X"):
chrom = 23
else:
chrom = 24
start = int(spline[1])
stop = int(spline[2])
rawbase = spline[3]
isDeletion = rawbase.startswith("-")
if ( isDeletion ):
bases = spline[3].split("-")[1]
else:
bases = spline[3].split("+")[1]
sample = sampleName
isFiltered = False
if ( line.find("AUTOFILTER") > -1 ):
if ( line.find("CONS_AV_MM") > -1 ):
mm = float(line.split("AV_MM[C/R]:")[1].split("/")[0])
if ( mm >= 3.5 ):
isFiltered = True
else:
isFiltered = True
isCoding = line.find("CODING") > -1
isHet = True ## haven't seen a hom yet---todo --- fix this
# STRAND_COUNTS[C/C/R/R]:
strand_counts_field = line.split("STRAND_COUNTS[C/C/R/R]:")[1].split()[0]
strand_counts = strand_counts_field.split("/")
consF = strand_counts[0]
consR = strand_counts[1]
refF = strand_counts[2]
refR = strand_counts[3]
# AV_MM[C/R]:
mismatch_field = line.split("AV_MM[C/R]:")[1].split()[0]
consMM = mismatch_field.split("/")[0]
refMM = mismatch_field.split("/")[1]
# recall order: self,chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding
indels.append(Indel(chrom,start,stop,bases,sample,isHet,isDeletion,isFiltered,isCoding,consF,consR,refF,refR,consMM,refMM))
return indels
def writeVCF(verbose_bed_list):
for file in verbose_bed_list:
samples.append(parseSample(file))
indels.extend(parseIndels(indel_dir+file,parseSample(file)))
indels.sort(startCompare)
output(indels,popname,samples)
writeVCF(input_verbose_beds)
## now dumb hacky thing
cmd = "java -jar /humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar -T VCFReferenceFixer -B fixme,VCF,"+output_vcf_name+"_BAD_REFERENCE.vcf -o "+output_vcf_name+" -R "+reference_file+" -l INFO"
os.system(cmd)