2010-02-06 03:25:46 +08:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
|
|
import sys, FlatFileTable, os
|
|
|
|
|
|
|
|
|
|
# Both positional arguments are required. sys.argv[0] is the script name, so
# a complete command line has at least three entries. (Comparing the list
# itself to 3 never triggers the usage message.)
if len(sys.argv) < 3:
    print("Usage: AnnotateVCFwithMAF.py VCF_file MAF_file")
    # Non-zero status lets calling shells and pipelines detect the misuse.
    sys.exit(1)

vcf_filename = sys.argv[1]
maf_filename = sys.argv[2]
|
|
|
|
|
|
|
|
|
|
# Stream the MAF file as tab-separated records.
# NOTE(review): assumes record_generator yields dict-like rows keyed by column
# name (the code indexes rows by "chr", "start" and the names below) --
# confirm against FlatFileTable.
maf_gen = FlatFileTable.record_generator(maf_filename, "\t")

# MAF columns that are carried over into the annotation string, in order.
headers = ["gene", "type", "transcript", "strand", "genomechange", "cDNAchange", "codonchange", "proteinchange"]

# One (locus, info_string) pair per MAF record, in file order.
#   locus       -- "<chr>:<start>"
#   info_string -- "key=value" items joined with ";"
loci_and_info = []

for record in maf_gen:
    # Joining only the columns that are present keeps the string free of a
    # leading ";" when the first column is missing from a record.
    info_items = ["%s=%s" % (header, record[header])
                  for header in headers
                  if header in record]
    info_string = ";".join(info_items)

    locus = record["chr"] + ":" + record["start"]
    loci_and_info.append((locus, info_string))
|
|
|
|
|
|
|
|
|
|
# The annotated copy is written to the current working directory, named after
# the input file with its extension replaced by ".maf_annotated.vcf".
vcf_file = open(vcf_filename)
vcf_out_file = open(os.path.splitext(os.path.basename(vcf_filename))[0]+".maf_annotated.vcf", "w")

# The first line is expected to declare the VCF version; it is copied through
# unchanged.
vcf_format_line = vcf_file.readline()
vcf_out_file.write(vcf_format_line)

# Compare without the line terminator so that both supported versions are
# accepted whether the line ends in "\n", "\r\n" or nothing at all.
if vcf_format_line.rstrip("\r\n") not in ("##fileformat=VCFv3.3", "##fileformat=VCFv4.0"):
    print ("VCF not v 3.3 or v4.0")
    # Non-zero status so callers can tell the file was rejected.
    sys.exit(1)
|
|
|
|
|
|
2010-02-07 11:38:30 +08:00
|
|
|
# Leading columns every supported VCF column-header line must start with.
column_header_prefix = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"

# Copy "#" header lines through to the output until the column-header line is
# reached. The scan also stops at the first non-"#" line or at end of file
# (readline() returns "", which does not start with "#").
header = vcf_file.readline()
while header.startswith("#") and not header.startswith(column_header_prefix):
    vcf_out_file.write(header)
    header = vcf_file.readline()

# The line the scan stopped on is held back (not yet written) and must be the
# column-header line.
header_fields = header
if not header_fields.startswith(column_header_prefix):
    print ("VCF header fields not in expected order")
    print(header_fields)
    # Non-zero status so callers can tell the file was rejected.
    sys.exit(1)
|
|
|
|
|
|
2010-02-07 11:38:30 +08:00
|
|
|
# Record this tool as a source of the output file.
vcf_out_file.write("##source=AnnotateVCFwithMAF\n")

# Declare one single-valued String INFO key per MAF column. The declaration
# syntax depends on the version named on the file's first line:
#   VCF 4.0: ##INFO=<ID=id,Number=n,Type=type,Description="text">
#   VCF 3.3: ##INFO=id,n,type,text   (kept exactly as previously emitted)
is_vcf4 = vcf_format_line.startswith("##fileformat=VCFv4")
for header_field in headers:
    if is_vcf4:
        vcf_out_file.write('##INFO=<ID=%s,Number=1,Type=String,Description="%s">\n'
                           % (header_field, header_field))
    else:
        vcf_out_file.write("##INFO="+header_field+",1,String,"+header_field+"\n")

# The column-header line goes last, directly above the records.
vcf_out_file.write(header_fields)
|
|
|
|
|
|
2010-07-14 00:31:14 +08:00
|
|
|
def addFormat(infoString):
    """Reformat the codon and protein change entries of a MAF info string.

    infoString is a ";"-separated list of "key=value" items. Only the
    "codonchange" and "proteinchange" items contribute to the result; every
    other key is left out of the returned string.
    NOTE(review): dropping the other keys matches the existing behaviour --
    confirm that it is intended.

      codonchange=c.(232-234)CAC>AAC -> codonchange=CAC>AAC;codonoffset=(232-234)
      proteinchange=p.H78N           -> proteinchange=H>N;proteinoffset=78

    Items without "=" (such as the empty item produced by an empty string or
    a stray ";") and items with an empty value are skipped.

    Returns the reformatted items joined with ";", or "" if there are none.
    Raises IndexError if a non-empty codonchange/proteinchange value does not
    have the shape shown above.
    """
    newItems = []
    for item in infoString.split(";"):
        # partition() splits on the first "=" only, so a value that itself
        # contains "=" is kept whole.
        key, separator, val = item.partition("=")
        if not separator or not val:
            continue

        if key == "codonchange":
            # "c.(232-234)CAC>AAC": the change follows ")", the offset is the
            # parenthesised range after "c.".
            codon_change = val.split(")")[1]
            numbers = val.split(".")[1].split(")")[0] + ")"
            newItems.append("codonchange=" + codon_change + ";codonoffset=" + numbers)
        elif key == "proteinchange":
            # "p.H78N": first and last characters are the residues, the
            # characters between them are the position.
            residues = val.split(".")[1]
            first = residues[0]
            last = val[-1]
            num = residues[1:-1]
            newItems.append("proteinchange=" + first + ">" + last + ";proteinoffset=" + num)

    return ";".join(newItems)
|
|
|
|
|
|
2010-02-06 03:25:46 +08:00
|
|
|
# Annotate every remaining VCF record with the MAF record at the same
# position in loci_and_info, then close both files.
try:
    # Blank lines (e.g. a trailing empty line) carry no record, so they are
    # dropped before records are paired up.
    vcf_lines = [line for line in vcf_file if line.strip()]

    # zip() would silently stop at the shorter input and lose records, so a
    # count mismatch is reported instead.
    if len(vcf_lines) != len(loci_and_info):
        print("ERROR: VCF has %d records but MAF has %d"
              % (len(vcf_lines), len(loci_and_info)))
        sys.exit(1)

    for vcf_line, locus_and_info in zip(vcf_lines, loci_and_info):
        # Split off the line terminator so it is never part of a field, and
        # put the same terminator back when the line is written.
        line_body = vcf_line.rstrip("\r\n")
        line_ending = vcf_line[len(line_body):]

        vcf_line_fields = line_body.split("\t")
        vcf_locus = vcf_line_fields[0] + ":" + vcf_line_fields[1]

        maf_locus, maf_info = locus_and_info
        if maf_locus != vcf_locus:
            print("ERROR: VCF and MAF loci did not match")
            sys.exit(1)

        # Column 8 (index 7) is INFO. Only touch it when there is something
        # to add, and replace a missing-value "." instead of appending to it.
        maf_annotation = addFormat(maf_info)
        if maf_annotation:
            if vcf_line_fields[7] in ("", "."):
                vcf_line_fields[7] = maf_annotation
            else:
                vcf_line_fields[7] = vcf_line_fields[7] + ";" + maf_annotation

        vcf_out_file.write("\t".join(vcf_line_fields) + line_ending)
finally:
    # Runs on normal completion and on sys.exit(), so the output is always
    # flushed and both handles are released.
    vcf_file.close()
    vcf_out_file.close()
|