#!/usr/bin/env python

import sys, FlatFileTable, os

if sys.argv < 3:
    print "Usage: AnnotateVCFwithMAF.py VCF_file MAF_file"
    sys.exit()
    
vcf_filename = sys.argv[1]
maf_filename = sys.argv[2]

maf_gen = FlatFileTable.record_generator(maf_filename, "\t")

headers=["gene","type","transcript","strand","genomechange","cDNAchange","codonchange","proteinchange"]

loci_and_info = []

for record in maf_gen:
    #print record
    #info_string = ",".join(["%s=%s" % (header, record[header]) for header in headers])
    info_string = ""
    for index,header in enumerate(headers):
        if record.has_key(header):
            if index > 0:
                info_string += ";"
            info_string += "%s=%s" % (header, record[header])

    locus = record["chr"]+":"+record["start"]
            
    #print locus, info_string
    loci_and_info.append((locus, info_string))
    
#vcf_gen = FlatFileTable.record_generator(vcf_file, "\t", 34)
vcf_file = open(vcf_filename)
vcf_out_file = open(os.path.splitext(os.path.basename(vcf_filename))[0]+".maf_annotated.vcf", "w")
vcf_format_line = vcf_file.readline()
vcf_out_file.write(vcf_format_line)
if vcf_format_line != "##fileformat=VCFv3.3\n" and vcf_format_line != "##fileformat=VCFv4.0":
    print ("VCF not v 3.3 or v4.0")
    sys.exit()

header = vcf_file.readline()
while header != "" and header.startswith("#"):
    if header.startswith("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"):
        break
    vcf_out_file.write(header)
    header = vcf_file.readline()

header_fields = header
if not header_fields.startswith("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"):
    print ("VCF header fields not in expected order")
    print header_fields
    sys.exit()

vcf_out_file.write("##source=AnnotateVCFwithMAF\n")
for header_field in headers:
    vcf_out_file.write("##INFO="+header_field+",1,String,"+header_field+"\n")
vcf_out_file.write(header_fields)

def addFormat(infoString):
    # takes MAF info string and reformats values for usefulness and parseablity
    newItems = list()
    for item in infoString.split(";"):
        keyval = item.split("=")
        key = keyval[0]
        val = keyval[1]
        if key == "codonchange" :
            # has the form c.(232-234)CAC>AAC
            # want to strip to just the change
            codon_change = val.split(")")[1]
            numbers = val.split(".")[1].split(")")[0]+")"
            newItems.append("codonchange="+codon_change+";codonoffset="+numbers)
        if key == "proteinchange" :
            # has the form p.H78N
            # want to move to H>N
            first = val.split(".")[1][0]
            last = val[len(val)-1]
            num = val.split(".")[1][1:len(val.split(".")[1])-1]
            newItems.append("proteinchange="+first+">"+last+";proteinoffset="+num)
        if key == "type" :
            newItems.append(item)

    return ";".join(newItems)

for vcf_line, locus_and_info in zip(vcf_file.readlines(), loci_and_info):
    vcf_line_fields = vcf_line.split("\t")
    vcf_locus = vcf_line_fields[0]+":"+vcf_line_fields[1]
    #print record
    maf_locus, maf_info = locus_and_info
    if maf_locus != vcf_locus:
        print "ERROR: VCF and MAF loci did not match"
        sys.exit()
    
    vcf_line_fields[7] = vcf_line_fields[7]+";"+addFormat(maf_info)
    new_vcf_line = "\t".join(vcf_line_fields)
    vcf_out_file.write(new_vcf_line)