diff --git a/python/SyzygyCallsFileToVCF.py b/python/SyzygyCallsFileToVCF.py new file mode 100755 index 000000000..f70717420 --- /dev/null +++ b/python/SyzygyCallsFileToVCF.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python +#import farm_commands +import subprocess +import os +import sys +import re +import time +import math + +dbsnp = "." # can be global. Let something else annotate dbsnp info +filter = "0" # don't declare any filtering in the VCF file + +print(sys.argv) + +raw_calls_file = open(sys.argv[1]) +output_vcf_file = open(sys.argv[2],'w') +pool_name = [] + +try: + pool_name = sys.argv[3] +except IndexError: + # parse the file name + filepath = sys.argv[0].strip().split("/") + pool_name = filepath[len(filepath)] + + +header = raw_calls_file.readline().strip() + +fields = header.split() + +print(fields) + +# parse it for important offsets + +chrompos = fields.index("chr:offset") +ref_offset = fields.index("ref_base") +for_a_index = fields.index("A") +for_c_index = fields.index("C") +for_g_index = fields.index("G") +for_t_index = fields.index("T") +for_d_index = fields.index("D") +for_i_index = fields.index("I") +for_depth_index = fields.index("sum") +rev_a_index = fields.index("AR") +rev_c_index = fields.index("CR") +rev_g_index = fields.index("GR") +rev_t_index = fields.index("TR") +rev_d_index = fields.index("DR") +rev_i_index = fields.index("IR") +rev_depth_index = fields.index("sumr") +total_depth_index = fields.index("combined_sum") +lod_score_index = fields.index("combined_lod") +call_index = fields.index("allele2") +fish_flag = fields.index("flag") +fish_pval_index = fields.index("fisher-pval") + +# now print the VCF header +head1 = "##source=Syzygy" +head2 = "##format=VCRv3.2" +fields = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",pool_name] +head3 = "#" + "\t".join(fields) + +output_vcf_file.write(head1+"\n") +output_vcf_file.write(head2+"\n") +output_vcf_file.write(head3+"\n") + +def getProportionNonref(list): + total_bases = int(list[total_depth_index]) - int(list[rev_i_index].split(":")[1]) - int(list[rev_d_index].split(":")[1]) - int(list[for_i_index].split(":")[1]) - int(list[for_d_index].split(":")[1]) + ref_base = list[ref_offset] + if ( ref_base == "A" ): + ref_bases = int(list[for_a_index].split(":")[1]) + int(list[rev_a_index].split(":")[1]) + elif ( ref_base == "C"): + ref_bases = int(list[for_c_index].split(":")[1]) + int(list[rev_c_index].split(":")[1]) + elif ( ref_base == "G"): + ref_bases = int(list[for_g_index].split(":")[1]) + int(list[rev_g_index].split(":")[1]) + else: + ref_bases = int(list[for_t_index].split(":")[1]) + int(list[rev_t_index].split(":")[1]) + + return 1.0 - ( float(ref_bases) / float(total_bases) ) + +def generateVCFLine(chrom, pos, db, ref, alt, filt, qual, INFO): + # make the info into a single string + infoString = "" + for keval in INFO: + info = "=".join(keval) + infoString = infoString+info + format = "GT:GQ" + genotype = "0/1:"+qual + all_fields = [chrom, pos, db, ref, alt, qual, filt, infoString, format, genotype] + return "\t".join(all_fields) + +# instantiate a line buffer + +two_lines_ago = []; +previous_line = []; +this_line = []; +next_line = []; +line_after_next = []; + +# read through the file + +for line in raw_calls_file.readlines(): + spline = line.strip().split() + + # iterate through the lines + + two_lines_ago = previous_line + previous_line = this_line + this_line = next_line + next_line = line_after_next + line_after_next = spline + + # window has been updated + + if ( not this_line ): + continue # window not filled yet + else: + if ( float(this_line[lod_score_index]) > 0 and this_line[call_index] != "D" and this_line[call_index] != "I"): + # potential call + chrom = this_line[chrompos].split(":")[0] + pos = this_line[chrompos].split(":")[1] + ref = this_line[ref_offset] + alt = this_line[call_index] + lod = this_line[lod_score_index] + + # standard vcf info made -- now add INFO + # syzy depth + syz_depth = this_line[total_depth_index] + # syzy strand bias + syz_sb = "" + if ( this_line[fish_flag] == "NA" ): + syz_sb = "-1" + else: + syz_sb = this_line[fish_pval_index] + # syzy neighborhood mismatch rate + next_mmr = getProportionNonref(next_line) + after_next_mmr = getProportionNonref(line_after_next) + prev_mmr = getProportionNonref(previous_line) + before_last_mmr = getProportionNonref(two_lines_ago) + syzy_nmmr = str(next_mmr+after_next_mmr+prev_mmr+before_last_mmr) + # turn these into key value pairs + INFO = [["syzy_DP",syz_depth],[";syzy_SB",syz_sb],[";syzy_NMMR",syzy_nmmr]] # semicolons are a complete hack + # get the vcf line + vcfLine = generateVCFLine(chrom, pos, dbsnp, ref, alt, filter, lod, INFO) + # print the sucker + output_vcf_file.write(vcfLine+"\n")