#!/usr/bin/env python #import farm_commands import subprocess import os import sys import re import time import math dbsnp = "." # can be global. Let something else annotate dbsnp info filter = "0" # don't declare any filtering in the VCF file #print(sys.argv) raw_calls_file = open(sys.argv[1]) output_vcf_file = open(sys.argv[2],'w') pool_name = [] try: pool_name = sys.argv[3] except IndexError: # parse the file name filepath = sys.argv[0].strip().split("/") pool_name = filepath[len(filepath)] header = raw_calls_file.readline().strip() fields = header.split() #print(fields) # parse it for important offsets try: chrompos = fields.index("chr:offset") ref_offset = fields.index("ref_base") for_a_index = fields.index("A") for_c_index = fields.index("C") for_g_index = fields.index("G") for_t_index = fields.index("T") for_d_index = fields.index("D") for_i_index = fields.index("I") for_depth_index = fields.index("sum") rev_a_index = fields.index("AR") rev_c_index = fields.index("CR") rev_g_index = fields.index("GR") rev_t_index = fields.index("TR") rev_d_index = fields.index("DR") rev_i_index = fields.index("IR") rev_depth_index = fields.index("sumr") total_depth_index = fields.index("combined_sum") lod_score_index = fields.index("combined_lod") call_index = fields.index("allele2") fish_flag = fields.index("flag") fish_pval_index = fields.index("fisher-pval") except ValueError: print("ERROR: no index for a header field in call file. Initial line was") print(header) # now print the VCF header head1 = "##source=Syzygy" head2 = "##format=VCRv3.2" fields = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",pool_name] head3 = "#" + "\t".join(fields) output_vcf_file.write(head1+"\n") output_vcf_file.write(head2+"\n") output_vcf_file.write(head3+"\n") def getProportionNonrefThatAreSNP(list): ref_base = list[ref_offset] a_bases = int(list[for_a_index].split(":")[1]) + int(list[rev_a_index].split(":")[1]) c_bases = int(list[for_c_index].split(":")[1]) + int(list[rev_c_index].split(":")[1]) g_bases = int(list[for_g_index].split(":")[1]) + int(list[rev_g_index].split(":")[1]) t_bases = int(list[for_t_index].split(":")[1]) + int(list[rev_t_index].split(":")[1]) if ( ref_base == "A" ): snp_base_counts = max(c_bases,g_bases,t_bases) nonref_counts = c_bases+g_bases+t_bases elif ( ref_base == "C"): snp_base_counts = max(a_bases,g_bases,t_bases) nonref_counts = a_bases+g_bases+t_bases elif ( ref_base == "G"): snp_base_counts = max(a_bases,c_bases,t_bases) nonref_counts = a_bases+c_bases+t_bases else: snp_base_counts = max(a_bases,c_bases,g_bases) nonref_counts = a_bases+c_bases+g_bases return ( float(snp_base_counts + 1)/float(nonref_counts+1) ) def getProportionNonref(list): total_bases = int(list[total_depth_index]) - int(list[rev_i_index].split(":")[1]) - int(list[rev_d_index].split(":")[1]) - int(list[for_i_index].split(":")[1]) - int(list[for_d_index].split(":")[1]) ref_base = list[ref_offset] if ( ref_base == "A" ): ref_bases = int(list[for_a_index].split(":")[1]) + int(list[rev_a_index].split(":")[1]) elif ( ref_base == "C"): ref_bases = int(list[for_c_index].split(":")[1]) + int(list[rev_c_index].split(":")[1]) elif ( ref_base == "G"): ref_bases = int(list[for_g_index].split(":")[1]) + int(list[rev_g_index].split(":")[1]) else: ref_bases = int(list[for_t_index].split(":")[1]) + int(list[rev_t_index].split(":")[1]) return 1.0 - ( float(ref_bases+1) / float(total_bases+1) ) def generateVCFLine(chrom, pos, db, ref, alt, filt, qual, INFO): # make the info into a single string infoString = "" for keval in INFO: info = "=".join(keval) infoString = infoString+info format = "GT:GQ" genotype = "0/1:"+qual all_fields = [chrom, pos, db, ref, alt, qual, filt, infoString, format, genotype] return "\t".join(all_fields) # instantiate a line buffer two_lines_ago = []; previous_line = []; this_line = []; next_line = []; line_after_next = []; # read through the file for line in raw_calls_file.readlines(): spline = line.strip().split() # iterate through the lines two_lines_ago = previous_line previous_line = this_line this_line = next_line next_line = line_after_next line_after_next = spline # window has been updated if ( not two_lines_ago ): continue # window not filled yet else: try: is_potential_call = float(this_line[lod_score_index]) > 1 and this_line[call_index] != "D" and this_line[call_index] != "I" except IndexError as err: print(this_line) print(lod_score_index) raise IndexError(str(err)) except ValueError as err: print(this_line) print(lod_score_index) raise(ValueError(str(err))) if ( is_potential_call): # potential call chrom = this_line[chrompos].split(":")[0] pos = this_line[chrompos].split(":")[1] ref = this_line[ref_offset] alt = this_line[call_index] lod = this_line[lod_score_index] # standard vcf info made -- now add INFO # syzy depth syz_depth = this_line[total_depth_index] # syzy strand bias syz_sb = "" if ( this_line[fish_flag] == "NA" ): syz_sb = "-1" else: syz_sb = this_line[fish_pval_index] # do we want any other kind of annotations here ?? # syzy neighborhood mismatch rate pileup_noise = str(1.0-getProportionNonrefThatAreSNP(this_line)) next_mmr = getProportionNonref(next_line) # from later in genome after_next_mmr = getProportionNonref(line_after_next) # from same prev_mmr = getProportionNonref(previous_line) # from earlier before_last_mmr = getProportionNonref(two_lines_ago) # same syzy_nmmr = str(next_mmr+after_next_mmr+prev_mmr+before_last_mmr) # turn these into key value pairs INFO = [["syzy_DP",syz_depth],[";syzy_SB",syz_sb],[";syzy_NMMR",syzy_nmmr],[";syzy_residMismatch",pileup_noise],[";syzy_originalLOD",lod]] # semicolons are a complete hack # get the vcf line vcfLine = generateVCFLine(chrom, pos, dbsnp, ref, alt, filter, str(float(lod)*10.0), INFO) # print the sucker output_vcf_file.write(vcfLine+"\n")