2009-12-05 00:07:06 +08:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
#import farm_commands
|
|
|
|
|
import subprocess
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
import re
|
|
|
|
|
import time
|
|
|
|
|
import math
|
|
|
|
|
|
|
|
|
|
dbsnp = "." # can be global. Let something else annotate dbsnp info
|
|
|
|
|
filter = "0" # don't declare any filtering in the VCF file
|
|
|
|
|
|
2009-12-05 00:17:38 +08:00
|
|
|
#print(sys.argv)
|
2009-12-05 00:07:06 +08:00
|
|
|
|
|
|
|
|
raw_calls_file = open(sys.argv[1])
|
|
|
|
|
output_vcf_file = open(sys.argv[2],'w')
|
|
|
|
|
pool_name = []
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
pool_name = sys.argv[3]
|
|
|
|
|
except IndexError:
|
|
|
|
|
# parse the file name
|
|
|
|
|
filepath = sys.argv[0].strip().split("/")
|
|
|
|
|
pool_name = filepath[len(filepath)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
header = raw_calls_file.readline().strip()
|
|
|
|
|
|
|
|
|
|
fields = header.split()
|
|
|
|
|
|
2009-12-05 00:17:38 +08:00
|
|
|
#print(fields)
|
2009-12-05 00:07:06 +08:00
|
|
|
|
|
|
|
|
# parse it for important offsets
|
|
|
|
|
|
|
|
|
|
chrompos = fields.index("chr:offset")
|
|
|
|
|
ref_offset = fields.index("ref_base")
|
|
|
|
|
for_a_index = fields.index("A")
|
|
|
|
|
for_c_index = fields.index("C")
|
|
|
|
|
for_g_index = fields.index("G")
|
|
|
|
|
for_t_index = fields.index("T")
|
|
|
|
|
for_d_index = fields.index("D")
|
|
|
|
|
for_i_index = fields.index("I")
|
|
|
|
|
for_depth_index = fields.index("sum")
|
|
|
|
|
rev_a_index = fields.index("AR")
|
|
|
|
|
rev_c_index = fields.index("CR")
|
|
|
|
|
rev_g_index = fields.index("GR")
|
|
|
|
|
rev_t_index = fields.index("TR")
|
|
|
|
|
rev_d_index = fields.index("DR")
|
|
|
|
|
rev_i_index = fields.index("IR")
|
|
|
|
|
rev_depth_index = fields.index("sumr")
|
|
|
|
|
total_depth_index = fields.index("combined_sum")
|
|
|
|
|
lod_score_index = fields.index("combined_lod")
|
|
|
|
|
call_index = fields.index("allele2")
|
|
|
|
|
fish_flag = fields.index("flag")
|
|
|
|
|
fish_pval_index = fields.index("fisher-pval")
|
|
|
|
|
|
|
|
|
|
# now print the VCF header
|
|
|
|
|
head1 = "##source=Syzygy"
|
|
|
|
|
head2 = "##format=VCRv3.2"
|
|
|
|
|
fields = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",pool_name]
|
|
|
|
|
head3 = "#" + "\t".join(fields)
|
|
|
|
|
|
|
|
|
|
output_vcf_file.write(head1+"\n")
|
|
|
|
|
output_vcf_file.write(head2+"\n")
|
|
|
|
|
output_vcf_file.write(head3+"\n")
|
|
|
|
|
|
|
|
|
|
def getProportionNonref(list):
|
|
|
|
|
total_bases = int(list[total_depth_index]) - int(list[rev_i_index].split(":")[1]) - int(list[rev_d_index].split(":")[1]) - int(list[for_i_index].split(":")[1]) - int(list[for_d_index].split(":")[1])
|
|
|
|
|
ref_base = list[ref_offset]
|
|
|
|
|
if ( ref_base == "A" ):
|
|
|
|
|
ref_bases = int(list[for_a_index].split(":")[1]) + int(list[rev_a_index].split(":")[1])
|
|
|
|
|
elif ( ref_base == "C"):
|
|
|
|
|
ref_bases = int(list[for_c_index].split(":")[1]) + int(list[rev_c_index].split(":")[1])
|
|
|
|
|
elif ( ref_base == "G"):
|
|
|
|
|
ref_bases = int(list[for_g_index].split(":")[1]) + int(list[rev_g_index].split(":")[1])
|
|
|
|
|
else:
|
|
|
|
|
ref_bases = int(list[for_t_index].split(":")[1]) + int(list[rev_t_index].split(":")[1])
|
|
|
|
|
|
2009-12-05 00:17:38 +08:00
|
|
|
return 1.0 - ( float(ref_bases+1) / float(total_bases+1) )
|
2009-12-05 00:07:06 +08:00
|
|
|
|
|
|
|
|
def generateVCFLine(chrom, pos, db, ref, alt, filt, qual, INFO):
|
|
|
|
|
# make the info into a single string
|
|
|
|
|
infoString = ""
|
|
|
|
|
for keval in INFO:
|
|
|
|
|
info = "=".join(keval)
|
|
|
|
|
infoString = infoString+info
|
|
|
|
|
format = "GT:GQ"
|
|
|
|
|
genotype = "0/1:"+qual
|
|
|
|
|
all_fields = [chrom, pos, db, ref, alt, qual, filt, infoString, format, genotype]
|
|
|
|
|
return "\t".join(all_fields)
|
|
|
|
|
|
|
|
|
|
# instantiate a line buffer
|
|
|
|
|
|
|
|
|
|
two_lines_ago = [];
|
|
|
|
|
previous_line = [];
|
|
|
|
|
this_line = [];
|
|
|
|
|
next_line = [];
|
|
|
|
|
line_after_next = [];
|
|
|
|
|
|
|
|
|
|
# read through the file
|
|
|
|
|
|
|
|
|
|
for line in raw_calls_file.readlines():
|
|
|
|
|
spline = line.strip().split()
|
|
|
|
|
|
|
|
|
|
# iterate through the lines
|
|
|
|
|
|
|
|
|
|
two_lines_ago = previous_line
|
|
|
|
|
previous_line = this_line
|
|
|
|
|
this_line = next_line
|
|
|
|
|
next_line = line_after_next
|
|
|
|
|
line_after_next = spline
|
|
|
|
|
|
|
|
|
|
# window has been updated
|
|
|
|
|
|
|
|
|
|
if ( not this_line ):
|
|
|
|
|
continue # window not filled yet
|
|
|
|
|
else:
|
|
|
|
|
if ( float(this_line[lod_score_index]) > 0 and this_line[call_index] != "D" and this_line[call_index] != "I"):
|
|
|
|
|
# potential call
|
|
|
|
|
chrom = this_line[chrompos].split(":")[0]
|
|
|
|
|
pos = this_line[chrompos].split(":")[1]
|
|
|
|
|
ref = this_line[ref_offset]
|
|
|
|
|
alt = this_line[call_index]
|
|
|
|
|
lod = this_line[lod_score_index]
|
|
|
|
|
|
|
|
|
|
# standard vcf info made -- now add INFO
|
|
|
|
|
# syzy depth
|
|
|
|
|
syz_depth = this_line[total_depth_index]
|
|
|
|
|
# syzy strand bias
|
|
|
|
|
syz_sb = ""
|
|
|
|
|
if ( this_line[fish_flag] == "NA" ):
|
|
|
|
|
syz_sb = "-1"
|
|
|
|
|
else:
|
|
|
|
|
syz_sb = this_line[fish_pval_index]
|
|
|
|
|
# syzy neighborhood mismatch rate
|
|
|
|
|
next_mmr = getProportionNonref(next_line)
|
|
|
|
|
after_next_mmr = getProportionNonref(line_after_next)
|
|
|
|
|
prev_mmr = getProportionNonref(previous_line)
|
|
|
|
|
before_last_mmr = getProportionNonref(two_lines_ago)
|
|
|
|
|
syzy_nmmr = str(next_mmr+after_next_mmr+prev_mmr+before_last_mmr)
|
|
|
|
|
# turn these into key value pairs
|
|
|
|
|
INFO = [["syzy_DP",syz_depth],[";syzy_SB",syz_sb],[";syzy_NMMR",syzy_nmmr]] # semicolons are a complete hack
|
|
|
|
|
# get the vcf line
|
|
|
|
|
vcfLine = generateVCFLine(chrom, pos, dbsnp, ref, alt, filter, lod, INFO)
|
|
|
|
|
# print the sucker
|
|
|
|
|
output_vcf_file.write(vcfLine+"\n")
|