A brute-force script to convert Syzygy lod-score calls files into a proper VCF -- with some useful annotations.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2261 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1e8dcc30da
commit
0c2d6d7e41
|
|
@ -0,0 +1,146 @@
|
|||
#!/usr/bin/env python
|
||||
#import farm_commands
|
||||
import subprocess
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import time
|
||||
import math
|
||||
|
||||
dbsnp = "." # can be global. Let something else annotate dbsnp info
|
||||
filter = "0" # don't declare any filtering in the VCF file
|
||||
|
||||
print(sys.argv)
|
||||
|
||||
raw_calls_file = open(sys.argv[1])
|
||||
output_vcf_file = open(sys.argv[2],'w')
|
||||
pool_name = []
|
||||
|
||||
try:
|
||||
pool_name = sys.argv[3]
|
||||
except IndexError:
|
||||
# parse the file name
|
||||
filepath = sys.argv[0].strip().split("/")
|
||||
pool_name = filepath[len(filepath)]
|
||||
|
||||
|
||||
header = raw_calls_file.readline().strip()
|
||||
|
||||
fields = header.split()
|
||||
|
||||
print(fields)
|
||||
|
||||
# parse it for important offsets
|
||||
|
||||
chrompos = fields.index("chr:offset")
|
||||
ref_offset = fields.index("ref_base")
|
||||
for_a_index = fields.index("A")
|
||||
for_c_index = fields.index("C")
|
||||
for_g_index = fields.index("G")
|
||||
for_t_index = fields.index("T")
|
||||
for_d_index = fields.index("D")
|
||||
for_i_index = fields.index("I")
|
||||
for_depth_index = fields.index("sum")
|
||||
rev_a_index = fields.index("AR")
|
||||
rev_c_index = fields.index("CR")
|
||||
rev_g_index = fields.index("GR")
|
||||
rev_t_index = fields.index("TR")
|
||||
rev_d_index = fields.index("DR")
|
||||
rev_i_index = fields.index("IR")
|
||||
rev_depth_index = fields.index("sumr")
|
||||
total_depth_index = fields.index("combined_sum")
|
||||
lod_score_index = fields.index("combined_lod")
|
||||
call_index = fields.index("allele2")
|
||||
fish_flag = fields.index("flag")
|
||||
fish_pval_index = fields.index("fisher-pval")
|
||||
|
||||
# now print the VCF header
|
||||
head1 = "##source=Syzygy"
|
||||
head2 = "##format=VCRv3.2"
|
||||
fields = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",pool_name]
|
||||
head3 = "#" + "\t".join(fields)
|
||||
|
||||
output_vcf_file.write(head1+"\n")
|
||||
output_vcf_file.write(head2+"\n")
|
||||
output_vcf_file.write(head3+"\n")
|
||||
|
||||
def getProportionNonref(list):
|
||||
total_bases = int(list[total_depth_index]) - int(list[rev_i_index].split(":")[1]) - int(list[rev_d_index].split(":")[1]) - int(list[for_i_index].split(":")[1]) - int(list[for_d_index].split(":")[1])
|
||||
ref_base = list[ref_offset]
|
||||
if ( ref_base == "A" ):
|
||||
ref_bases = int(list[for_a_index].split(":")[1]) + int(list[rev_a_index].split(":")[1])
|
||||
elif ( ref_base == "C"):
|
||||
ref_bases = int(list[for_c_index].split(":")[1]) + int(list[rev_c_index].split(":")[1])
|
||||
elif ( ref_base == "G"):
|
||||
ref_bases = int(list[for_g_index].split(":")[1]) + int(list[rev_g_index].split(":")[1])
|
||||
else:
|
||||
ref_bases = int(list[for_t_index].split(":")[1]) + int(list[rev_t_index].split(":")[1])
|
||||
|
||||
return 1.0 - ( float(ref_bases) / float(total_bases) )
|
||||
|
||||
def generateVCFLine(chrom, pos, db, ref, alt, filt, qual, INFO):
|
||||
# make the info into a single string
|
||||
infoString = ""
|
||||
for keval in INFO:
|
||||
info = "=".join(keval)
|
||||
infoString = infoString+info
|
||||
format = "GT:GQ"
|
||||
genotype = "0/1:"+qual
|
||||
all_fields = [chrom, pos, db, ref, alt, qual, filt, infoString, format, genotype]
|
||||
return "\t".join(all_fields)
|
||||
|
||||
# instantiate a line buffer
|
||||
|
||||
two_lines_ago = [];
|
||||
previous_line = [];
|
||||
this_line = [];
|
||||
next_line = [];
|
||||
line_after_next = [];
|
||||
|
||||
# read through the file
|
||||
|
||||
for line in raw_calls_file.readlines():
|
||||
spline = line.strip().split()
|
||||
|
||||
# iterate through the lines
|
||||
|
||||
two_lines_ago = previous_line
|
||||
previous_line = this_line
|
||||
this_line = next_line
|
||||
next_line = line_after_next
|
||||
line_after_next = spline
|
||||
|
||||
# window has been updated
|
||||
|
||||
if ( not this_line ):
|
||||
continue # window not filled yet
|
||||
else:
|
||||
if ( float(this_line[lod_score_index]) > 0 and this_line[call_index] != "D" and this_line[call_index] != "I"):
|
||||
# potential call
|
||||
chrom = this_line[chrompos].split(":")[0]
|
||||
pos = this_line[chrompos].split(":")[1]
|
||||
ref = this_line[ref_offset]
|
||||
alt = this_line[call_index]
|
||||
lod = this_line[lod_score_index]
|
||||
|
||||
# standard vcf info made -- now add INFO
|
||||
# syzy depth
|
||||
syz_depth = this_line[total_depth_index]
|
||||
# syzy strand bias
|
||||
syz_sb = ""
|
||||
if ( this_line[fish_flag] == "NA" ):
|
||||
syz_sb = "-1"
|
||||
else:
|
||||
syz_sb = this_line[fish_pval_index]
|
||||
# syzy neighborhood mismatch rate
|
||||
next_mmr = getProportionNonref(next_line)
|
||||
after_next_mmr = getProportionNonref(line_after_next)
|
||||
prev_mmr = getProportionNonref(previous_line)
|
||||
before_last_mmr = getProportionNonref(two_lines_ago)
|
||||
syzy_nmmr = str(next_mmr+after_next_mmr+prev_mmr+before_last_mmr)
|
||||
# turn these into key value pairs
|
||||
INFO = [["syzy_DP",syz_depth],[";syzy_SB",syz_sb],[";syzy_NMMR",syzy_nmmr]] # semicolons are a complete hack
|
||||
# get the vcf line
|
||||
vcfLine = generateVCFLine(chrom, pos, dbsnp, ref, alt, filter, lod, INFO)
|
||||
# print the sucker
|
||||
output_vcf_file.write(vcfLine+"\n")
|
||||
Loading…
Reference in New Issue