Minor changes (additional info calculated)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2522 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fdd14e1a01
commit
dfe160ff77
|
|
@ -31,28 +31,31 @@ fields = header.split()
|
||||||
#print(fields)
|
#print(fields)
|
||||||
|
|
||||||
# parse it for important offsets
|
# parse it for important offsets
|
||||||
|
try:
|
||||||
chrompos = fields.index("chr:offset")
|
chrompos = fields.index("chr:offset")
|
||||||
ref_offset = fields.index("ref_base")
|
ref_offset = fields.index("ref_base")
|
||||||
for_a_index = fields.index("A")
|
for_a_index = fields.index("A")
|
||||||
for_c_index = fields.index("C")
|
for_c_index = fields.index("C")
|
||||||
for_g_index = fields.index("G")
|
for_g_index = fields.index("G")
|
||||||
for_t_index = fields.index("T")
|
for_t_index = fields.index("T")
|
||||||
for_d_index = fields.index("D")
|
for_d_index = fields.index("D")
|
||||||
for_i_index = fields.index("I")
|
for_i_index = fields.index("I")
|
||||||
for_depth_index = fields.index("sum")
|
for_depth_index = fields.index("sum")
|
||||||
rev_a_index = fields.index("AR")
|
rev_a_index = fields.index("AR")
|
||||||
rev_c_index = fields.index("CR")
|
rev_c_index = fields.index("CR")
|
||||||
rev_g_index = fields.index("GR")
|
rev_g_index = fields.index("GR")
|
||||||
rev_t_index = fields.index("TR")
|
rev_t_index = fields.index("TR")
|
||||||
rev_d_index = fields.index("DR")
|
rev_d_index = fields.index("DR")
|
||||||
rev_i_index = fields.index("IR")
|
rev_i_index = fields.index("IR")
|
||||||
rev_depth_index = fields.index("sumr")
|
rev_depth_index = fields.index("sumr")
|
||||||
total_depth_index = fields.index("combined_sum")
|
total_depth_index = fields.index("combined_sum")
|
||||||
lod_score_index = fields.index("combined_lod")
|
lod_score_index = fields.index("combined_lod")
|
||||||
call_index = fields.index("allele2")
|
call_index = fields.index("allele2")
|
||||||
fish_flag = fields.index("flag")
|
fish_flag = fields.index("flag")
|
||||||
fish_pval_index = fields.index("fisher-pval")
|
fish_pval_index = fields.index("fisher-pval")
|
||||||
|
except ValueError:
|
||||||
|
print("ERROR: no index for a header field in call file. Initial line was")
|
||||||
|
print(header)
|
||||||
|
|
||||||
# now print the VCF header
|
# now print the VCF header
|
||||||
head1 = "##source=Syzygy"
|
head1 = "##source=Syzygy"
|
||||||
|
|
@ -64,6 +67,27 @@ output_vcf_file.write(head1+"\n")
|
||||||
output_vcf_file.write(head2+"\n")
|
output_vcf_file.write(head2+"\n")
|
||||||
output_vcf_file.write(head3+"\n")
|
output_vcf_file.write(head3+"\n")
|
||||||
|
|
||||||
|
def getProportionNonrefThatAreSNP(list):
|
||||||
|
ref_base = list[ref_offset]
|
||||||
|
a_bases = int(list[for_a_index].split(":")[1]) + int(list[rev_a_index].split(":")[1])
|
||||||
|
c_bases = int(list[for_c_index].split(":")[1]) + int(list[rev_c_index].split(":")[1])
|
||||||
|
g_bases = int(list[for_g_index].split(":")[1]) + int(list[rev_g_index].split(":")[1])
|
||||||
|
t_bases = int(list[for_t_index].split(":")[1]) + int(list[rev_t_index].split(":")[1])
|
||||||
|
if ( ref_base == "A" ):
|
||||||
|
snp_base_counts = max(c_bases,g_bases,t_bases)
|
||||||
|
nonref_counts = c_bases+g_bases+t_bases
|
||||||
|
elif ( ref_base == "C"):
|
||||||
|
snp_base_counts = max(a_bases,g_bases,t_bases)
|
||||||
|
nonref_counts = a_bases+g_bases+t_bases
|
||||||
|
elif ( ref_base == "G"):
|
||||||
|
snp_base_counts = max(a_bases,c_bases,t_bases)
|
||||||
|
nonref_counts = a_bases+c_bases+t_bases
|
||||||
|
else:
|
||||||
|
snp_base_counts = max(a_bases,c_bases,g_bases)
|
||||||
|
nonref_counts = a_bases+c_bases+g_bases
|
||||||
|
|
||||||
|
return ( float(snp_base_counts + 1)/float(nonref_counts+1) )
|
||||||
|
|
||||||
def getProportionNonref(list):
|
def getProportionNonref(list):
|
||||||
total_bases = int(list[total_depth_index]) - int(list[rev_i_index].split(":")[1]) - int(list[rev_d_index].split(":")[1]) - int(list[for_i_index].split(":")[1]) - int(list[for_d_index].split(":")[1])
|
total_bases = int(list[total_depth_index]) - int(list[rev_i_index].split(":")[1]) - int(list[rev_d_index].split(":")[1]) - int(list[for_i_index].split(":")[1]) - int(list[for_d_index].split(":")[1])
|
||||||
ref_base = list[ref_offset]
|
ref_base = list[ref_offset]
|
||||||
|
|
@ -115,7 +139,17 @@ for line in raw_calls_file.readlines():
|
||||||
if ( not two_lines_ago ):
|
if ( not two_lines_ago ):
|
||||||
continue # window not filled yet
|
continue # window not filled yet
|
||||||
else:
|
else:
|
||||||
if ( float(this_line[lod_score_index]) > 1 and this_line[call_index] != "D" and this_line[call_index] != "I"):
|
try:
|
||||||
|
is_potential_call = float(this_line[lod_score_index]) > 1 and this_line[call_index] != "D" and this_line[call_index] != "I"
|
||||||
|
except IndexError as err:
|
||||||
|
print(this_line)
|
||||||
|
print(lod_score_index)
|
||||||
|
raise IndexError(str(err))
|
||||||
|
except ValueError as err:
|
||||||
|
print(this_line)
|
||||||
|
print(lod_score_index)
|
||||||
|
raise(ValueError(str(err)))
|
||||||
|
if ( is_potential_call):
|
||||||
# potential call
|
# potential call
|
||||||
chrom = this_line[chrompos].split(":")[0]
|
chrom = this_line[chrompos].split(":")[0]
|
||||||
pos = this_line[chrompos].split(":")[1]
|
pos = this_line[chrompos].split(":")[1]
|
||||||
|
|
@ -136,14 +170,15 @@ for line in raw_calls_file.readlines():
|
||||||
# do we want any other kind of annotations here ??
|
# do we want any other kind of annotations here ??
|
||||||
|
|
||||||
# syzy neighborhood mismatch rate
|
# syzy neighborhood mismatch rate
|
||||||
|
pileup_noise = str(1.0-getProportionNonrefThatAreSNP(this_line))
|
||||||
next_mmr = getProportionNonref(next_line) # from later in genome
|
next_mmr = getProportionNonref(next_line) # from later in genome
|
||||||
after_next_mmr = getProportionNonref(line_after_next) # from same
|
after_next_mmr = getProportionNonref(line_after_next) # from same
|
||||||
prev_mmr = getProportionNonref(previous_line) # from earlier
|
prev_mmr = getProportionNonref(previous_line) # from earlier
|
||||||
before_last_mmr = getProportionNonref(two_lines_ago) # same
|
before_last_mmr = getProportionNonref(two_lines_ago) # same
|
||||||
syzy_nmmr = str(next_mmr+after_next_mmr+prev_mmr+before_last_mmr)
|
syzy_nmmr = str(next_mmr+after_next_mmr+prev_mmr+before_last_mmr)
|
||||||
# turn these into key value pairs
|
# turn these into key value pairs
|
||||||
INFO = [["syzy_DP",syz_depth],[";syzy_SB",syz_sb],[";syzy_NMMR",syzy_nmmr]] # semicolons are a complete hack
|
INFO = [["syzy_DP",syz_depth],[";syzy_SB",syz_sb],[";syzy_NMMR",syzy_nmmr],[";syzy_residMismatch",pileup_noise],[";syzy_originalLOD",lod]] # semicolons are a complete hack
|
||||||
# get the vcf line
|
# get the vcf line
|
||||||
vcfLine = generateVCFLine(chrom, pos, dbsnp, ref, alt, filter, lod, INFO)
|
vcfLine = generateVCFLine(chrom, pos, dbsnp, ref, alt, filter, str(float(lod)*10.0), INFO)
|
||||||
# print the sucker
|
# print the sucker
|
||||||
output_vcf_file.write(vcfLine+"\n")
|
output_vcf_file.write(vcfLine+"\n")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue