2023-08-26 00:38:38 +08:00
|
|
|
|
/*********************************************************************************************
|
|
|
|
|
|
Description: Some useful functions
|
|
|
|
|
|
|
|
|
|
|
|
Copyright : All rights reserved by NCIC.ICT
|
|
|
|
|
|
|
|
|
|
|
|
Author : Zhang Zhonghai
|
|
|
|
|
|
Date : 2023/08/25
|
|
|
|
|
|
***********************************************************************************************/
|
|
|
|
|
|
#include <string.h>
|
|
|
|
|
|
#include <stdint.h>
|
2023-08-27 01:01:57 +08:00
|
|
|
|
#include <stdio.h>
|
2024-04-11 13:29:28 +08:00
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
|
|
|
|
#include "utils.h"
|
2025-09-09 16:07:45 +08:00
|
|
|
|
#include "debug.h"
|
2023-08-26 00:38:38 +08:00
|
|
|
|
|
|
|
|
|
|
/*
 * Byte -> nucleotide code lookup table.
 *
 *   'A' / 'a' -> 0      'C' / 'c' -> 1
 *   'G' / 'g' -> 2      'T' / 't' -> 3
 *   '-'       -> 5      every other byte -> 4
 *
 * Each row below covers 16 consecutive byte values; the leading comment is
 * the index of the first entry in that row.
 */
unsigned char nst_nt4_table[256] = {
    /* 0x00 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /* '-' */, 4, 4,
    /* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x40 */ 4, 0 /* A */, 4, 1 /* C */,  4, 4, 4, 2 /* G */,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x50 */ 4, 4, 4, 4,  3 /* T */, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x60 */ 4, 0 /* a */, 4, 1 /* c */,  4, 4, 4, 2 /* g */,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x70 */ 4, 4, 4, 4,  3 /* t */, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xA0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xB0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xC0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xD0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xE0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xF0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
|
|
|
|
|
|
|
2024-04-11 13:29:28 +08:00
|
|
|
|
/* Base letters in the order of the codes 0..3 that nst_nt4_table assigns to A, C, G, T; index 4 holds 'N'. */
const char *BASE = "ACGTN";
|
2023-08-26 00:38:38 +08:00
|
|
|
|
|
|
|
|
|
|
// 将碱基字符转成2位编码
|
2024-04-11 13:29:28 +08:00
|
|
|
|
void base_char_to_int(char *str)
|
2023-08-26 00:38:38 +08:00
|
|
|
|
{
|
|
|
|
|
|
int i;
|
2023-08-27 01:01:57 +08:00
|
|
|
|
const int slen = strlen(str);
|
|
|
|
|
|
for (i = 0; i < slen; ++i)
|
|
|
|
|
|
{
|
2023-08-26 00:38:38 +08:00
|
|
|
|
str[i] = nst_nt4_table[(uint8_t)str[i]];
|
2023-08-27 01:01:57 +08:00
|
|
|
|
}
|
2024-04-11 13:29:28 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* read row_num_to_read lines of query-target info */
|
|
|
|
|
|
/**
|
|
|
|
|
|
* info_num: numbers of int value in one line of info file (2: qlen, tlen or 3: qlen, tlen, init_score)
|
|
|
|
|
|
*/
|
|
|
|
|
|
int read_qt_info(qti_v *qti_arr, buf_t *buf, int row_num_to_read, int info_num, FILE *fp)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (!qti_arr || !buf || !fp) return -1;
|
|
|
|
|
|
if (info_num !=2 && info_num != 3) return -1;
|
|
|
|
|
|
if (buf->m == 0) { buf->m = BUF_SIZE; buf->addr = malloc(buf->m); }
|
|
|
|
|
|
|
|
|
|
|
|
int read_row_num = 0;
|
|
|
|
|
|
while (row_num_to_read > 0 && fgets((char *)buf->addr, buf->m, fp) != NULL)
|
|
|
|
|
|
{
|
|
|
|
|
|
// if (strlen((char *)buf->addr) == 1) continue; // skip null line
|
|
|
|
|
|
if (qti_arr->m <= read_row_num) { // expend the seq array
|
|
|
|
|
|
int old_m = qti_arr->m;
|
|
|
|
|
|
kv_resize(qti_t, *qti_arr, old_m == 0 ? 128 : old_m * 2);
|
|
|
|
|
|
memset(&qti_arr->a[old_m], 0, (qti_arr->m - old_m) * sizeof(qti_t)); // set the newly allocated mem to zero
|
|
|
|
|
|
}
|
|
|
|
|
|
qti_t *qti = &kv_A(*qti_arr, qti_arr->n++); // get a new query-target info
|
|
|
|
|
|
if (qti->m < info_num) { kv_resize(int, *qti, info_num); }
|
|
|
|
|
|
if (info_num == 2) sscanf((char *)buf->addr, "%d %d", &qti->a[0], &qti->a[1]);
|
|
|
|
|
|
else sscanf((char *)buf->addr, "%d %d %d", &qti->a[0], &qti->a[1], &qti->a[2]);
|
|
|
|
|
|
// test output
|
|
|
|
|
|
// int i; for (i = 0; i < info_num; ++i) { fprintf(stderr, "%d\t", kv_A(*qti, i)); } fprintf(stderr, "\n");
|
|
|
|
|
|
--row_num_to_read;
|
|
|
|
|
|
++read_row_num;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return read_row_num;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* read row_num_to_read lines of query or target sequence */
|
|
|
|
|
|
int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (!seq_arr || !buf || !fp) return -1;
|
|
|
|
|
|
if (buf->m == 0) { buf->m = BUF_SIZE; buf->addr = malloc(buf->m); }
|
|
|
|
|
|
|
|
|
|
|
|
int read_row_num = 0;
|
|
|
|
|
|
while (row_num_to_read > 0 && fgets((char*)buf->addr, buf->m, fp) != NULL)
|
|
|
|
|
|
{
|
|
|
|
|
|
int base_size = strlen((char *)buf->addr);
|
|
|
|
|
|
// trim the '\n' char
|
|
|
|
|
|
if ((char)buf->addr[base_size - 1] == '\n') {
|
|
|
|
|
|
buf->addr[base_size - 1] = (uint8_t)'\0';
|
|
|
|
|
|
--base_size;
|
|
|
|
|
|
}
|
|
|
|
|
|
// if (base_size == 0) continue; // skip null line
|
|
|
|
|
|
// fprintf(stderr, "%d %s\n", base_size, (char *) buf->addr);
|
|
|
|
|
|
// test output
|
|
|
|
|
|
// fprintf(stderr, "%s\n", (char *)buf->addr);
|
|
|
|
|
|
if (seq_arr->m <= read_row_num) { // expend the seq array
|
|
|
|
|
|
int old_m = seq_arr->m;
|
|
|
|
|
|
kv_resize(seq_t, *seq_arr, old_m == 0 ? 128 : old_m * 2);
|
|
|
|
|
|
memset(&seq_arr->a[old_m], 0, (seq_arr->m - old_m) * sizeof(seq_t)); // set the newly allocated mem to zero
|
|
|
|
|
|
}
|
|
|
|
|
|
seq_t *seq = &kv_A(*seq_arr, seq_arr->n++); // get a new query-target sequence
|
|
|
|
|
|
if (seq->m < base_size) { kv_resize(uint8_t, *seq, base_size + 16); }
|
|
|
|
|
|
base_char_to_int((char*)buf->addr);
|
|
|
|
|
|
memcpy(seq->a, buf->addr, base_size);
|
|
|
|
|
|
seq->n = base_size;
|
|
|
|
|
|
// test output
|
|
|
|
|
|
// int i; for (i = 0; i < seq->n; ++i) fprintf(stderr, "%d", kv_A(*seq, i)); fprintf(stderr, "\n");
|
|
|
|
|
|
|
|
|
|
|
|
--row_num_to_read;
|
|
|
|
|
|
++read_row_num;
|
|
|
|
|
|
}
|
|
|
|
|
|
return read_row_num;
|
2025-09-09 16:07:45 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Debug helper: append one query/target pair to the dump files selected by fnum.
 *
 * Does nothing unless DEBUG_FILE_OUTPUT is defined. When it is, three files
 * are written:
 *   - gfq[fnum]: the query, one sequence per line;
 *   - gft[fnum]: the target, one sequence per line;
 *   - gfi[fnum]: "qlen tlen h0" when h0 > 0, otherwise "qlen tlen", each value
 *     left-aligned in an 8-character field.
 * Sequence codes 0..4 are written as "ACGTN" and 5 as '-' (the code that
 * nst_nt4_table assigns to '-'); any other value is written as 'N' instead of
 * indexing past the end of the letter table.
 *
 * @param qlen    number of codes in query
 * @param query   query codes
 * @param tlen    number of codes in target
 * @param target  target codes
 * @param h0      prefix score; only written when positive
 * @param fnum    index into gfq / gft / gfi
 */
void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_FILE_OUTPUT
    static const char seq_map[] = "ACGTN-";
    const size_t n_codes = sizeof seq_map - 1; // exclude the terminating NUL

    FILE *query_f = gfq[fnum];
    FILE *target_f = gft[fnum];
    FILE *info_f = gfi[fnum];

    // Query sequence
    for (int i = 0; i < qlen; ++i) {
        const uint8_t code = query[i];
        fputc(code < n_codes ? seq_map[code] : 'N', query_f);
    }
    fputc('\n', query_f);

    // Target sequence
    for (int i = 0; i < tlen; ++i) {
        const uint8_t code = target[i];
        fputc(code < n_codes ? seq_map[code] : 'N', target_f);
    }
    fputc('\n', target_f);

    // Lengths and, when present, the prefix score
    if (h0 > 0) {
        fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
    } else {
        fprintf(info_f, "%-8d%-8d\n", qlen, tlen);
    }
#else
    // Keep the signature warning-free when debug output is compiled out.
    (void)qlen;
    (void)query;
    (void)tlen;
    (void)target;
    (void)h0;
    (void)fnum;
#endif
}
|