sw_perf/utils.c

141 lines
5.4 KiB
C
Raw Normal View History

2023-08-26 00:38:38 +08:00
/*********************************************************************************************
Description: Some useful functions
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2023/08/25
***********************************************************************************************/
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "utils.h"
2025-09-18 15:09:48 +08:00
#include "debug.h"
2023-08-26 00:38:38 +08:00
/*
 * Lookup table indexed by an 8-bit character code, giving a small integer
 * code for each base character:
 *   'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3,
 *   '-'     -> 5,
 *   every other byte value -> 4.
 * Each source row below covers 16 consecutive character codes.
 */
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
/* The letters "ACGTN" in order; presumably the inverse of codes 0..4 of
 * nst_nt4_table (index = code) -- confirm against the users of BASE. */
const char *BASE = "ACGTN";
2023-08-26 00:38:38 +08:00
// 将碱基字符转成2位编码
void base_char_to_int(char *str)
2023-08-26 00:38:38 +08:00
{
int i;
const int slen = strlen(str);
for (i = 0; i < slen; ++i)
{
2023-08-26 00:38:38 +08:00
str[i] = nst_nt4_table[(uint8_t)str[i]];
}
}
/**
 * Read up to row_num_to_read lines of query-target info from fp.
 *
 * Each line is parsed as info_num decimal ints and stored in the entry at
 * index qti_arr->n, which is then incremented. Entries are reused across
 * calls (their int storage is only grown, never freed here).
 *
 * @param qti_arr          destination vector; grown on demand, new slots zeroed.
 * @param buf              line buffer; BUF_SIZE bytes are allocated when buf->m == 0.
 * @param row_num_to_read  maximum number of lines to consume.
 * @param info_num         ints per line: 2 (qlen, tlen) or 3 (qlen, tlen, init_score).
 * @param fp               input stream.
 * @return number of lines consumed, or -1 on invalid arguments or when the
 *         line buffer cannot be allocated.
 *
 * Blank lines are not skipped; they produce an entry whose fields are all 0.
 * The per-entry count (qti->n) is not updated; read the values through
 * a[0 .. info_num-1].
 * NOTE(review): a line longer than buf->m - 1 bytes is delivered by fgets in
 * several pieces, each of which is counted as a separate row.
 */
int read_qt_info(qti_v *qti_arr, buf_t *buf, int row_num_to_read, int info_num, FILE *fp)
{
    if (!qti_arr || !buf || !fp) return -1;
    if (info_num != 2 && info_num != 3) return -1;
    if (buf->m == 0) {
        /* Publish the buffer only once malloc succeeded, so a failure can
         * never leave m != 0 paired with a NULL addr. */
        void *mem = malloc(BUF_SIZE);
        if (mem == NULL) return -1;
        buf->addr = mem;
        buf->m = BUF_SIZE;
    }

    int read_row_num = 0;
    while (row_num_to_read > 0 && fgets((char *)buf->addr, buf->m, fp) != NULL)
    {
        /* Capacity must be checked against the index actually written (n),
         * not against the rows read in this call: the two differ whenever
         * the caller passes a vector with n != 0. */
        if (qti_arr->m <= qti_arr->n) {
            int old_m = qti_arr->m;
            kv_resize(qti_t, *qti_arr, old_m == 0 ? 128 : old_m * 2);
            /* Zero the new slots so their a/n/m start out empty. */
            memset(&qti_arr->a[old_m], 0, (qti_arr->m - old_m) * sizeof(qti_t));
        }

        qti_t *qti = &kv_A(*qti_arr, qti_arr->n++); /* next query-target info slot */
        if (qti->m < info_num) { kv_resize(int, *qti, info_num); }

        int parsed;
        if (info_num == 2)
            parsed = sscanf((char *)buf->addr, "%d %d", &qti->a[0], &qti->a[1]);
        else
            parsed = sscanf((char *)buf->addr, "%d %d %d", &qti->a[0], &qti->a[1], &qti->a[2]);

        /* sscanf leaves unmatched fields untouched, and slots are reused, so
         * a short or blank line would otherwise inherit values from an
         * earlier batch. This also covers the two-column lines that
         * write_query_target_sequence emits when h0 <= 0. */
        for (int i = parsed < 0 ? 0 : parsed; i < info_num; ++i)
            qti->a[i] = 0;

        --row_num_to_read;
        ++read_row_num;
    }
    return read_row_num;
}
/**
 * Read up to row_num_to_read lines of query or target sequence from fp.
 *
 * Each line has its terminator removed, is converted in place with
 * base_char_to_int(), and is copied into the entry at index seq_arr->n,
 * which is then incremented. Entries are reused across calls (their byte
 * storage is only grown, never freed here).
 *
 * @param seq_arr          destination vector; grown on demand, new slots zeroed.
 * @param buf              line buffer; BUF_SIZE bytes are allocated when buf->m == 0.
 * @param row_num_to_read  maximum number of lines to consume.
 * @param fp               input stream.
 * @return number of lines consumed, or -1 on invalid arguments or when the
 *         line buffer cannot be allocated.
 *
 * Blank lines are not skipped; they produce an entry with n == 0.
 * NOTE(review): a line longer than buf->m - 1 bytes is delivered by fgets in
 * several pieces, each of which is stored as a separate sequence.
 */
int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp)
{
    if (!seq_arr || !buf || !fp) return -1;
    if (buf->m == 0) {
        /* Publish the buffer only once malloc succeeded, so a failure can
         * never leave m != 0 paired with a NULL addr. */
        void *mem = malloc(BUF_SIZE);
        if (mem == NULL) return -1;
        buf->addr = mem;
        buf->m = BUF_SIZE;
    }

    int read_row_num = 0;
    while (row_num_to_read > 0 && fgets((char *)buf->addr, buf->m, fp) != NULL)
    {
        int base_size = strlen((char *)buf->addr);

        /* Strip the line terminator ("\n" or "\r\n"); a stray '\r' would
         * otherwise be encoded as an extra base. The base_size > 0 tests
         * also keep us from indexing addr[-1] when the line begins with a
         * NUL byte. */
        if (base_size > 0 && (char)buf->addr[base_size - 1] == '\n')
            buf->addr[--base_size] = 0;
        if (base_size > 0 && (char)buf->addr[base_size - 1] == '\r')
            buf->addr[--base_size] = 0;

        /* Capacity must be checked against the index actually written (n),
         * not against the rows read in this call: the two differ whenever
         * the caller passes a vector with n != 0. */
        if (seq_arr->m <= seq_arr->n) {
            int old_m = seq_arr->m;
            kv_resize(seq_t, *seq_arr, old_m == 0 ? 128 : old_m * 2);
            /* Zero the new slots so their a/n/m start out empty. */
            memset(&seq_arr->a[old_m], 0, (seq_arr->m - old_m) * sizeof(seq_t));
        }

        seq_t *seq = &kv_A(*seq_arr, seq_arr->n++); /* next sequence slot */
        if (seq->m < base_size) { kv_resize(uint8_t, *seq, base_size + 16); }

        base_char_to_int((char *)buf->addr);
        /* An empty line may leave seq->a NULL; memcpy must not be handed a
         * NULL pointer even for a zero-length copy. */
        if (base_size > 0)
            memcpy(seq->a, buf->addr, base_size);
        seq->n = base_size;

        --row_num_to_read;
        ++read_row_num;
    }
    return read_row_num;
}
/**
 * Debug helper: dump one query/target pair to the files selected by fnum.
 *
 * Does nothing unless DEBUG_FILE_OUTPUT is defined. When enabled it writes
 *   - the query, one sequence per line, to gfq[fnum];
 *   - the target, one sequence per line, to gft[fnum];
 *   - the lengths qlen and tlen, plus h0 when h0 > 0, to gfi[fnum].
 *
 * @param qlen    number of codes in query.
 * @param query   query sequence as numeric base codes (0..4 = A,C,G,T,N).
 * @param tlen    number of codes in target.
 * @param target  target sequence as numeric base codes.
 * @param h0      prefix score; omitted from the info line when not positive.
 * @param fnum    index into the gfq/gft/gfi file arrays.
 */
void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_FILE_OUTPUT
    FILE *query_f = gfq[fnum],
         *target_f = gft[fnum],
         *info_f = gfi[fnum];
    static const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'};
    enum { CODE_N = 4 }; /* index of 'N' in seq_map */
    int i;

    /* Codes above 4 (nst_nt4_table yields 5 for '-') would index past
     * seq_map, so they are written as 'N'. */
    for (i = 0; i < qlen; ++i)
        fputc(seq_map[query[i] <= CODE_N ? query[i] : CODE_N], query_f);
    fputc('\n', query_f);

    for (i = 0; i < tlen; ++i)
        fputc(seq_map[target[i] <= CODE_N ? target[i] : CODE_N], target_f);
    fputc('\n', target_f);

    /* Info line: lengths, plus the prefix score only when it is positive. */
    if (h0 > 0)
        fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
    else
        fprintf(info_f, "%-8d%-8d\n", qlen, tlen);
#else
    /* Non-debug build: nothing to do; silence unused-parameter warnings. */
    (void)qlen; (void)query; (void)tlen; (void)target; (void)h0; (void)fnum;
#endif
}