sw_perf/utils.c

141 lines
5.4 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*********************************************************************************************
Description: Some useful functions
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2023/08/25
***********************************************************************************************/
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "utils.h"
#include "debug.h"
/*
 * Lookup table from a byte value (used as an unsigned index 0..255) to a
 * nucleotide code:
 *   'A' / 'a' -> 0
 *   'C' / 'c' -> 1
 *   'G' / 'g' -> 2
 *   'T' / 't' -> 3
 *   '-'       -> 5
 *   any other byte -> 4
 * Each row below covers 16 consecutive byte values.
 */
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, /* 32..47: '-' (45) = 5 */
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, /* 64..79: 'A' = 0, 'C' = 1, 'G' = 2 */
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 80..95: 'T' = 3 */
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, /* 96..111: 'a' = 0, 'c' = 1, 'g' = 2 */
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 112..127: 't' = 3 */
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
/*
 * Base letters in code order: indices 0..3 are 'A', 'C', 'G', 'T' and
 * index 4 is 'N', matching the codes 0..4 that nst_nt4_table produces.
 * Code 5 (which nst_nt4_table assigns to '-') has no letter here: index 5
 * is the string's terminating NUL.
 * NOTE(review): BASE is not referenced in this part of the file; presumably
 * it is used elsewhere to turn codes back into letters -- confirm.
 */
const char *BASE = "ACGTN";
/**
 * Convert a NUL-terminated string of base characters, in place, to the
 * numeric codes defined by nst_nt4_table.
 *
 * The length is measured once, before any byte is rewritten: 'A' and 'a'
 * become code 0, so after conversion the buffer may contain zero bytes and
 * can no longer be measured with strlen(). Callers must keep the length
 * themselves.
 *
 * @param str  string to convert in place; a NULL pointer is ignored.
 */
void base_char_to_int(char *str)
{
    if (str == NULL) {
        return;
    }

    /* size_t rather than int: strlen() returns size_t, and storing it in an
     * int would truncate the length of a very long string. */
    const size_t len = strlen(str);
    for (size_t i = 0; i < len; ++i) {
        /* Index through unsigned char so bytes >= 0x80 cannot become a
         * negative table index where plain char is signed. */
        str[i] = (char)nst_nt4_table[(unsigned char)str[i]];
    }
}
/**
 * Read up to row_num_to_read lines of query/target info from fp and append
 * one record per line to qti_arr.
 *
 * Every line that is read produces a record, including blank or malformed
 * lines, so that records stay aligned line-for-line with the input. Fields
 * that cannot be parsed from a line are stored as 0.
 *
 * @param qti_arr          destination array; each record is stored at index
 *                         qti_arr->n, which is then incremented.
 * @param buf              line buffer; when buf->m is 0 it is allocated here
 *                         with BUF_SIZE bytes.
 * @param row_num_to_read  maximum number of lines to read.
 * @param info_num         number of int values per line: 2 (qlen, tlen) or
 *                         3 (qlen, tlen, init_score).
 * @param fp               input stream.
 * @return number of lines read, or -1 when an argument is NULL, info_num is
 *         not 2 or 3, or the line buffer cannot be allocated.
 */
int read_qt_info(qti_v *qti_arr, buf_t *buf, int row_num_to_read, int info_num, FILE *fp)
{
    if (!qti_arr || !buf || !fp) {
        return -1;
    }
    if (info_num != 2 && info_num != 3) {
        return -1;
    }
    if (buf->m == 0) {
        buf->addr = malloc(BUF_SIZE);
        if (!buf->addr) {
            return -1; /* buf->m stays 0, so a later call tries again */
        }
        buf->m = BUF_SIZE;
    }

    int read_row_num = 0;
    while (read_row_num < row_num_to_read && fgets((char *)buf->addr, buf->m, fp) != NULL) {
        /* Blank lines are deliberately not skipped (see the header comment). */

        /* Grow based on the array's own element count, not on the number of
         * rows read in this call: the record below is written at index
         * qti_arr->n, so that is the index that must fit. */
        if (qti_arr->m <= qti_arr->n) {
            size_t old_m = qti_arr->m;
            kv_resize(qti_t, *qti_arr, old_m == 0 ? 128 : old_m * 2);
            /* Zero the new slots so each new record starts with m == 0 and
             * takes the kv_resize branch below on first use. */
            memset(&qti_arr->a[old_m], 0, (qti_arr->m - old_m) * sizeof(qti_t));
        }

        qti_t *qti = &kv_A(*qti_arr, qti_arr->n++); /* next record slot */
        if (qti->m < info_num) {
            kv_resize(int, *qti, info_num);
        }

        /* Pre-fill with 0 so any field sscanf does not reach (blank or short
         * line) holds a defined value instead of stale or uninitialized
         * memory. */
        for (int i = 0; i < info_num; ++i) {
            qti->a[i] = 0;
        }
        if (info_num == 2) {
            (void)sscanf((char *)buf->addr, "%d %d", &qti->a[0], &qti->a[1]);
        } else {
            (void)sscanf((char *)buf->addr, "%d %d %d", &qti->a[0], &qti->a[1], &qti->a[2]);
        }

        ++read_row_num;
    }
    return read_row_num;
}
/**
 * Read up to row_num_to_read lines of query or target sequence from fp and
 * append one encoded sequence per line to seq_arr.
 *
 * The trailing '\n' of each line is removed, the characters are converted to
 * numeric codes with base_char_to_int(), and the codes are copied into the
 * next slot of seq_arr. Empty lines are not skipped; they produce a sequence
 * of length 0.
 *
 * NOTE(review): a line longer than buf->m - 1 bytes is returned by fgets()
 * in several pieces, and each piece is stored here as a separate sequence.
 *
 * @param seq_arr          destination array; each sequence is stored at
 *                         index seq_arr->n, which is then incremented.
 * @param buf              line buffer; when buf->m is 0 it is allocated here
 *                         with BUF_SIZE bytes. Its contents are overwritten
 *                         with codes during conversion.
 * @param row_num_to_read  maximum number of lines to read.
 * @param fp               input stream.
 * @return number of lines read, or -1 when an argument is NULL or the line
 *         buffer cannot be allocated.
 */
int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp)
{
    if (!seq_arr || !buf || !fp) {
        return -1;
    }
    if (buf->m == 0) {
        buf->addr = malloc(BUF_SIZE);
        if (!buf->addr) {
            return -1; /* buf->m stays 0, so a later call tries again */
        }
        buf->m = BUF_SIZE;
    }

    int read_row_num = 0;
    while (read_row_num < row_num_to_read && fgets((char *)buf->addr, buf->m, fp) != NULL) {
        size_t base_size = strlen((char *)buf->addr);

        /* Trim the '\n'. The length check matters: a line that begins with
         * a NUL byte gives strlen() == 0, and indexing [base_size - 1] would
         * then read before the start of the buffer. */
        if (base_size > 0 && (char)buf->addr[base_size - 1] == '\n') {
            --base_size;
            buf->addr[base_size] = (uint8_t)'\0';
        }

        /* Grow based on the array's own element count, not on the number of
         * rows read in this call: the sequence below is written at index
         * seq_arr->n, so that is the index that must fit. */
        if (seq_arr->m <= seq_arr->n) {
            size_t old_m = seq_arr->m;
            kv_resize(seq_t, *seq_arr, old_m == 0 ? 128 : old_m * 2);
            /* Zero the new slots so each new sequence starts with m == 0. */
            memset(&seq_arr->a[old_m], 0, (seq_arr->m - old_m) * sizeof(seq_t));
        }

        seq_t *seq = &kv_A(*seq_arr, seq_arr->n++); /* next sequence slot */
        if (seq->m < base_size) {
            kv_resize(uint8_t, *seq, base_size + 16);
        }

        base_char_to_int((char *)buf->addr);
        /* Skip the copy for an empty line: seq->a may still be NULL then,
         * and memcpy() must not be given a NULL pointer even for 0 bytes. */
        if (base_size > 0) {
            memcpy(seq->a, buf->addr, base_size);
        }
        seq->n = base_size;

        ++read_row_num;
    }
    return read_row_num;
}
/**
 * Debug helper: write one query/target pair and its info line to the debug
 * output streams selected by fnum (gfq[fnum], gft[fnum], gfi[fnum]).
 *
 * The function body is compiled only when DEBUG_FILE_OUTPUT is defined;
 * otherwise the call does nothing.
 *
 * Output, one line per stream:
 *   - query stream:  the query as base letters
 *   - target stream: the target as base letters
 *   - info stream:   qlen and tlen, followed by h0 only when h0 > 0, each
 *                    left-aligned in an 8-character column
 *
 * @param qlen    number of codes in query.
 * @param query   query sequence as numeric base codes.
 * @param tlen    number of codes in target.
 * @param target  target sequence as numeric base codes.
 * @param h0      prefix score; written only when greater than 0.
 * @param fnum    index into the gfq / gft / gfi stream arrays.
 */
void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_FILE_OUTPUT
    /* Three streams: query and target hold one sequence per line; the info
     * stream holds the lengths qlen, tlen and the prefix score h0. */
    FILE *query_f = gfq[fnum];
    FILE *target_f = gft[fnum];
    FILE *info_f = gfi[fnum];

    /* Letters for codes 0..5. Code 5 is what nst_nt4_table assigns to '-',
     * so it needs an entry; a five-entry table would be read out of bounds
     * for it. Any larger code is written as 'N'. */
    static const char seq_map[] = {'A', 'C', 'G', 'T', 'N', '-'};
    const size_t map_len = sizeof seq_map / sizeof seq_map[0];
    int i;

    /* query */
    for (i = 0; i < qlen; ++i) {
        fputc(query[i] < map_len ? seq_map[query[i]] : 'N', query_f);
    }
    fputc('\n', query_f);

    /* target */
    for (i = 0; i < tlen; ++i) {
        fputc(target[i] < map_len ? seq_map[target[i]] : 'N', target_f);
    }
    fputc('\n', target_f);

    /* remaining info */
    if (h0 > 0) {
        fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
    } else {
        fprintf(info_f, "%-8d%-8d\n", qlen, tlen);
    }
#else
    /* Debug output disabled: mark the parameters as intentionally unused. */
    (void)qlen;
    (void)query;
    (void)tlen;
    (void)target;
    (void)h0;
    (void)fnum;
#endif
}