/********************************************************************************************* Description: Some useful functions Copyright : All right reserved by NCIC.ICT Author : Zhang Zhonghai Date : 2023/08/25 ***********************************************************************************************/ #include #include #include #include #include "utils.h" #include "debug.h" unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; const char *BASE = "ACGTN"; // 将碱基字符转成2位编码 void base_char_to_int(char *str) { int i; const int slen = strlen(str); for (i = 0; i < slen; ++i) { str[i] = nst_nt4_table[(uint8_t)str[i]]; } } /* read row_num_to_read lines of query-target info */ /** * info_num: numbers of int value in one line of info file (2: qlen, tlen or 3: qlen, tlen, init_score) */ int read_qt_info(qti_v *qti_arr, buf_t *buf, int row_num_to_read, int info_num, FILE *fp) { if (!qti_arr || !buf || !fp) return -1; if (info_num !=2 && info_num != 3) return -1; if (buf->m == 0) { buf->m = BUF_SIZE; buf->addr = malloc(buf->m); } int read_row_num = 0; while (row_num_to_read > 0 && fgets((char *)buf->addr, buf->m, fp) != NULL) { // if (strlen((char *)buf->addr) == 1) continue; // skip null line if (qti_arr->m <= read_row_num) { // expend the seq array int old_m = qti_arr->m; kv_resize(qti_t, *qti_arr, old_m == 0 ? 128 : old_m * 2); memset(&qti_arr->a[old_m], 0, (qti_arr->m - old_m) * sizeof(qti_t)); // set the newly allocated mem to zero } qti_t *qti = &kv_A(*qti_arr, qti_arr->n++); // get a new query-target info if (qti->m < info_num) { kv_resize(int, *qti, info_num); } if (info_num == 2) sscanf((char *)buf->addr, "%d %d", &qti->a[0], &qti->a[1]); else sscanf((char *)buf->addr, "%d %d %d", &qti->a[0], &qti->a[1], &qti->a[2]); // test output // int i; for (i = 0; i < info_num; ++i) { fprintf(stderr, "%d\t", kv_A(*qti, i)); } fprintf(stderr, "\n"); --row_num_to_read; ++read_row_num; } return read_row_num; } /* read row_num_to_read lines of query or target sequence */ int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp) { if (!seq_arr || !buf || !fp) return -1; if (buf->m == 0) { buf->m = BUF_SIZE; buf->addr = malloc(buf->m); } int read_row_num = 0; while (row_num_to_read > 0 && fgets((char*)buf->addr, buf->m, fp) != NULL) { int base_size = strlen((char *)buf->addr); // trim the '\n' char if ((char)buf->addr[base_size - 1] == '\n') { buf->addr[base_size - 1] = (uint8_t)'\0'; --base_size; } // if (base_size == 0) continue; // skip null line // fprintf(stderr, "%d %s\n", base_size, (char *) buf->addr); // test output // fprintf(stderr, "%s\n", (char *)buf->addr); if (seq_arr->m <= read_row_num) { // expend the seq array int old_m = seq_arr->m; kv_resize(seq_t, *seq_arr, old_m == 0 ? 128 : old_m * 2); memset(&seq_arr->a[old_m], 0, (seq_arr->m - old_m) * sizeof(seq_t)); // set the newly allocated mem to zero } seq_t *seq = &kv_A(*seq_arr, seq_arr->n++); // get a new query-target sequence if (seq->m < base_size) { kv_resize(uint8_t, *seq, base_size + 16); } base_char_to_int((char*)buf->addr); memcpy(seq->a, buf->addr, base_size); seq->n = base_size; // test output // int i; for (i = 0; i < seq->n; ++i) fprintf(stderr, "%d", kv_A(*seq, i)); fprintf(stderr, "\n"); --row_num_to_read; ++read_row_num; } return read_row_num; } void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum) { #ifdef DEBUG_FILE_OUTPUT // 写到三个文件里,query.fa,target.fa,每行一个序列,info.txt,包含前缀得分h0,和长度信息qlen,tlen FILE *query_f = gfq[fnum], *target_f = gft[fnum], *info_f = gfi[fnum]; const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'}; int i; // 处理query for (i = 0; i < qlen; ++i) fprintf(query_f, "%c", seq_map[query[i]]); fprintf(query_f, "\n"); // 处理target for (i = 0; i < tlen; ++i) fprintf(target_f, "%c", seq_map[target[i]]); fprintf(target_f, "\n"); // 处理其他信息 if (h0 > 0) fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0); else fprintf(info_f, "%-8d%-8d\n", qlen, tlen); #endif }