439 lines
14 KiB
C
439 lines
14 KiB
C
/* The MIT License
|
||
|
||
Copyright (c) 2025- Zhonghai Zhang (ICT)
|
||
|
||
Permission is hereby granted, free of charge, to any person obtaining
|
||
a copy of this software and associated documentation files (the
|
||
"Software"), to deal in the Software without restriction, including
|
||
without limitation the rights to use, copy, modify, merge, publish,
|
||
distribute, sublicense, and/or sell copies of the Software, and to
|
||
permit persons to whom the Software is furnished to do so, subject to
|
||
the following conditions:
|
||
|
||
The above copyright notice and this permission notice shall be
|
||
included in all copies or substantial portions of the Software.
|
||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||
SOFTWARE.
|
||
*/
|
||
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
#include <time.h>
|
||
#include <unistd.h>
|
||
#include <zlib.h>
|
||
|
||
#include "bntseq.h"
|
||
#include "bwa.h"
|
||
#include "bwt.h"
|
||
#include "rle.h"
|
||
#include "rope.h"
|
||
#include "utils.h"
|
||
#include "kvec.h"
|
||
#include "hyb_idx.h"
|
||
#include "share_mem.h"
|
||
|
||
|
||
#ifdef _DIVBWT
|
||
#include "divsufsort.h"
|
||
#endif
|
||
|
||
#ifdef USE_MALLOC_WRAPPERS
|
||
#include "malloc_wrap.h"
|
||
#endif
|
||
|
||
int bwa_bwt2fullbytesa(int argc, char* argv[]) // the "bwt2bytesa" command
|
||
{
|
||
bwt_t* bwt;
|
||
if (optind + 2 > argc) {
|
||
fprintf(stderr, "Usage: %s bwt2bytesa <in.bwt> <out.bytesa>\n", PROG_NAME);
|
||
return 1;
|
||
}
|
||
bwt = bwt_restore_bwt(argv[optind]);
|
||
bwt_cal_byte_sa(bwt);
|
||
_store_data_to_file(argv[optind + 1], bwt->byte_sa, sizeof(bwtint_t), SA_BYTES(bwt->n_sa) >> 3);
|
||
return 0;
|
||
}
|
||
|
||
// extract suffix array from non-sampled suffix array
|
||
int bwa_extract_sa(int argc, char* argv[]) {
|
||
bwt_t* bwt;
|
||
int c, sa_intv = 32;
|
||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||
switch (c) {
|
||
case 'i':
|
||
sa_intv = atoi(optarg);
|
||
break;
|
||
default:
|
||
return 1;
|
||
}
|
||
}
|
||
if (optind + 3 > argc) {
|
||
fprintf(stderr, "Usage: bwa extractsa [-i %d] <in.bwt> <in.full_bytesa> <out.sa>\n", sa_intv);
|
||
return 1;
|
||
}
|
||
bwt = bwt_restore_bwt(argv[optind]);
|
||
|
||
fprintf(stderr, "bwt seq_len %ld, intv: %d\n", bwt->seq_len, sa_intv);
|
||
//exit(0);
|
||
|
||
_load_file_to_data(argv[optind + 1], bwt->byte_sa);
|
||
|
||
if (bwt->sa) free(bwt->sa);
|
||
bwt->sa_intv = sa_intv;
|
||
bwt->n_sa = (bwt->seq_len + sa_intv) / sa_intv;
|
||
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
|
||
uint64_t i;
|
||
for (i = 0; i < bwt->seq_len; ++i) {
|
||
if (i % sa_intv == 0)
|
||
bwt->sa[i / sa_intv] = bwt_get_sa(bwt->byte_sa, i);
|
||
}
|
||
if (i % sa_intv == 0)
|
||
bwt->sa[i / sa_intv] = bwt_get_sa(bwt->byte_sa, i);
|
||
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
|
||
// bwt_cal_sa(bwt, sa_intv);
|
||
bwt_dump_sa(argv[optind + 2], bwt);
|
||
// bwt_destroy(bwt);
|
||
return 0;
|
||
}
|
||
|
||
// extract byte suffix array from non-sampled suffix array
|
||
int bwa_extract_byte_sa(int argc, char* argv[]) {
|
||
bwt_t* bwt;
|
||
int c, sa_intv = 32;
|
||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||
switch (c) {
|
||
case 'i':
|
||
sa_intv = atoi(optarg);
|
||
break;
|
||
default:
|
||
return 1;
|
||
}
|
||
}
|
||
if (optind + 3 > argc) {
|
||
fprintf(stderr, "Usage: bwa extractbytesa [-i %d] <in.bwt> <in.full_bytesa> <out.bytesa>\n\n", sa_intv);
|
||
return 1;
|
||
}
|
||
bwt = bwt_restore_bwt(argv[optind]);
|
||
|
||
// fprintf(stderr, "bwt seq_len %ld, intv: %d\n", bwt->seq_len, sa_intv);
|
||
// exit(0);
|
||
|
||
uint8_t *full_byte_sa = NULL;
|
||
_load_file_to_data(argv[optind + 1], full_byte_sa);
|
||
|
||
bwt->sa_intv = sa_intv;
|
||
bwt->n_sa = (bwt->seq_len + sa_intv) / sa_intv;
|
||
bwt->byte_sa = (uint8_t*)calloc(SA_BYTES(bwt->n_sa), 1);
|
||
uint64_t i;
|
||
for (i = 0; i < bwt->seq_len; ++i) {
|
||
if (i % sa_intv == 0)
|
||
bwt_set_sa(bwt->byte_sa, i / sa_intv, bwt_get_sa(full_byte_sa, i));
|
||
}
|
||
if (i % sa_intv == 0)
|
||
bwt_set_sa(bwt->byte_sa, i / sa_intv, bwt_get_sa(full_byte_sa, i));
|
||
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
|
||
bwt_dump_byte_sa(argv[optind + 2], bwt);
|
||
// bwt_destroy(bwt);
|
||
return 0;
|
||
}
|
||
|
||
// 生成所有KMER_LEN长度的序列,字节表示(0A,1C,2G,3T)
|
||
void gen_kmer_bits(uint8_t** seq_arr, uint64_t* kmer_arr_size, int kmer_len) {
|
||
uint64_t i;
|
||
uint64_t seq_up_val = (1L << (kmer_len << 1));
|
||
*kmer_arr_size = seq_up_val;
|
||
*seq_arr = (uint8_t*)realloc(*seq_arr, seq_up_val * (uint64_t)kmer_len);
|
||
for (i = 0; i < seq_up_val; ++i) {
|
||
const uint64_t base_idx = i * kmer_len;
|
||
for (int j = kmer_len - 1; j >= 0; --j) {
|
||
(*seq_arr)[base_idx + kmer_len - 1 - j] = (i >> (j << 1)) & 3;
|
||
}
|
||
}
|
||
}
|
||
|
||
void bwt_create_kmer_idx(bwt_t* bwt) {
|
||
uint64_t kmer_arr_size = 0;
|
||
uint8_t* seq_arr = 0;
|
||
bwtintv_t ik, ok[4];
|
||
uint64_t j;
|
||
int i, c1;
|
||
int qlen = 10;
|
||
KmerHash* kh = &bwt->kmer_hash;
|
||
|
||
kh->ke10 = (KmerEntryArr*)malloc((1 << (10 << 1)) * sizeof(KmerEntryArr));
|
||
kh->ke11 = (KmerEntry*)malloc((1 << (11 << 1)) * sizeof(KmerEntry));
|
||
kh->ke12 = (KmerEntry*)malloc((1 << (12 << 1)) * sizeof(KmerEntry));
|
||
kh->ke13 = (KmerEntry*)malloc((1 << (13 << 1)) * sizeof(KmerEntry));
|
||
kh->ke14 = (KmerEntry*)malloc((1 << (14 << 1)) * sizeof(KmerEntry));
|
||
|
||
// 长度为10的kmer
|
||
gen_kmer_bits(&seq_arr, &kmer_arr_size, 10);
|
||
for (j = 0; j < kmer_arr_size; ++j) {
|
||
uint8_t* q = &seq_arr[j * 10];
|
||
uint8_t* mem_addr = kh->ke10[j].intv_arr;
|
||
bwt_set_intv(bwt, q[0], ik);
|
||
kmer_setval_at(mem_addr, ik, 0);
|
||
// 每次扩展一个碱基
|
||
for (i = 1; i < qlen; ++i) {
|
||
c1 = 3 - q[i];
|
||
bwt_extend(bwt, &ik, ok, 0);
|
||
ik = ok[c1];
|
||
kmer_setval_at(mem_addr, ik, i);
|
||
}
|
||
}
|
||
fprintf(stderr, "Kmer idx for length 10 done.\n");
|
||
#define GEN_KMER_FOR_LEN(len) \
|
||
do { \
|
||
gen_kmer_bits(&seq_arr, &kmer_arr_size, (len)); \
|
||
for (j = 0; j < kmer_arr_size; ++j) { \
|
||
bwtintv_t p = bwt_forward_search(bwt, &seq_arr[j * (len)], (len)); \
|
||
kmer_setval_at(bwt->kmer_hash.ke##len[j].intv_arr, p, 0); \
|
||
} \
|
||
fprintf(stderr, "Kmer idx for length %d done.\n", len); \
|
||
} while (0)
|
||
|
||
GEN_KMER_FOR_LEN(11); // 长度11的kmer
|
||
GEN_KMER_FOR_LEN(12);
|
||
GEN_KMER_FOR_LEN(13);
|
||
GEN_KMER_FOR_LEN(14);
|
||
free(seq_arr);
|
||
}
|
||
|
||
// 从bwt中创建kmer index,并保存到文件
|
||
int bwa_bwt2kmer(int argc, char* argv[]) {
|
||
bwt_t* bwt;
|
||
if (optind + 2 > argc) {
|
||
fprintf(stderr, "Usage: bwa extractbytesa <in.bwt> <out.kmer>\n\n");
|
||
return 1;
|
||
}
|
||
bwt = bwt_restore_bwt(argv[optind]);
|
||
bwt_create_kmer_idx(bwt);
|
||
bwt_dump_kmer_idx(argv[optind + 1], &bwt->kmer_hash);
|
||
return 0;
|
||
}
|
||
|
||
// 将原始的pac转换一下,从低到高存储
|
||
void convert_to_hyb_pac(uint8_t* old_pac, uint64_t l_pac, const char* new_pac_fn) {
|
||
#define _gp(l) ((old_pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
|
||
const uint64_t kPacByteNum = l_pac / 4 + 1;
|
||
uint8_t* pac = (uint8_t*)calloc(l_pac, 1);
|
||
FILE* pacFp = fopen(new_pac_fn, "wb");
|
||
uint8_t byte_bases = 0;
|
||
uint64_t i = 0;
|
||
uint8_t* p1;
|
||
for (; i + 3 < l_pac; i += 4) {
|
||
p1 = pac + (i >> 2);
|
||
byte_bases = _gp(i) | (_gp(i + 1) << 2) | (_gp(i + 2) << 4) | (_gp(i + 3) << 6);
|
||
*p1 = byte_bases;
|
||
}
|
||
byte_bases = 0;
|
||
p1 = pac + (i >> 2);
|
||
for (uint32_t j = 0; i < l_pac; ++i, ++j) {
|
||
byte_bases |= _gp(i) << j * 2;
|
||
}
|
||
*p1 = byte_bases;
|
||
|
||
fwrite(pac, 1, kPacByteNum, pacFp);
|
||
uint8_t ct = 0;
|
||
if (l_pac % 4 == 0) {
|
||
ct = 0;
|
||
err_fwrite(&ct, 1, 1, pacFp);
|
||
}
|
||
ct = l_pac % 4;
|
||
err_fwrite(&ct, 1, 1, pacFp);
|
||
fclose(pacFp);
|
||
}
|
||
|
||
// 将原pac文件转为hyb需要的格式(翻转byte)
|
||
int bwa_pac2hybpac(int argc, char* argv[]) {
|
||
if (optind + 1 > argc) {
|
||
fprintf(stderr, "Usage: bwa pac2hybpac <in.prefix>\n\n");
|
||
return 1;
|
||
}
|
||
char fn[MAX_PATH];
|
||
FILE* fp;
|
||
uint8_t* old_pac = NULL;
|
||
uint64_t l_pac = 0;
|
||
// fprintf(stderr, "here-1\n");
|
||
snprintf(fn, MAX_PATH, "%s.pac", argv[optind]);
|
||
_load_file_to_data(fn, old_pac);
|
||
sprintf(fn, "%s.ref-len", argv[optind]);
|
||
fp = xopen(fn, "r");
|
||
err_check_false(fscanf(fp, "%ld", &l_pac), EOF);
|
||
err_fclose(fp);
|
||
sprintf(fn, "%s.hyb.pac", argv[optind]);
|
||
// fprintf(stderr, "here-2\n");
|
||
convert_to_hyb_pac(old_pac, l_pac, fn);
|
||
return 0;
|
||
}
|
||
|
||
// 创建hybrid index,并保存到文件
|
||
int bwa_bwt2hyb(int argc, char* argv[]) {
|
||
int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix);
|
||
bwt_t* bwt;
|
||
int num_threads = 1;
|
||
char c;
|
||
int error = 0;
|
||
while ((c = getopt(argc, argv, "t:")) >= 0) {
|
||
switch (c) {
|
||
case 't':
|
||
num_threads = atoi(optarg);
|
||
assert(num_threads > 0 && num_threads < 512);
|
||
break;
|
||
default:
|
||
error = 1;
|
||
break;
|
||
}
|
||
}
|
||
if (optind + 1 > argc || error) {
|
||
fprintf(stderr, "Usage: bwa bwt2hyb [Options] <in.prefix>\n\n");
|
||
fprintf(stderr, "Options: -t INT number of threads for hybrid index building [%d]\n", num_threads);
|
||
fprintf(stderr, "\n");
|
||
return 1;
|
||
}
|
||
char fn[MAX_PATH];
|
||
snprintf(fn, MAX_PATH, "%s.bwt", argv[optind]);
|
||
bwt = bwt_restore_bwt(fn);
|
||
snprintf(fn, MAX_PATH, "%s.kmer", argv[optind]);
|
||
bwt->kmer_hash = bwt_restore_kmer_idx(fn);
|
||
snprintf(fn, MAX_PATH, "%s.hyb.bytesa", argv[optind]);
|
||
_load_file_to_data(fn, bwt->byte_sa);
|
||
hyb_idx_build_and_dump(num_threads, bwt, argv[optind]);
|
||
return 0;
|
||
}
|
||
|
||
// 尝试从share memory中加载hybrid index
|
||
HybridIndex* bwa_hyb_idx_load_from_shm(const char* idx_prefix) {
|
||
char fn[MAX_PATH];
|
||
uint8_t* ref_bits = (uint8_t*)shm_get_index(strcat(strcpy(fn, idx_prefix), HYB_PAC_SUFFIX));
|
||
uint8_t* sa = (uint8_t*)shm_get_index(strcat(strcpy(fn, idx_prefix), HYB_SA_SUFFIX));
|
||
uint8_t* kmer_data = (uint8_t*)shm_get_index(strcat(strcpy(fn, idx_prefix), HYB_KMER_SUFFIX));
|
||
uint8_t* index_data = (uint8_t*)shm_get_index(strcat(strcpy(fn, idx_prefix), HYB_DATA_SUFFIX));
|
||
if (!ref_bits || !sa || !kmer_data || !index_data) {
|
||
return NULL;
|
||
}
|
||
HybridIndex* hyb = (HybridIndex*)calloc(1, sizeof(HybridIndex));
|
||
hyb->ref_bits = ref_bits;
|
||
hyb->sa = sa;
|
||
hyb->kmer_data = kmer_data;
|
||
hyb->index_data = index_data;
|
||
return hyb;
|
||
}
|
||
|
||
// 从硬盘中加载hybrid index
|
||
HybridIndex* bwa_hyb_idx_load_from_disk(const char* idx_prefix) {
|
||
char fn[MAX_PATH];
|
||
FILE* fp = NULL;
|
||
struct stat st;
|
||
double sec_time;
|
||
|
||
#define __load_hybrid_idx_code(suffix, data) \
|
||
sec_time = realtime(); \
|
||
sprintf(fn, "%s%s", idx_prefix, suffix); \
|
||
err_check_true(stat(fn, &st), 0, fn); \
|
||
fp = xopen(fn, "r"); \
|
||
data = (uint8_t*)malloc(st.st_size); \
|
||
err_fread_noeof(data, 1, st.st_size, fp); \
|
||
err_fclose(fp); \
|
||
fprintf(stderr, "%s, %0.2f GB, %0.2f s\n", fn, (double)st.st_size / 1024 / 1024 / 1024, realtime() - sec_time);
|
||
|
||
HybridIndex* hyb = (HybridIndex*)calloc(1, sizeof(HybridIndex));
|
||
|
||
__load_hybrid_idx_code(HYB_PAC_SUFFIX, hyb->ref_bits);
|
||
// load hyb byte-sa
|
||
__load_hybrid_idx_code(HYB_SA_SUFFIX, hyb->sa);
|
||
// load hyb kmer data
|
||
__load_hybrid_idx_code(HYB_KMER_SUFFIX, hyb->kmer_data);
|
||
// load hyb index data
|
||
__load_hybrid_idx_code(HYB_DATA_SUFFIX, hyb->index_data);
|
||
|
||
return hyb;
|
||
}
|
||
|
||
// 在共享内存中处理hybrid index
|
||
int main_shm_hyb(int argc, char* argv[]) {
|
||
char c;
|
||
int clear_shm = 0;
|
||
int list_shm = 0;
|
||
int error = 0;
|
||
while ((c = getopt(argc, argv, "dl")) >= 0) {
|
||
switch (c) {
|
||
case 'd':
|
||
clear_shm = 1;
|
||
break;
|
||
case 'l':
|
||
list_shm = 1;
|
||
break;
|
||
default:
|
||
error = 1;
|
||
break;
|
||
}
|
||
}
|
||
|
||
// fprintf(stderr, "%d %d\n", optind, argc);
|
||
|
||
if ((optind == argc && !clear_shm && !list_shm) || error) {
|
||
fprintf(stderr, "Usage: bwa hybshm [-d|-l] [idx_prefix]\n\n");
|
||
fprintf(stderr, "Options: -d destroy all hyb indices in shared memory\n");
|
||
fprintf(stderr, " -l list names of indices in shared memory\n");
|
||
fprintf(stderr, "\n");
|
||
return 1;
|
||
}
|
||
if (list_shm) {
|
||
return list_shm_hyb_indices();
|
||
} else if (clear_shm) {
|
||
return shm_clear_hyb();
|
||
}
|
||
return shm_keep_hyb(argv[optind]);
|
||
}
|
||
|
||
////////////////////////////////////////////// for test /////////////////////////////////////
|
||
|
||
// 创建正向的kmer
|
||
uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) {
|
||
uint64_t qbit = 0, i;
|
||
qlen = qlen < kmer_len ? qlen : kmer_len;
|
||
for (i = 0; i < qlen; ++i) {
|
||
if (q[i] > 3)
|
||
break; // 要考虑碱基是N
|
||
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
|
||
}
|
||
*base_consumed = i;
|
||
return qbit;
|
||
}
|
||
|
||
int hyb_test(int argc, char* argv[]) {
|
||
bwt_t* bwt;
|
||
if (optind + 1 > argc) {
|
||
fprintf(stderr, "Usage: bwa hybtest <idx_prefix>\n\n");
|
||
return 1;
|
||
}
|
||
|
||
char fn[MAX_PATH];
|
||
snprintf(fn, MAX_PATH, "%s.bwt", argv[optind]);
|
||
|
||
bwt = bwt_restore_bwt(fn);
|
||
snprintf(fn, MAX_PATH, "%s.hyb.kmer", argv[optind]);
|
||
bwt->kmer_hash = bwt_restore_kmer_idx(fn);
|
||
|
||
uint8_t test_seq[14] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1}; // ACGTACGTACGTAC
|
||
bwtintv_t ok1, ok2;
|
||
ok1 = bwt_forward_search(bwt, test_seq, 14);
|
||
|
||
int base_consumed = 0;
|
||
uint64_t qbit = build_forward_kmer(test_seq, 14, 14, &base_consumed);
|
||
|
||
bwt_kmer_get(&bwt->kmer_hash, &ok2, qbit, 13);
|
||
|
||
fprintf(stderr, "forward search: [%lu, %lu, %lu), kmer idx: [%lu, %lu, %lu)\n", ok1.x[0], ok1.x[1], ok1.x[2], ok2.x[0], ok2.x[1], ok2.x[2]);
|
||
|
||
return 0;
|
||
} |