添加了创建byte_sa和kmer index的代码
This commit is contained in:
parent
b5ee622884
commit
74b464ee4a
|
|
@ -5,7 +5,13 @@ test64
|
|||
.*.swp
|
||||
Makefile.bak
|
||||
bwamem-lite
|
||||
|
||||
test_index/
|
||||
index/
|
||||
orig_index/
|
||||
run.sh
|
||||
debug.sh
|
||||
hybalign
|
||||
*.sam
|
||||
# ---> C
|
||||
# Prerequisites
|
||||
*.d
|
||||
|
|
|
|||
8
Makefile
8
Makefile
|
|
@ -4,13 +4,15 @@ CFLAGS= -g -Wall -Wno-unused-function -O3
|
|||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||
AR= ar
|
||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
||||
HYBOBJS= hyb_bwa.o hyb_utils.o hyb_seeding_1.o hyb_seeding_2.o hyb_seeding_3.o hyb_create_idx.o debug.o profiling.o
|
||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
|
||||
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o
|
||||
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
bwape.o kopen.o pemerge.o maxk.o \
|
||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||
PROG= bwa
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o \
|
||||
$(HYBOBJS)
|
||||
PROG= hybalign
|
||||
INCLUDES=
|
||||
LIBS= -lm -lz -lpthread
|
||||
SUBDIRS= .
|
||||
|
|
@ -34,7 +36,7 @@ endif
|
|||
|
||||
all:$(PROG)
|
||||
|
||||
bwa:libbwa.a $(AOBJS) main.o
|
||||
$(PROG):libbwa.a $(AOBJS) main.o
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
bwamem-lite:libbwa.a example.o
|
||||
|
|
|
|||
2
bwa.c
2
bwa.c
|
|
@ -280,7 +280,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
|
|||
tmp = calloc(strlen(prefix) + 5, 1);
|
||||
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
|
||||
bwt = bwt_restore_bwt(tmp);
|
||||
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
|
||||
strcat(strcpy(tmp, prefix), ".32.sa"); // partial suffix array (SA)
|
||||
bwt_restore_sa(tmp, bwt);
|
||||
free(tmp); free(prefix);
|
||||
return bwt;
|
||||
|
|
|
|||
213
bwt.c
213
bwt.c
|
|
@ -467,3 +467,216 @@ void bwt_destroy(bwt_t *bwt)
|
|||
free(bwt->sa); free(bwt->bwt);
|
||||
free(bwt);
|
||||
}
|
||||
|
||||
////////////////////////////////// new funcs for hyb-align ////////////////////////////////
|
||||
|
||||
// 设置某一行的排序值,sa的索引有效值从1开始,(0设置为-1, 小端模式)
|
||||
void bwt_set_sa(uint8_t* sa_arr, bwtint_t k, bwtint_t val) {
|
||||
// const bwtint_t block_idx = (k >> 3) * 33;
|
||||
const bwtint_t block_idx = ((k >> 3) << 5) + (k >> 3); // 8个数为一组,共享33个字节
|
||||
const int val_idx_in_block = k & 7;
|
||||
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
|
||||
bwtint_t* sa_addr = (bwtint_t*)(sa_arr + start_byte_idx);
|
||||
// *sa_addr &= (1 << val_idx_in_block) - 1; //
|
||||
// 如果开辟内存的时候清零了,这一步可以省略,会清除后面的数据,只适合按递增顺序赋值
|
||||
*sa_addr |= (val & ((1L << 33) - 1)) << val_idx_in_block;
|
||||
}
|
||||
|
||||
// 获取某一行的排序值(小端模式)
|
||||
bwtint_t bwt_get_sa(uint8_t* sa_arr, bwtint_t k) {
|
||||
// const bwtint_t block_idx = (k >> 3) * 33;
|
||||
const bwtint_t block_idx = ((k >> 3) << 5) + (k >> 3); // 8个数为一组,共享33个字节
|
||||
const int val_idx_in_block = k & 7;
|
||||
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
|
||||
bwtint_t val = *(bwtint_t*)(sa_arr + start_byte_idx);
|
||||
val = (val >> val_idx_in_block) & 8589934591;
|
||||
return val;
|
||||
}
|
||||
|
||||
/*
|
||||
* bwtint_t bwt_get_sa(uint8_t* sa_arr, bwtint_t k) {
|
||||
* const bwtint_t start_byte = ((k << 5) + k) >> 3; // 存储这个sa数据的起始字节
|
||||
* bwtint_t val = *(bwtint_t*)(sa_arr + start_byte);
|
||||
* val = (val >> (k & 7)) & 8589934591;
|
||||
* return val;
|
||||
* }
|
||||
*/
|
||||
|
||||
// 第一次创建索引时,创建不采样的SA(24G)和采样间隔为4的SA(6G)
|
||||
// bwt->bwt and bwt->occ must be precalculated
|
||||
void bwt_cal_byte_sa(bwt_t* bwt) {
|
||||
bwtint_t isa, sa, i, block_size; // S(isa) = sa isa是后缀数组的索引,sa表示位置
|
||||
double tmp_time, elapsed_time;
|
||||
int intv = 1;
|
||||
int intv_round = intv; // 间隔多少来保存一个位置信息
|
||||
|
||||
kv_roundup32(intv_round);
|
||||
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
|
||||
xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
|
||||
|
||||
if (bwt->byte_sa)
|
||||
free(bwt->byte_sa);
|
||||
bwt->sa_intv = intv;
|
||||
bwt->n_sa = (bwt->seq_len + intv) / intv;
|
||||
bwt->byte_sa = (uint8_t*)calloc(SA_BYTES(bwt->n_sa), 1); // 用33位表示位置
|
||||
|
||||
fprintf(stderr, "bytes: %ld, sa size: %ld\n", SA_BYTES(bwt->n_sa), bwt->n_sa);
|
||||
// calculate SA value
|
||||
isa = 0;
|
||||
sa = bwt->seq_len;
|
||||
block_size = bwt->seq_len / 100;
|
||||
tmp_time = realtime();
|
||||
for (i = 0; i < bwt->seq_len; ++i) {
|
||||
if (i % block_size == 0) {
|
||||
elapsed_time = realtime() - tmp_time;
|
||||
fprintf(stderr, "%ld%% percent complished. %f s elapsed.\n", i / block_size, elapsed_time);
|
||||
}
|
||||
if (isa % intv == 0) {
|
||||
bwt_set_sa(bwt->byte_sa, isa / intv, sa); // 第一个位置是$,所以位置就是序列长度
|
||||
if (i % (block_size / 2) == 0) {
|
||||
fprintf(stderr, "%ld %ld\n", sa, bwt_get_sa(bwt->byte_sa, isa / intv)); // 验证结果正确性
|
||||
}
|
||||
}
|
||||
--sa; // 从后往前,一个位置一个位置的找对应的后缀数组,isa就是与sa对应的后缀数组排序后在sa数组中的相对位置
|
||||
isa = bwt_invPsi(bwt, isa); // 下一个后缀数组的相对位置
|
||||
}
|
||||
if (isa % intv == 0)
|
||||
bwt_set_sa(bwt->byte_sa, isa / intv, sa);
|
||||
// bwt_set_sa(bwt->byte_sa, 0, (bwtint_t)-1); // 赋值成-1也没问题,set_sa那里已经修正了
|
||||
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // 2的33次方-1 before this line, bwt->sa[0] = bwt->seq_len
|
||||
}
|
||||
|
||||
// 保存byte_sa
|
||||
void bwt_dump_byte_sa(const char* fn, const bwt_t* bwt) {
|
||||
FILE* fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
|
||||
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->byte_sa, 1, SA_BYTES(bwt->n_sa), fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
// 设置kmer第pos个碱基对应的fmt匹配信息
|
||||
void kmer_setval_at(uint8_t* mem_addr, bwtintv_t ik, int pos) {
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t* arr = mem_addr + byte_idx;
|
||||
arr[0] = (uint8_t)(ik.x[0] >> 32);
|
||||
*((uint32_t*)(arr + 1)) = (uint32_t)ik.x[0];
|
||||
arr += 5;
|
||||
arr[0] = (uint8_t)(ik.x[1] >> 32);
|
||||
*((uint32_t*)(arr + 1)) = (uint32_t)ik.x[1];
|
||||
arr += 5;
|
||||
*((uint32_t*)arr) = (uint32_t)ik.x[2];
|
||||
}
|
||||
|
||||
// 获取kmer的bwt匹配信息
|
||||
void kmer_getval_at(uint8_t* mem_addr, bwtintv_t* ok, int pos) {
|
||||
bwtint_t x0, x1, x2;
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t* arr = mem_addr + byte_idx;
|
||||
x0 = *arr;
|
||||
x0 = (x0 << 32) | *((uint32_t*)(arr + 1));
|
||||
arr += 5;
|
||||
x1 = *arr;
|
||||
x1 = (x1 << 32) | *((uint32_t*)(arr + 1));
|
||||
arr += 5;
|
||||
x2 = *((uint32_t*)arr);
|
||||
ok->x[0] = x0;
|
||||
ok->x[1] = x1;
|
||||
ok->x[2] = x2;
|
||||
}
|
||||
|
||||
// 获取kmer对应的fmt匹配信息, pos should be [0, 13], qbit一定是14个碱基对应的编码, pos是指在14个碱基中的位置
|
||||
void bwt_kmer_get(const KmerHash* kmer_hash, bwtintv_t* ok, uint32_t qbit, int pos) {
|
||||
#if HASH_KMER_LEN == 14
|
||||
if (pos == 13)
|
||||
kmer_getval_at(kmer_hash->ke14[qbit].intv_arr, ok, 0);
|
||||
else if (pos == 12)
|
||||
kmer_getval_at(kmer_hash->ke13[qbit >> 2].intv_arr, ok, 0);
|
||||
else if (pos == 11)
|
||||
kmer_getval_at(kmer_hash->ke12[qbit >> 4].intv_arr, ok, 0);
|
||||
else if (pos == 10) // 其实对应kmer长度为10
|
||||
kmer_getval_at(kmer_hash->ke11[qbit >> 6].intv_arr, ok, 0);
|
||||
else
|
||||
kmer_getval_at(kmer_hash->ke10[qbit >> 8].intv_arr, ok, pos);
|
||||
#elif HASH_KMER_LEN == 13
|
||||
if (pos == 12)
|
||||
kmer_getval_at(kmer_hash->ke13[qbit].intv_arr, ok, 0);
|
||||
else if (pos == 11)
|
||||
kmer_getval_at(kmer_hash->ke12[qbit >> 2].intv_arr, ok, 0);
|
||||
else if (pos == 10)
|
||||
kmer_getval_at(kmer_hash->ke11[qbit >> 4].intv_arr, ok, 0);
|
||||
else
|
||||
kmer_getval_at(kmer_hash->ke10[qbit >> 6].intv_arr, ok, pos);
|
||||
#else
|
||||
if (pos == 11)
|
||||
kmer_getval_at(kmer_hash->ke12[qbit].intv_arr, ok, 0);
|
||||
else if (pos == 10)
|
||||
kmer_getval_at(kmer_hash->ke11[qbit >> 2].intv_arr, ok, 0);
|
||||
else
|
||||
kmer_getval_at(kmer_hash->ke10[qbit >> 4].intv_arr, ok, pos);
|
||||
#endif
|
||||
}
|
||||
|
||||
// 只是用来计算kmer-index,所以q中只包含A/C/G/T,不包含N
|
||||
bwtintv_t bwt_forward_search(bwt_t* bwt, const uint8_t* q, int qlen) {
|
||||
bwtintv_t ik, ok[4];
|
||||
int i, c1;
|
||||
bwt_set_intv(bwt, q[0], ik);
|
||||
// 每次扩展一个碱基
|
||||
for (i = 1; i < qlen; ++i) {
|
||||
c1 = 3 - q[i];
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
ik = ok[c1];
|
||||
}
|
||||
ik.info = qlen;
|
||||
return ik;
|
||||
}
|
||||
|
||||
// 将kmer hash数据写入到文件
|
||||
void bwt_dump_kmer_idx(const char* fn, const KmerHash* kh) {
|
||||
FILE* fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(kh->ke10, 1, (1 << (10 << 1)) * sizeof(KmerEntryArr), fp);
|
||||
err_fwrite(kh->ke11, 1, (1 << (11 << 1)) * sizeof(KmerEntry), fp);
|
||||
err_fwrite(kh->ke12, 1, (1 << (12 << 1)) * sizeof(KmerEntry), fp);
|
||||
#if HASH_KMER_LEN > 12
|
||||
err_fwrite(kh->ke13, 1, (1 << (13 << 1)) * sizeof(KmerEntry), fp);
|
||||
#endif
|
||||
#if HASH_KMER_LEN > 13
|
||||
err_fwrite(kh->ke14, 1, (1 << (14 << 1)) * sizeof(KmerEntry), fp);
|
||||
#endif
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
// 从文件中读取kmer hash信息
|
||||
KmerHash bwt_restore_kmer_idx(const char* fn) {
|
||||
FILE* fp = xopen(fn, "rb");
|
||||
KmerHash khash;
|
||||
KmerHash* kh = &khash;
|
||||
|
||||
#define LOAD_KMER_HASH(kmer_len) \
|
||||
do { \
|
||||
len = 1 << (kmer_len << 1); \
|
||||
kh->ke##kmer_len = (KmerEntry*)malloc(len * sizeof(KmerEntry)); \
|
||||
fread_fix(fp, len * sizeof(KmerEntry), kh->ke##kmer_len); \
|
||||
} while (0)
|
||||
|
||||
int len = 1 << (10 << 1);
|
||||
kh->ke10 = (KmerEntryArr*)malloc(len * sizeof(KmerEntryArr));
|
||||
fread_fix(fp, len * sizeof(KmerEntryArr), kh->ke10);
|
||||
LOAD_KMER_HASH(11);
|
||||
LOAD_KMER_HASH(12);
|
||||
#if HASH_KMER_LEN > 12
|
||||
LOAD_KMER_HASH(13);
|
||||
#endif
|
||||
#if HASH_KMER_LEN > 13
|
||||
LOAD_KMER_HASH(14);
|
||||
#endif
|
||||
err_fclose(fp);
|
||||
return khash;
|
||||
}
|
||||
39
bwt.h
39
bwt.h
|
|
@ -45,6 +45,28 @@ typedef unsigned char ubyte_t;
|
|||
|
||||
typedef uint64_t bwtint_t;
|
||||
|
||||
#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8) /* 8字节对齐*/
|
||||
#define HASH_KMER_LEN 14
|
||||
|
||||
// 用来保存kmer对应的bwt的位置信息
|
||||
typedef struct {
|
||||
// 40+40+32 14个byte,这样好处理
|
||||
uint8_t intv_arr[14]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
|
||||
} KmerEntry;
|
||||
|
||||
typedef struct {
|
||||
uint8_t intv_arr[140]; // 保存长度为10的kmer,每个碱基对应的bwt匹配信息
|
||||
} KmerEntryArr;
|
||||
|
||||
// 保存各个位置对应的bwt匹配信息
|
||||
typedef struct {
|
||||
KmerEntryArr* ke10;
|
||||
KmerEntry* ke11;
|
||||
KmerEntry* ke12;
|
||||
KmerEntry* ke13;
|
||||
KmerEntry* ke14;
|
||||
} KmerHash;
|
||||
|
||||
typedef struct {
|
||||
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
|
||||
bwtint_t L2[5]; // C(), cumulative count
|
||||
|
|
@ -57,6 +79,8 @@ typedef struct {
|
|||
int sa_intv;
|
||||
bwtint_t n_sa;
|
||||
bwtint_t *sa;
|
||||
uint8_t* byte_sa; // byte-based Suffix-Array
|
||||
KmerHash kmer_hash; // 保存kmer对应的bwt位置信息
|
||||
} bwt_t;
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -125,8 +149,21 @@ extern "C" {
|
|||
|
||||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
|
||||
|
||||
|
||||
// new funcs for hyb-align
|
||||
void bwt_cal_byte_sa(bwt_t* bwt);
|
||||
void bwt_set_sa(uint8_t* sa_arr, bwtint_t k, bwtint_t val);
|
||||
bwtint_t bwt_get_sa(uint8_t* sa_arr, bwtint_t k);
|
||||
void kmer_getval_at(uint8_t* mem_addr, bwtintv_t* ok, int pos);
|
||||
void kmer_setval_at(uint8_t* mem_addr, bwtintv_t ik, int pos);
|
||||
void bwt_kmer_get(const KmerHash* kmer_hash, bwtintv_t* ok, uint32_t qbit, int pos);
|
||||
void bwt_dump_byte_sa(const char *fn, const bwt_t *bwt);
|
||||
bwtintv_t bwt_forward_search(bwt_t* bwt, const uint8_t* q, int qlen);
|
||||
void bwt_dump_kmer_idx(const char* fn, const KmerHash* kh);
|
||||
KmerHash bwt_restore_kmer_idx(const char* fn);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
|||
10
bwtindex.c
10
bwtindex.c
|
|
@ -269,6 +269,16 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
|
|||
t = clock();
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... ");
|
||||
l_pac = bns_fasta2bntseq(fp, prefix, 0);
|
||||
uint64_t seq_len = l_pac >> 1;
|
||||
strcpy(str, prefix); strcat(str, ".ref-len");
|
||||
FILE* f_len = fopen(str, "w");
|
||||
fprintf(f_len, "%lu\n", seq_len);
|
||||
fclose(f_len);
|
||||
memset(str, 0, strlen(prefix) + 10);
|
||||
|
||||
//fprintf(stderr, "len: %ld, %ld\n", l_pac, seq_len);
|
||||
//exit(0);
|
||||
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
err_gzclose(fp);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,66 @@
|
|||
/*********************************************************************************************
|
||||
Description: data and files for debugging
|
||||
Copyright : All right reserved by NCIC.ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2024/04/09
|
||||
***********************************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include "debug.h"
|
||||
|
||||
FILE *gf[4] = {0},
|
||||
*gfq[4] = {0},
|
||||
*gft[4] = {0},
|
||||
*gfi[4] = {0};
|
||||
|
||||
uint64_t debug_num = 0;
|
||||
|
||||
uint64_t all_match_len = 0;
|
||||
uint64_t all_seq_num = 0;
|
||||
uint64_t all_type_hits = 0;
|
||||
double seed_time = 0;
|
||||
|
||||
int open_qti_files()
|
||||
{
|
||||
char fn[1024] = {0};
|
||||
int i = 0;
|
||||
for (; i < 4; ++i)
|
||||
{
|
||||
sprintf(fn, "./output/q%d.txt", i);
|
||||
gfq[i] = fopen(fn, "w");
|
||||
sprintf(fn, "./output/t%d.txt", i);
|
||||
gft[i] = fopen(fn, "w");
|
||||
sprintf(fn, "./output/i%d.txt", i);
|
||||
gfi[i] = fopen(fn, "w");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int open_debug_files()
|
||||
{
|
||||
char fn[1024] = {0};
|
||||
int i = 0;
|
||||
for (; i < 4; ++i)
|
||||
{
|
||||
sprintf(fn, "./output/fp%d.txt", i);
|
||||
gf[i] = fopen(fn, "w");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int close_files()
|
||||
{
|
||||
int i = 0;
|
||||
for (; i < 4; ++i)
|
||||
{
|
||||
if (gf[i] != 0)
|
||||
fclose(gf[i]);
|
||||
if (gfq[i] != 0)
|
||||
fclose(gfq[i]);
|
||||
if (gft[i] != 0)
|
||||
fclose(gft[i]);
|
||||
if (gfi[i] != 0)
|
||||
fclose(gfi[i]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
/*********************************************************************************************
|
||||
Description: data and files for debugging
|
||||
Copyright : All right reserved by NCIC.ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2024/04/09
|
||||
***********************************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
////////////////// for debug and test //////////////////////////
|
||||
|
||||
#define DEBUG_FILE_OUTPUT // 打开gfp1-4文件,并记录debug信息
|
||||
// #define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度,以及最终扩展的长度
|
||||
// #define GET_FULL_MATCH_READ // 获取完全匹配的reads
|
||||
// #define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量
|
||||
// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的query,target,和其他用于计算的数据
|
||||
// #define GET_KSW_ALIGN_SEQ
|
||||
// #define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
extern FILE *gf[4], *gfq[4], *gft[4], *gfi[4];
|
||||
|
||||
extern uint64_t debug_num;
|
||||
extern uint64_t all_match_len;
|
||||
extern uint64_t all_seq_num;
|
||||
extern uint64_t all_type_hits;
|
||||
extern double seed_time;
|
||||
|
||||
int open_qti_files();
|
||||
|
||||
int open_debug_files();
|
||||
|
||||
int close_files();
|
||||
|
|
@ -0,0 +1,265 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2025- Zhonghai Zhang (ICT)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <zlib.h>
|
||||
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
#include "bwt.h"
|
||||
#include "rle.h"
|
||||
#include "rope.h"
|
||||
#include "utils.h"
|
||||
#include "kvec.h"
|
||||
#include "hyb_idx.h"
|
||||
|
||||
|
||||
#ifdef _DIVBWT
|
||||
#include "divsufsort.h"
|
||||
#endif
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
#include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
int bwa_bwt2fullbytesa(int argc, char* argv[]) // the "bwt2bytesa" command
|
||||
{
|
||||
bwt_t* bwt;
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: %s bwt2bytesa <in.bwt> <out.bytesa>\n", PROG_NAME);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
bwt_cal_byte_sa(bwt);
|
||||
_store_data_to_file(argv[optind + 1], bwt->byte_sa, sizeof(bwtint_t), SA_BYTES(bwt->n_sa) >> 3);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// extract suffix array from non-sampled suffix array
|
||||
int bwa_extract_sa(int argc, char* argv[]) {
|
||||
bwt_t* bwt;
|
||||
int c, sa_intv = 32;
|
||||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'i':
|
||||
sa_intv = atoi(optarg);
|
||||
break;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 3 > argc) {
|
||||
fprintf(stderr, "Usage: bwa extractsa [-i %d] <in.bwt> <in.full_bytesa> <out.sa>\n", sa_intv);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
|
||||
fprintf(stderr, "bwt seq_len %ld, intv: %d\n", bwt->seq_len, sa_intv);
|
||||
//exit(0);
|
||||
|
||||
_load_file_to_data(argv[optind + 1], bwt->byte_sa);
|
||||
|
||||
if (bwt->sa) free(bwt->sa);
|
||||
bwt->sa_intv = sa_intv;
|
||||
bwt->n_sa = (bwt->seq_len + sa_intv) / sa_intv;
|
||||
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
|
||||
uint64_t i;
|
||||
for (i = 0; i < bwt->seq_len; ++i) {
|
||||
if (i % sa_intv == 0)
|
||||
bwt->sa[i / sa_intv] = bwt_get_sa(bwt->byte_sa, i);
|
||||
}
|
||||
if (i % sa_intv == 0)
|
||||
bwt->sa[i / sa_intv] = bwt_get_sa(bwt->byte_sa, i);
|
||||
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
|
||||
// bwt_cal_sa(bwt, sa_intv);
|
||||
bwt_dump_sa(argv[optind + 2], bwt);
|
||||
// bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// extract byte suffix array from non-sampled suffix array
|
||||
int bwa_extract_byte_sa(int argc, char* argv[]) {
|
||||
bwt_t* bwt;
|
||||
int c, sa_intv = 32;
|
||||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'i':
|
||||
sa_intv = atoi(optarg);
|
||||
break;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 3 > argc) {
|
||||
fprintf(stderr, "Usage: bwa extractbytesa [-i %d] <in.bwt> <in.full_bytesa> <out.bytesa>\n", sa_intv);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
|
||||
// fprintf(stderr, "bwt seq_len %ld, intv: %d\n", bwt->seq_len, sa_intv);
|
||||
// exit(0);
|
||||
|
||||
uint8_t *full_byte_sa = NULL;
|
||||
_load_file_to_data(argv[optind + 1], full_byte_sa);
|
||||
|
||||
bwt->sa_intv = sa_intv;
|
||||
bwt->n_sa = (bwt->seq_len + sa_intv) / sa_intv;
|
||||
bwt->byte_sa = (uint8_t*)calloc(SA_BYTES(bwt->n_sa), 1);
|
||||
uint64_t i;
|
||||
for (i = 0; i < bwt->seq_len; ++i) {
|
||||
if (i % sa_intv == 0)
|
||||
bwt_set_sa(bwt->byte_sa, i / sa_intv, bwt_get_sa(full_byte_sa, i));
|
||||
}
|
||||
if (i % sa_intv == 0)
|
||||
bwt_set_sa(bwt->byte_sa, i / sa_intv, bwt_get_sa(full_byte_sa, i));
|
||||
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
|
||||
bwt_dump_byte_sa(argv[optind + 2], bwt);
|
||||
// bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 生成所有KMER_LEN长度的序列,字节表示(0A,1C,2G,3T)
|
||||
void gen_kmer_bits(uint8_t** seq_arr, uint64_t* kmer_arr_size, int kmer_len) {
|
||||
uint64_t i;
|
||||
uint64_t seq_up_val = (1L << (kmer_len << 1));
|
||||
*kmer_arr_size = seq_up_val;
|
||||
*seq_arr = (uint8_t*)realloc(*seq_arr, seq_up_val * (uint64_t)kmer_len);
|
||||
for (i = 0; i < seq_up_val; ++i) {
|
||||
const uint64_t base_idx = i * kmer_len;
|
||||
for (int j = kmer_len - 1; j >= 0; --j) {
|
||||
(*seq_arr)[base_idx + kmer_len - 1 - j] = (i >> (j << 1)) & 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bwt_create_kmer_idx(bwt_t* bwt) {
|
||||
uint64_t kmer_arr_size = 0;
|
||||
uint8_t* seq_arr = 0;
|
||||
bwtintv_t ik, ok[4];
|
||||
uint64_t j;
|
||||
int i, c1;
|
||||
int qlen = 10;
|
||||
KmerHash* kh = &bwt->kmer_hash;
|
||||
|
||||
kh->ke10 = (KmerEntryArr*)malloc((1 << (10 << 1)) * sizeof(KmerEntryArr));
|
||||
kh->ke11 = (KmerEntry*)malloc((1 << (11 << 1)) * sizeof(KmerEntry));
|
||||
kh->ke12 = (KmerEntry*)malloc((1 << (12 << 1)) * sizeof(KmerEntry));
|
||||
kh->ke13 = (KmerEntry*)malloc((1 << (13 << 1)) * sizeof(KmerEntry));
|
||||
kh->ke14 = (KmerEntry*)malloc((1 << (14 << 1)) * sizeof(KmerEntry));
|
||||
|
||||
// 长度为10的kmer
|
||||
gen_kmer_bits(&seq_arr, &kmer_arr_size, 10);
|
||||
for (j = 0; j < kmer_arr_size; ++j) {
|
||||
uint8_t* q = &seq_arr[j * 10];
|
||||
uint8_t* mem_addr = kh->ke10[j].intv_arr;
|
||||
bwt_set_intv(bwt, q[0], ik);
|
||||
kmer_setval_at(mem_addr, ik, 0);
|
||||
// 每次扩展一个碱基
|
||||
for (i = 1; i < qlen; ++i) {
|
||||
c1 = 3 - q[i];
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
ik = ok[c1];
|
||||
kmer_setval_at(mem_addr, ik, i);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "Kmer idx for length 10 done.\n");
|
||||
#define GEN_KMER_FOR_LEN(len) \
|
||||
do { \
|
||||
gen_kmer_bits(&seq_arr, &kmer_arr_size, (len)); \
|
||||
for (j = 0; j < kmer_arr_size; ++j) { \
|
||||
bwtintv_t p = bwt_forward_search(bwt, &seq_arr[j * (len)], (len)); \
|
||||
kmer_setval_at(bwt->kmer_hash.ke##len[j].intv_arr, p, 0); \
|
||||
} \
|
||||
fprintf(stderr, "Kmer idx for length %d done.\n", len); \
|
||||
} while (0)
|
||||
|
||||
GEN_KMER_FOR_LEN(11); // 长度11的kmer
|
||||
GEN_KMER_FOR_LEN(12);
|
||||
GEN_KMER_FOR_LEN(13);
|
||||
GEN_KMER_FOR_LEN(14);
|
||||
free(seq_arr);
|
||||
}
|
||||
|
||||
int bwa_bwt2kmer(int argc, char* argv[]) {
|
||||
bwt_t* bwt;
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa extractbytesa <in.bwt> <out.kmer>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
|
||||
bwt_create_kmer_idx(bwt);
|
||||
|
||||
bwt_dump_kmer_idx(argv[optind + 1], &bwt->kmer_hash);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_bwt2hyb(int argc, char* argv[]) {}
|
||||
|
||||
// 创建正向的kmer
|
||||
inline uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) {
|
||||
uint64_t qbit = 0, i;
|
||||
qlen = qlen < kmer_len ? qlen : kmer_len;
|
||||
for (i = 0; i < qlen; ++i) {
|
||||
if (q[i] > 3)
|
||||
break; // 要考虑碱基是N
|
||||
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
|
||||
}
|
||||
*base_consumed = i;
|
||||
return qbit;
|
||||
}
|
||||
|
||||
int hyb_test(int argc, char* argv[]) {
|
||||
bwt_t* bwt;
|
||||
if (optind + 1 > argc) {
|
||||
fprintf(stderr, "Usage: bwa hybtest <idx_prefix>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
char fn[MAX_PATH];
|
||||
snprintf(fn, MAX_PATH, "%s.bwt", argv[optind]);
|
||||
|
||||
bwt = bwt_restore_bwt(fn);
|
||||
snprintf(fn, MAX_PATH, "%s.hyb.kmer", argv[optind]);
|
||||
bwt->kmer_hash = bwt_restore_kmer_idx(fn);
|
||||
|
||||
uint8_t test_seq[14] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1}; // ACGTACGTACGTAC
|
||||
bwtintv_t ok1, ok2;
|
||||
ok1 = bwt_forward_search(bwt, test_seq, 14);
|
||||
|
||||
int base_consumed = 0;
|
||||
uint64_t qbit = build_forward_kmer(test_seq, 14, 14, &base_consumed);
|
||||
|
||||
bwt_kmer_get(&bwt->kmer_hash, &ok2, qbit, 13);
|
||||
|
||||
fprintf(stderr, "forward search: [%lu, %lu, %lu), kmer idx: [%lu, %lu, %lu)\n", ok1.x[0], ok1.x[1], ok1.x[2], ok2.x[0], ok2.x[1], ok2.x[2]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
#pragma once
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "debug.h"
|
||||
#include "kvec.h"
|
||||
#include "utils.h"
|
||||
|
||||
47
main.c
47
main.c
|
|
@ -54,8 +54,12 @@ int main_pemerge(int argc, char *argv[]);
|
|||
int main_maxk(int argc, char *argv[]);
|
||||
|
||||
int bwa_bwt2kmer(int argc, char* argv[]); // create kmer-index from bwt
|
||||
int bwa_bwt2bytesa(int argc, char* argv[]); // create byte-based Suffix-Array
|
||||
int bwa_bwt2fullbytesa(int argc, char* argv[]); // create full byte-based Suffix-Array
|
||||
int bwa_bwt2hyb(int argc, char* argv[]); // create hybrid-index
|
||||
int bwa_extract_sa(int argc, char* argv[]); // extract suffix array from non-sampled suffix array
|
||||
int bwa_extract_byte_sa(int argc, char* argv[]); // extract suffix array from non-sampled suffix array
|
||||
|
||||
int hyb_test(int argc, char* argv[]); // for test
|
||||
|
||||
static int usage()
|
||||
{
|
||||
|
|
@ -64,24 +68,26 @@ static int usage()
|
|||
fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
|
||||
fprintf(stderr, "Contact: Heng Li <hli@ds.dfci.harvard.edu>\n\n");
|
||||
fprintf(stderr, "Usage: bwa <command> [options]\n\n");
|
||||
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
|
||||
fprintf(stderr, " mem BWA-MEM algorithm\n");
|
||||
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
||||
fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
|
||||
fprintf(stderr, " aln gapped/ungapped alignment\n");
|
||||
fprintf(stderr, " samse generate alignment (single ended)\n");
|
||||
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
||||
fprintf(stderr, " bwasw BWA-SW for long queries (DEPRECATED)\n");
|
||||
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
|
||||
fprintf(stderr, " mem BWA-MEM algorithm\n");
|
||||
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
||||
fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
|
||||
fprintf(stderr, " aln gapped/ungapped alignment\n");
|
||||
fprintf(stderr, " samse generate alignment (single ended)\n");
|
||||
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
||||
fprintf(stderr, " bwasw BWA-SW for long queries (DEPRECATED)\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " shm manage indices in shared memory\n");
|
||||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
|
||||
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
|
||||
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
|
||||
fprintf(stderr, " bwt2bytesa generate SA(using byte array) from BWT and Occ\n");
|
||||
fprintf(stderr, " bwt2kmer generate kmer hash index from bwt to accelarate the first 14 bases in seeding process.\n");
|
||||
fprintf(stderr, " bwt2hyb generate hybrid index from BWT\n");
|
||||
fprintf(stderr, " shm manage indices in shared memory\n");
|
||||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
|
||||
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
|
||||
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
|
||||
fprintf(stderr, " bwt2fullbytesa generate SA(using byte array) from BWT and Occ\n");
|
||||
fprintf(stderr, " bwt2kmer generate kmer hash index from bwt to accelarate the first 14 bases in seeding process.\n");
|
||||
fprintf(stderr, " bwt2hyb generate hybrid index from BWT\n");
|
||||
fprintf(stderr, " extractsa generate sa from full byte suffix array\n");
|
||||
fprintf(stderr, " extractbytesa generate byte sa from full byte suffix array\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr,
|
||||
"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
|
||||
|
|
@ -119,9 +125,12 @@ int main(int argc, char *argv[])
|
|||
else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "maxk") == 0) ret = main_maxk(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc - 1, argv + 1);
|
||||
else if (strcmp(argv[1], "bwt2fullbytesa") == 0) ret = bwa_bwt2fullbytesa(argc - 1, argv + 1);
|
||||
else if (strcmp(argv[1], "bwt2kmer") == 0) ret = bwa_bwt2kmer(argc - 1, argv + 1);
|
||||
else if (strcmp(argv[1], "bwt2hyb") == 0) ret = bwa_bwt2hyb(argc - 1, argv + 1);
|
||||
else if (strcmp(argv[1], "extractsa") == 0) ret = bwa_extract_sa(argc - 1, argv + 1);
|
||||
else if (strcmp(argv[1], "extractbytesa") == 0) ret = bwa_extract_byte_sa(argc - 1, argv + 1);
|
||||
else if (strcmp(argv[1], "hybtest") == 0) ret = hyb_test(argc - 1, argv + 1);
|
||||
else {
|
||||
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
|
||||
return 1;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,221 @@
|
|||
/*
|
||||
Description: profiling related data
|
||||
|
||||
Copyright : All right reserved by ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2024/04/06
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "utils.h"
|
||||
#include "profiling.h"
|
||||
#include "debug.h"
|
||||
|
||||
#ifdef SHOW_PERF
|
||||
uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD] = {0};
|
||||
uint64_t proc_freq = 1000;
|
||||
uint64_t gprof[LIM_GLOBAL_PROF_TYPE] = {0};
|
||||
#endif
|
||||
|
||||
#ifdef SHOW_DATA_PERF
|
||||
/*
|
||||
tdat[0]: read nums
|
||||
tdat[1]: seed-1 full match nums
|
||||
*/
|
||||
int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
|
||||
int64_t t_sd[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
|
||||
int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0};
|
||||
int64_t gd1[LIM_GLOBAL_DATA_TYPE] = {0};
|
||||
|
||||
#endif
|
||||
|
||||
int64_t sum(int64_t *a, int len) {
|
||||
int64_t res = 0;
|
||||
int i = 0;
|
||||
for (i=0; i<len; ++i) {
|
||||
res += a[i];
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
int find_opt(uint64_t *a, int len, double *max, double *min, double *avg)
|
||||
{
|
||||
int i = 0;
|
||||
uint64_t umax = 0, umin = UINT64_MAX, uavg = 0;
|
||||
for (i = 0; i < len; i++)
|
||||
{
|
||||
if (a[i] > umax) umax = a[i];
|
||||
if (a[i] < umin) umin = a[i];
|
||||
uavg += a[i];
|
||||
}
|
||||
*avg = uavg * 1.0 / len / proc_freq;
|
||||
*max = umax * 1.0 / proc_freq;
|
||||
*min = umin * 1.0 / proc_freq;
|
||||
return 1;
|
||||
}
|
||||
|
||||
uint64_t get_sum(uint64_t *a, int len) {
|
||||
int i = 0;
|
||||
uint64_t all = 0;
|
||||
for (i = 0; i < len; i++) {
|
||||
all += a[i];
|
||||
}
|
||||
return all;
|
||||
}
|
||||
|
||||
int display_stats(int nthreads)
|
||||
{
|
||||
#ifdef SHOW_PERF
|
||||
double avg, max, min;
|
||||
int i;
|
||||
|
||||
fprintf(stderr, "[steps in main_mem]\n");
|
||||
fprintf(stderr, "time_parse_arg: %0.2lf s\n", gprof[G_PREPARE] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_load_idx: %0.2lf s\n", gprof[G_LOAD_IDX] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_pipeline: %0.2lf s\n", gprof[G_PIPELINE] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_all: %0.2lf s\n", gprof[G_ALL] * 1.0 / proc_freq);
|
||||
|
||||
fprintf(stderr, "\n[steps in pipeline]\n");
|
||||
fprintf(stderr, "time_read: %0.2lf s\n", gprof[G_READ] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_compute: %0.2lf s\n", gprof[G_COMPUTE] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_write: %0.2lf s\n", gprof[G_WRITE] * 1.0 / proc_freq);
|
||||
|
||||
fprintf(stderr, "\n[steps in mem_process_seqs]\n");
|
||||
fprintf(stderr, "time_mem_prepare: %0.2lf s\n", gprof[G_MEM_PREPARE] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_mem_kernel: %0.2lf s\n", gprof[G_MEM_KERNEL] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_mem_pestat: %0.2lf s\n", gprof[G_MEM_PESTAT] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_mem_sam: %0.2lf s\n", gprof[G_MEM_SAM] * 1.0 / proc_freq);
|
||||
|
||||
fprintf(stderr, "\n[steps in kernel]\n");
|
||||
find_opt(tprof[T_SEED_ALL], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_seed_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min);
|
||||
find_opt(tprof[T_CHAIN_ALL], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_chain_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min);
|
||||
find_opt(tprof[T_ALN_ALL], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_aln_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min);
|
||||
find_opt(tprof[T_INS_SIZE], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_ins_size_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min);
|
||||
|
||||
fprintf(stderr, "\n[steps in seeding]\n");
|
||||
find_opt(tprof[T_SEED_1], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_seed_1: %0.2lf s %0.2lf s %0.2lf s\n", max, min, avg);
|
||||
find_opt(tprof[T_SEED_2], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_seed_2: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_SEED_3], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_seed_3: %0.2lf s\n", avg);
|
||||
|
||||
fprintf(stderr, "\n[steps in chain]\n");
|
||||
find_opt(tprof[T_GEN_CHAIN], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_gen_chain: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_FLT_CHAIN], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_flt_chain: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_FLT_CHANNED_SEEDS], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_flt_chained_seeds: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_SAL], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_sal: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_BSW], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_bsw: %0.2lf s\n", avg);
|
||||
|
||||
fprintf(stderr, "\n[steps in gen sam]\n");
|
||||
find_opt(tprof[T_SAM_MATESW], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_mate_sw: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_KSW_ALIGN2], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_ksw_align2: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_KSW_LOOP], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_ksw_loop: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_KSW_REVERSE], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_ksw_reverse: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_SAM_REG2ALN], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_reg2aln: %0.2lf s\n", avg);
|
||||
|
||||
fprintf(stderr, "time_ksw_loop: %0.2lf s\n", gprof[G_KSW_LOOP] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_ksw_end_loop: %0.2lf s\n", gprof[G_KSW_END_LOOP] * 1.0 / proc_freq);
|
||||
|
||||
fprintf(stderr, "seq num: %ld\n", gdat[0]);
|
||||
fprintf(stderr, "full num: %ld\n", gdat[1]);
|
||||
fprintf(stderr, "percent: %0.2lf%c\n", (double)gdat[1] / gdat[0] * 100, '%');
|
||||
|
||||
fprintf(stderr, "all_match_len: %ld\n", all_match_len);
|
||||
fprintf(stderr, "all_seq_num: %ld\n", all_seq_num);
|
||||
fprintf(stderr, "all_type_hits: %ld\n", all_type_hits);
|
||||
fprintf(stderr, "seed_time: %f\n", seed_time);
|
||||
fprintf(stderr, "all_match_len: %ld\n", get_sum(tprof[T_SEED_LEN], nthreads));
|
||||
|
||||
#define PRINT_SEED_TIME(mark) \
|
||||
find_opt(tprof[T_SEED_##mark], nthreads, &max, &min, &avg); \
|
||||
fprintf(stderr, "time_seed_%s: %0.2lf s %0.2lf s %0.2lf s\n", #mark, max, min, avg);
|
||||
|
||||
#if 1
|
||||
// PRINT_SEED_TIME(1_ALL);
|
||||
// PRINT_SEED_TIME(1_0);
|
||||
// PRINT_SEED_TIME(1_1);
|
||||
// PRINT_SEED_TIME(1_2);
|
||||
// PRINT_SEED_TIME(1_3);
|
||||
PRINT_SEED_TIME(1_3_1);
|
||||
// PRINT_SEED_TIME(1_3_2);
|
||||
// PRINT_SEED_TIME(1_3_3);
|
||||
// PRINT_SEED_TIME(1_3_4);
|
||||
// PRINT_SEED_TIME(1_3_5);
|
||||
// PRINT_SEED_TIME(1_3_6);
|
||||
// PRINT_SEED_TIME(1_3_7);
|
||||
#endif
|
||||
#if 1
|
||||
// PRINT_SEED_TIME(2_ALL);
|
||||
// PRINT_SEED_TIME(2_0);
|
||||
// PRINT_SEED_TIME(2_1);
|
||||
// PRINT_SEED_TIME(2_2);
|
||||
PRINT_SEED_TIME(2_2_0);
|
||||
// PRINT_SEED_TIME(2_2_1);
|
||||
// PRINT_SEED_TIME(2_2_2);
|
||||
// PRINT_SEED_TIME(2_2_3);
|
||||
#endif
|
||||
#if 1
|
||||
// PRINT_SEED_TIME(3_ALL);
|
||||
// PRINT_SEED_TIME(3_0);
|
||||
// PRINT_SEED_TIME(3_1);
|
||||
// PRINT_SEED_TIME(3_2);
|
||||
// PRINT_SEED_TIME(3_3);
|
||||
PRINT_SEED_TIME(3_3_0);
|
||||
// PRINT_SEED_TIME(3_3_1);
|
||||
// PRINT_SEED_TIME(3_3_2);
|
||||
#endif
|
||||
double all = 0;
|
||||
for (i = 0; i < 50; ++i) {
|
||||
//all += sum(tdat[i], nthreads);
|
||||
// fprintf(stderr, "sum %d: %ld\n", i, sum(tdat[i], nthreads));
|
||||
}
|
||||
for (i = 0; i < 50; ++i) {
|
||||
//all += sum(tdat[i], nthreads);
|
||||
// fprintf(stderr, "%d: %f\n", i, sum(tdat[i], nthreads) * 100 / all);
|
||||
}
|
||||
#if 0
|
||||
uint64_t b64 = 0, u64 = 0;
|
||||
for (i = 0; i < 256; ++i) {
|
||||
uint64_t s = sum(t_sd[i], nthreads);
|
||||
if (i < 64)
|
||||
b64 += s;
|
||||
else
|
||||
u64 += s;
|
||||
fprintf(stderr, "addr %d: %ld\n", i, s);
|
||||
}
|
||||
fprintf(stderr, "b64 %ld; u64 %ld\n", b64, u64);
|
||||
#endif
|
||||
// fprintf(stderr, "sum 0: %ld\n", sum(tdat[TD_SEED_1_0], nthreads));
|
||||
// fprintf(stderr, "sum 1: %ld\n", sum(tdat[TD_SEED_1_1], nthreads));
|
||||
// fprintf(stderr, "sum 2: %ld\n", sum(tdat[TD_SEED_1_2], nthreads));
|
||||
// fprintf(stderr, "sum 3: %ld\n", sum(tdat[TD_SEED_1_3], nthreads));
|
||||
// fprintf(stderr, "sum 4: %ld\n", sum(tdat[TD_SEED_1_4], nthreads));
|
||||
// fprintf(stderr, "sum 5: %ld\n", sum(tdat[TD_SEED_1_5], nthreads));
|
||||
// int i;
|
||||
// for (i=0; i<LIM_THREAD_DATA_TYPE; ++i) {
|
||||
// for (i = 1; i <= 132; ++i) {
|
||||
// fprintf(stderr, "len: %d, sum: %ld\n", i, sum(tdat[i], nthreads));
|
||||
// fprintf(stderr, "%ld,\n", sum(tdat[i], nthreads));
|
||||
// }
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
/*
|
||||
Description: profiling related data
|
||||
|
||||
Copyright : All right reserved by ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2024/04/06
|
||||
*/
|
||||
#ifndef PROFILING_H_
|
||||
#define PROFILING_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define USE_RDTSC 1
|
||||
|
||||
#define LIM_THREAD 128
|
||||
#define LIM_THREAD_PROF_TYPE 128
|
||||
#define LIM_GLOBAL_PROF_TYPE 128
|
||||
#define LIM_THREAD_DATA_TYPE 256
|
||||
#define LIM_GLOBAL_DATA_TYPE 256
|
||||
|
||||
#ifdef SHOW_PERF
|
||||
extern uint64_t proc_freq;
|
||||
extern uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD];
|
||||
extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
|
||||
#endif
|
||||
|
||||
#ifdef SHOW_DATA_PERF
|
||||
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD];
|
||||
extern int64_t t_sd[LIM_THREAD_DATA_TYPE][LIM_THREAD];
|
||||
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE];
|
||||
extern int64_t gd1[LIM_GLOBAL_DATA_TYPE];
|
||||
#endif
|
||||
|
||||
#ifdef SHOW_PERF
|
||||
#if USE_RDTSC
|
||||
#define PROF_START(tmp_time) const uint64_t prof_tmp_##tmp_time = __rdtsc()
|
||||
#define PROF_END(result, tmp_time) result += __rdtsc() - prof_tmp_##tmp_time
|
||||
#else
|
||||
#define PROF_START(tmp_time) uint64_t prof_tmp_##tmp_time = realtime_msec()
|
||||
#define PROF_END(result, tmp_time) result += realtime_msec() - prof_tmp_##tmp_time
|
||||
#endif
|
||||
#else
|
||||
#define PROF_START(tmp_time)
|
||||
#define PROF_END(result, tmp_time)
|
||||
#endif
|
||||
|
||||
// tdat
|
||||
enum {
|
||||
TD_SEED_1_0 = 0,
|
||||
TD_SEED_1_1,
|
||||
TD_SEED_1_2,
|
||||
TD_SEED_1_3,
|
||||
TD_SEED_1_4,
|
||||
TD_SEED_1_5,
|
||||
};
|
||||
|
||||
// GLOBAL
|
||||
enum {
|
||||
G_ALL = 0,
|
||||
G_PIPELINE,
|
||||
G_READ,
|
||||
G_WRITE,
|
||||
G_COMPUTE,
|
||||
G_PREPARE,
|
||||
G_LOAD_IDX,
|
||||
G_MEM_PREPARE,
|
||||
G_MEM_KERNEL,
|
||||
G_MEM_PESTAT,
|
||||
G_MEM_SAM,
|
||||
G_KSW_LOOP,
|
||||
G_KSW_END_LOOP,
|
||||
G_read_seq
|
||||
};
|
||||
|
||||
// THREAD
|
||||
enum {
|
||||
T_SEED_ALL = 0,
|
||||
T_CHAIN_ALL,
|
||||
T_ALN_ALL,
|
||||
T_INS_SIZE,
|
||||
T_SEED_1,
|
||||
T_SEED_2,
|
||||
T_SEED_3,
|
||||
T_SAL,
|
||||
T_GEN_CHAIN,
|
||||
T_FLT_CHAIN,
|
||||
T_FLT_CHANNED_SEEDS,
|
||||
T_READ_SA,
|
||||
T_BSW,
|
||||
T_BSW_ALL,
|
||||
T_SAM_MATESW,
|
||||
T_KSW_ALIGN2,
|
||||
T_KSW_REVERSE,
|
||||
T_SAM_REG2ALN,
|
||||
T_KSW_LOOP,
|
||||
T_SEED_LEN,
|
||||
T_SEED_1_ALL,
|
||||
T_SEED_1_0,
|
||||
T_SEED_1_1,
|
||||
T_SEED_1_2,
|
||||
T_SEED_1_3,
|
||||
T_SEED_1_3_1,
|
||||
T_SEED_1_3_2,
|
||||
T_SEED_1_3_3,
|
||||
T_SEED_1_3_4,
|
||||
T_SEED_1_3_5,
|
||||
T_SEED_1_3_6,
|
||||
T_SEED_1_3_7,
|
||||
T_SEED_2_ALL,
|
||||
T_SEED_2_0,
|
||||
T_SEED_2_1,
|
||||
T_SEED_2_2,
|
||||
T_SEED_2_2_0,
|
||||
T_SEED_2_2_1,
|
||||
T_SEED_2_2_2,
|
||||
T_SEED_2_2_3,
|
||||
T_SEED_3_ALL,
|
||||
T_SEED_3_0,
|
||||
T_SEED_3_1,
|
||||
T_SEED_3_2,
|
||||
T_SEED_3_3,
|
||||
T_SEED_3_3_0,
|
||||
T_SEED_3_3_1,
|
||||
T_SEED_3_3_2
|
||||
};
|
||||
|
||||
int display_stats(int);
|
||||
|
||||
#endif
|
||||
4
rle.h
4
rle.h
|
|
@ -4,10 +4,14 @@
|
|||
#include <stdint.h>
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifndef LIKELY
|
||||
#define LIKELY(x) __builtin_expect((x),1)
|
||||
#endif
|
||||
#else
|
||||
#ifndef LIKELY
|
||||
#define LIKELY(x) (x)
|
||||
#endif
|
||||
#endif
|
||||
#ifdef __cplusplus
|
||||
|
||||
extern "C" {
|
||||
|
|
|
|||
67
utils.h
67
utils.h
|
|
@ -27,8 +27,10 @@
|
|||
#ifndef LH3_UTILS_H
|
||||
#define LH3_UTILS_H
|
||||
|
||||
#include <getopt.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <zlib.h>
|
||||
|
||||
#ifdef __GNUC__
|
||||
|
|
@ -47,6 +49,71 @@
|
|||
|
||||
#define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg)
|
||||
|
||||
#define PROG_NAME "hybalign"
|
||||
|
||||
///////////// added for hyb-align /////////////
|
||||
|
||||
#undef MAX
|
||||
#undef MIN
|
||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
|
||||
#undef MAX_PATH
|
||||
#define MAX_PATH 2048
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifndef LIKELY
|
||||
#define LIKELY(x) __builtin_expect((x), 1)
|
||||
#endif
|
||||
#ifndef UNLIKELY
|
||||
#define UNLIKELY(x) __builtin_expect((x), 0)
|
||||
#endif
|
||||
#else
|
||||
#ifndef LIKELY
|
||||
#define LIKELY(x) (x)
|
||||
#endif
|
||||
#ifndef UNLIKELY
|
||||
#define UNLIKELY(x) (x)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define err_check_true(ret_code, right_val) \
|
||||
if ((ret_code) != (right_val)) \
|
||||
err_fatal("err_check", "Value not right: True-Val %d\n", right_val)
|
||||
|
||||
#define err_check_false(ret_code, err_val) \
|
||||
if ((ret_code) == (err_val)) \
|
||||
err_fatal("err_check", "Value not right: False-Val %d\n", err_val)
|
||||
|
||||
#define _load_file_to_data(fn, data) \
|
||||
do { \
|
||||
FILE* fp = NULL; \
|
||||
struct stat st; \
|
||||
err_check_true(stat(fn, &st), 0); \
|
||||
fp = xopen(fn, "r"); \
|
||||
data = (uint8_t*)malloc(st.st_size); \
|
||||
err_fread_noeof(data, 1, st.st_size, fp); \
|
||||
err_fclose(fp); \
|
||||
} while (0)
|
||||
|
||||
#define _load_file_to_data_ps(prefix, suffix, data) \
|
||||
do { \
|
||||
char fn[MAX_PATH]; \
|
||||
sprintf(fn, "%s%s", prefix, suffix); \
|
||||
_load_file_to_data(fn, data); \
|
||||
} while (0)
|
||||
|
||||
#define _store_data_to_file(fn, data, ele_size, nums) \
|
||||
do { \
|
||||
FILE* fp; \
|
||||
fp = xopen(fn, "wb"); \
|
||||
err_fwrite(data, ele_size, nums, fp); \
|
||||
err_fflush(fp); \
|
||||
err_fclose(fp); \
|
||||
} while (0)
|
||||
|
||||
///////////// end of added for hyb-align /////////////
|
||||
|
||||
typedef struct {
|
||||
uint64_t x, y;
|
||||
} pair64_t;
|
||||
|
|
|
|||
Loading…
Reference in New Issue