添加了创建byte_sa和kmer index的代码

This commit is contained in:
zzh 2025-11-07 21:31:11 +08:00
parent b5ee622884
commit 74b464ee4a
20 changed files with 1101 additions and 25 deletions

8
.gitignore vendored
View File

@ -5,7 +5,13 @@ test64
.*.swp
Makefile.bak
bwamem-lite
test_index/
index/
orig_index/
run.sh
debug.sh
hybalign
*.sam
# ---> C
# Prerequisites
*.d

View File

@ -4,13 +4,15 @@ CFLAGS= -g -Wall -Wno-unused-function -O3
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
HYBOBJS= hyb_bwa.o hyb_utils.o hyb_seeding_1.o hyb_seeding_2.o hyb_seeding_3.o hyb_create_idx.o debug.o profiling.o
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
bwape.o kopen.o pemerge.o maxk.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa
bwtsw2_chain.o fastmap.o bwtsw2_pair.o \
$(HYBOBJS)
PROG= hybalign
INCLUDES=
LIBS= -lm -lz -lpthread
SUBDIRS= .
@ -34,7 +36,7 @@ endif
all:$(PROG)
bwa:libbwa.a $(AOBJS) main.o
$(PROG):libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
bwamem-lite:libbwa.a example.o

2
bwa.c
View File

@ -280,7 +280,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
tmp = calloc(strlen(prefix) + 5, 1);
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
strcat(strcpy(tmp, prefix), ".32.sa"); // partial suffix array (SA)
bwt_restore_sa(tmp, bwt);
free(tmp); free(prefix);
return bwt;

213
bwt.c
View File

@ -467,3 +467,216 @@ void bwt_destroy(bwt_t *bwt)
free(bwt->sa); free(bwt->bwt);
free(bwt);
}
////////////////////////////////// new funcs for hyb-align ////////////////////////////////
// 设置某一行的排序值sa的索引有效值从1开始0设置为-1, 小端模式)
void bwt_set_sa(uint8_t* sa_arr, bwtint_t k, bwtint_t val) {
// const bwtint_t block_idx = (k >> 3) * 33;
const bwtint_t block_idx = ((k >> 3) << 5) + (k >> 3); // 8个数为一组共享33个字节
const int val_idx_in_block = k & 7;
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
bwtint_t* sa_addr = (bwtint_t*)(sa_arr + start_byte_idx);
// *sa_addr &= (1 << val_idx_in_block) - 1; //
// 如果开辟内存的时候清零了,这一步可以省略,会清除后面的数据,只适合按递增顺序赋值
*sa_addr |= (val & ((1L << 33) - 1)) << val_idx_in_block;
}
// 获取某一行的排序值(小端模式)
bwtint_t bwt_get_sa(uint8_t* sa_arr, bwtint_t k) {
// const bwtint_t block_idx = (k >> 3) * 33;
const bwtint_t block_idx = ((k >> 3) << 5) + (k >> 3); // 8个数为一组共享33个字节
const int val_idx_in_block = k & 7;
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
bwtint_t val = *(bwtint_t*)(sa_arr + start_byte_idx);
val = (val >> val_idx_in_block) & 8589934591;
return val;
}
/*
* bwtint_t bwt_get_sa(uint8_t* sa_arr, bwtint_t k) {
* const bwtint_t start_byte = ((k << 5) + k) >> 3; // 存储这个sa数据的起始字节
* bwtint_t val = *(bwtint_t*)(sa_arr + start_byte);
* val = (val >> (k & 7)) & 8589934591;
* return val;
* }
*/
// 第一次创建索引时创建不采样的SA(24G)和采样间隔为4的SA(6G)
// bwt->bwt and bwt->occ must be precalculated
void bwt_cal_byte_sa(bwt_t* bwt) {
bwtint_t isa, sa, i, block_size; // S(isa) = sa isa是后缀数组的索引sa表示位置
double tmp_time, elapsed_time;
int intv = 1;
int intv_round = intv; // 间隔多少来保存一个位置信息
kv_roundup32(intv_round);
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
if (bwt->byte_sa)
free(bwt->byte_sa);
bwt->sa_intv = intv;
bwt->n_sa = (bwt->seq_len + intv) / intv;
bwt->byte_sa = (uint8_t*)calloc(SA_BYTES(bwt->n_sa), 1); // 用33位表示位置
fprintf(stderr, "bytes: %ld, sa size: %ld\n", SA_BYTES(bwt->n_sa), bwt->n_sa);
// calculate SA value
isa = 0;
sa = bwt->seq_len;
block_size = bwt->seq_len / 100;
tmp_time = realtime();
for (i = 0; i < bwt->seq_len; ++i) {
if (i % block_size == 0) {
elapsed_time = realtime() - tmp_time;
fprintf(stderr, "%ld%% percent complished. %f s elapsed.\n", i / block_size, elapsed_time);
}
if (isa % intv == 0) {
bwt_set_sa(bwt->byte_sa, isa / intv, sa); // 第一个位置是$,所以位置就是序列长度
if (i % (block_size / 2) == 0) {
fprintf(stderr, "%ld %ld\n", sa, bwt_get_sa(bwt->byte_sa, isa / intv)); // 验证结果正确性
}
}
--sa; // 从后往前一个位置一个位置的找对应的后缀数组isa就是与sa对应的后缀数组排序后在sa数组中的相对位置
isa = bwt_invPsi(bwt, isa); // 下一个后缀数组的相对位置
}
if (isa % intv == 0)
bwt_set_sa(bwt->byte_sa, isa / intv, sa);
// bwt_set_sa(bwt->byte_sa, 0, (bwtint_t)-1); // 赋值成-1也没问题set_sa那里已经修正了
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // 2的33次方-1 before this line, bwt->sa[0] = bwt->seq_len
}
// 保存byte_sa
void bwt_dump_byte_sa(const char* fn, const bwt_t* bwt) {
FILE* fp;
fp = xopen(fn, "wb");
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->byte_sa, 1, SA_BYTES(bwt->n_sa), fp);
err_fflush(fp);
err_fclose(fp);
}
// 设置kmer第pos个碱基对应的fmt匹配信息
void kmer_setval_at(uint8_t* mem_addr, bwtintv_t ik, int pos) {
int byte_idx = pos * 14;
uint8_t* arr = mem_addr + byte_idx;
arr[0] = (uint8_t)(ik.x[0] >> 32);
*((uint32_t*)(arr + 1)) = (uint32_t)ik.x[0];
arr += 5;
arr[0] = (uint8_t)(ik.x[1] >> 32);
*((uint32_t*)(arr + 1)) = (uint32_t)ik.x[1];
arr += 5;
*((uint32_t*)arr) = (uint32_t)ik.x[2];
}
// 获取kmer的bwt匹配信息
void kmer_getval_at(uint8_t* mem_addr, bwtintv_t* ok, int pos) {
bwtint_t x0, x1, x2;
int byte_idx = pos * 14;
uint8_t* arr = mem_addr + byte_idx;
x0 = *arr;
x0 = (x0 << 32) | *((uint32_t*)(arr + 1));
arr += 5;
x1 = *arr;
x1 = (x1 << 32) | *((uint32_t*)(arr + 1));
arr += 5;
x2 = *((uint32_t*)arr);
ok->x[0] = x0;
ok->x[1] = x1;
ok->x[2] = x2;
}
// 获取kmer对应的fmt匹配信息, pos should be [0, 13], qbit一定是14个碱基对应的编码, pos是指在14个碱基中的位置
void bwt_kmer_get(const KmerHash* kmer_hash, bwtintv_t* ok, uint32_t qbit, int pos) {
#if HASH_KMER_LEN == 14
if (pos == 13)
kmer_getval_at(kmer_hash->ke14[qbit].intv_arr, ok, 0);
else if (pos == 12)
kmer_getval_at(kmer_hash->ke13[qbit >> 2].intv_arr, ok, 0);
else if (pos == 11)
kmer_getval_at(kmer_hash->ke12[qbit >> 4].intv_arr, ok, 0);
else if (pos == 10) // 其实对应kmer长度为10
kmer_getval_at(kmer_hash->ke11[qbit >> 6].intv_arr, ok, 0);
else
kmer_getval_at(kmer_hash->ke10[qbit >> 8].intv_arr, ok, pos);
#elif HASH_KMER_LEN == 13
if (pos == 12)
kmer_getval_at(kmer_hash->ke13[qbit].intv_arr, ok, 0);
else if (pos == 11)
kmer_getval_at(kmer_hash->ke12[qbit >> 2].intv_arr, ok, 0);
else if (pos == 10)
kmer_getval_at(kmer_hash->ke11[qbit >> 4].intv_arr, ok, 0);
else
kmer_getval_at(kmer_hash->ke10[qbit >> 6].intv_arr, ok, pos);
#else
if (pos == 11)
kmer_getval_at(kmer_hash->ke12[qbit].intv_arr, ok, 0);
else if (pos == 10)
kmer_getval_at(kmer_hash->ke11[qbit >> 2].intv_arr, ok, 0);
else
kmer_getval_at(kmer_hash->ke10[qbit >> 4].intv_arr, ok, pos);
#endif
}
// 只是用来计算kmer-index所以q中只包含A/C/G/T不包含N
bwtintv_t bwt_forward_search(bwt_t* bwt, const uint8_t* q, int qlen) {
bwtintv_t ik, ok[4];
int i, c1;
bwt_set_intv(bwt, q[0], ik);
// 每次扩展一个碱基
for (i = 1; i < qlen; ++i) {
c1 = 3 - q[i];
bwt_extend(bwt, &ik, ok, 0);
ik = ok[c1];
}
ik.info = qlen;
return ik;
}
// 将kmer hash数据写入到文件
void bwt_dump_kmer_idx(const char* fn, const KmerHash* kh) {
FILE* fp;
fp = xopen(fn, "wb");
err_fwrite(kh->ke10, 1, (1 << (10 << 1)) * sizeof(KmerEntryArr), fp);
err_fwrite(kh->ke11, 1, (1 << (11 << 1)) * sizeof(KmerEntry), fp);
err_fwrite(kh->ke12, 1, (1 << (12 << 1)) * sizeof(KmerEntry), fp);
#if HASH_KMER_LEN > 12
err_fwrite(kh->ke13, 1, (1 << (13 << 1)) * sizeof(KmerEntry), fp);
#endif
#if HASH_KMER_LEN > 13
err_fwrite(kh->ke14, 1, (1 << (14 << 1)) * sizeof(KmerEntry), fp);
#endif
err_fflush(fp);
err_fclose(fp);
}
// 从文件中读取kmer hash信息
KmerHash bwt_restore_kmer_idx(const char* fn) {
FILE* fp = xopen(fn, "rb");
KmerHash khash;
KmerHash* kh = &khash;
#define LOAD_KMER_HASH(kmer_len) \
do { \
len = 1 << (kmer_len << 1); \
kh->ke##kmer_len = (KmerEntry*)malloc(len * sizeof(KmerEntry)); \
fread_fix(fp, len * sizeof(KmerEntry), kh->ke##kmer_len); \
} while (0)
int len = 1 << (10 << 1);
kh->ke10 = (KmerEntryArr*)malloc(len * sizeof(KmerEntryArr));
fread_fix(fp, len * sizeof(KmerEntryArr), kh->ke10);
LOAD_KMER_HASH(11);
LOAD_KMER_HASH(12);
#if HASH_KMER_LEN > 12
LOAD_KMER_HASH(13);
#endif
#if HASH_KMER_LEN > 13
LOAD_KMER_HASH(14);
#endif
err_fclose(fp);
return khash;
}

39
bwt.h
View File

@ -45,6 +45,28 @@ typedef unsigned char ubyte_t;
typedef uint64_t bwtint_t;
#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8) /* 8字节对齐*/
#define HASH_KMER_LEN 14
// 用来保存kmer对应的bwt的位置信息
typedef struct {
// 40+40+32 14个byte这样好处理
uint8_t intv_arr[14]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
} KmerEntry;
typedef struct {
uint8_t intv_arr[140]; // 保存长度为10的kmer每个碱基对应的bwt匹配信息
} KmerEntryArr;
// 保存各个位置对应的bwt匹配信息
typedef struct {
KmerEntryArr* ke10;
KmerEntry* ke11;
KmerEntry* ke12;
KmerEntry* ke13;
KmerEntry* ke14;
} KmerHash;
typedef struct {
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t L2[5]; // C(), cumulative count
@ -57,6 +79,8 @@ typedef struct {
int sa_intv;
bwtint_t n_sa;
bwtint_t *sa;
uint8_t* byte_sa; // byte-based Suffix-Array
KmerHash kmer_hash; // 保存kmer对应的bwt位置信息
} bwt_t;
typedef struct {
@ -125,8 +149,21 @@ extern "C" {
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
// new funcs for hyb-align
void bwt_cal_byte_sa(bwt_t* bwt);
void bwt_set_sa(uint8_t* sa_arr, bwtint_t k, bwtint_t val);
bwtint_t bwt_get_sa(uint8_t* sa_arr, bwtint_t k);
void kmer_getval_at(uint8_t* mem_addr, bwtintv_t* ok, int pos);
void kmer_setval_at(uint8_t* mem_addr, bwtintv_t ik, int pos);
void bwt_kmer_get(const KmerHash* kmer_hash, bwtintv_t* ok, uint32_t qbit, int pos);
void bwt_dump_byte_sa(const char *fn, const bwt_t *bwt);
bwtintv_t bwt_forward_search(bwt_t* bwt, const uint8_t* q, int qlen);
void bwt_dump_kmer_idx(const char* fn, const KmerHash* kh);
KmerHash bwt_restore_kmer_idx(const char* fn);
#ifdef __cplusplus
}
}
#endif
#endif

View File

@ -269,6 +269,16 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
t = clock();
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 0);
uint64_t seq_len = l_pac >> 1;
strcpy(str, prefix); strcat(str, ".ref-len");
FILE* f_len = fopen(str, "w");
fprintf(f_len, "%lu\n", seq_len);
fclose(f_len);
memset(str, 0, strlen(prefix) + 10);
//fprintf(stderr, "len: %ld, %ld\n", l_pac, seq_len);
//exit(0);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
}

66
debug.c 100644
View File

@ -0,0 +1,66 @@
/*********************************************************************************************
Description: data and files for debugging
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/09
***********************************************************************************************/
#include <stdio.h>
#include "debug.h"
FILE *gf[4] = {0},
*gfq[4] = {0},
*gft[4] = {0},
*gfi[4] = {0};
uint64_t debug_num = 0;
uint64_t all_match_len = 0;
uint64_t all_seq_num = 0;
uint64_t all_type_hits = 0;
double seed_time = 0;
int open_qti_files()
{
char fn[1024] = {0};
int i = 0;
for (; i < 4; ++i)
{
sprintf(fn, "./output/q%d.txt", i);
gfq[i] = fopen(fn, "w");
sprintf(fn, "./output/t%d.txt", i);
gft[i] = fopen(fn, "w");
sprintf(fn, "./output/i%d.txt", i);
gfi[i] = fopen(fn, "w");
}
return 0;
}
int open_debug_files()
{
char fn[1024] = {0};
int i = 0;
for (; i < 4; ++i)
{
sprintf(fn, "./output/fp%d.txt", i);
gf[i] = fopen(fn, "w");
}
return 0;
}
int close_files()
{
int i = 0;
for (; i < 4; ++i)
{
if (gf[i] != 0)
fclose(gf[i]);
if (gfq[i] != 0)
fclose(gfq[i]);
if (gft[i] != 0)
fclose(gft[i]);
if (gfi[i] != 0)
fclose(gfi[i]);
}
return 0;
}

35
debug.h 100644
View File

@ -0,0 +1,35 @@
/*********************************************************************************************
Description: data and files for debugging
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/09
***********************************************************************************************/
#include <stdio.h>
#include <stdint.h>
////////////////// for debug and test //////////////////////////
#define DEBUG_FILE_OUTPUT // 打开gfp1-4文件并记录debug信息
// #define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度以及最终扩展的长度
// #define GET_FULL_MATCH_READ // 获取完全匹配的reads
// #define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量
// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的querytarget和其他用于计算的数据
// #define GET_KSW_ALIGN_SEQ
// #define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里
////////////////////////////////////////////////////////////////
extern FILE *gf[4], *gfq[4], *gft[4], *gfi[4];
extern uint64_t debug_num;
extern uint64_t all_match_len;
extern uint64_t all_seq_num;
extern uint64_t all_type_hits;
extern double seed_time;
int open_qti_files();
int open_debug_files();
int close_files();

265
hyb_bwa.c 100644
View File

@ -0,0 +1,265 @@
/* The MIT License
Copyright (c) 2025- Zhonghai Zhang (ICT)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <zlib.h>
#include "bntseq.h"
#include "bwa.h"
#include "bwt.h"
#include "rle.h"
#include "rope.h"
#include "utils.h"
#include "kvec.h"
#include "hyb_idx.h"
#ifdef _DIVBWT
#include "divsufsort.h"
#endif
#ifdef USE_MALLOC_WRAPPERS
#include "malloc_wrap.h"
#endif
int bwa_bwt2fullbytesa(int argc, char* argv[]) // the "bwt2bytesa" command
{
bwt_t* bwt;
if (optind + 2 > argc) {
fprintf(stderr, "Usage: %s bwt2bytesa <in.bwt> <out.bytesa>\n", PROG_NAME);
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
bwt_cal_byte_sa(bwt);
_store_data_to_file(argv[optind + 1], bwt->byte_sa, sizeof(bwtint_t), SA_BYTES(bwt->n_sa) >> 3);
return 0;
}
// extract suffix array from non-sampled suffix array
int bwa_extract_sa(int argc, char* argv[]) {
bwt_t* bwt;
int c, sa_intv = 32;
while ((c = getopt(argc, argv, "i:")) >= 0) {
switch (c) {
case 'i':
sa_intv = atoi(optarg);
break;
default:
return 1;
}
}
if (optind + 3 > argc) {
fprintf(stderr, "Usage: bwa extractsa [-i %d] <in.bwt> <in.full_bytesa> <out.sa>\n", sa_intv);
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
fprintf(stderr, "bwt seq_len %ld, intv: %d\n", bwt->seq_len, sa_intv);
//exit(0);
_load_file_to_data(argv[optind + 1], bwt->byte_sa);
if (bwt->sa) free(bwt->sa);
bwt->sa_intv = sa_intv;
bwt->n_sa = (bwt->seq_len + sa_intv) / sa_intv;
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
uint64_t i;
for (i = 0; i < bwt->seq_len; ++i) {
if (i % sa_intv == 0)
bwt->sa[i / sa_intv] = bwt_get_sa(bwt->byte_sa, i);
}
if (i % sa_intv == 0)
bwt->sa[i / sa_intv] = bwt_get_sa(bwt->byte_sa, i);
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
// bwt_cal_sa(bwt, sa_intv);
bwt_dump_sa(argv[optind + 2], bwt);
// bwt_destroy(bwt);
return 0;
}
// extract byte suffix array from non-sampled suffix array
int bwa_extract_byte_sa(int argc, char* argv[]) {
bwt_t* bwt;
int c, sa_intv = 32;
while ((c = getopt(argc, argv, "i:")) >= 0) {
switch (c) {
case 'i':
sa_intv = atoi(optarg);
break;
default:
return 1;
}
}
if (optind + 3 > argc) {
fprintf(stderr, "Usage: bwa extractbytesa [-i %d] <in.bwt> <in.full_bytesa> <out.bytesa>\n", sa_intv);
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
// fprintf(stderr, "bwt seq_len %ld, intv: %d\n", bwt->seq_len, sa_intv);
// exit(0);
uint8_t *full_byte_sa = NULL;
_load_file_to_data(argv[optind + 1], full_byte_sa);
bwt->sa_intv = sa_intv;
bwt->n_sa = (bwt->seq_len + sa_intv) / sa_intv;
bwt->byte_sa = (uint8_t*)calloc(SA_BYTES(bwt->n_sa), 1);
uint64_t i;
for (i = 0; i < bwt->seq_len; ++i) {
if (i % sa_intv == 0)
bwt_set_sa(bwt->byte_sa, i / sa_intv, bwt_get_sa(full_byte_sa, i));
}
if (i % sa_intv == 0)
bwt_set_sa(bwt->byte_sa, i / sa_intv, bwt_get_sa(full_byte_sa, i));
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
bwt_dump_byte_sa(argv[optind + 2], bwt);
// bwt_destroy(bwt);
return 0;
}
// 生成所有KMER_LEN长度的序列字节表示(0A,1C,2G,3T)
void gen_kmer_bits(uint8_t** seq_arr, uint64_t* kmer_arr_size, int kmer_len) {
uint64_t i;
uint64_t seq_up_val = (1L << (kmer_len << 1));
*kmer_arr_size = seq_up_val;
*seq_arr = (uint8_t*)realloc(*seq_arr, seq_up_val * (uint64_t)kmer_len);
for (i = 0; i < seq_up_val; ++i) {
const uint64_t base_idx = i * kmer_len;
for (int j = kmer_len - 1; j >= 0; --j) {
(*seq_arr)[base_idx + kmer_len - 1 - j] = (i >> (j << 1)) & 3;
}
}
}
void bwt_create_kmer_idx(bwt_t* bwt) {
uint64_t kmer_arr_size = 0;
uint8_t* seq_arr = 0;
bwtintv_t ik, ok[4];
uint64_t j;
int i, c1;
int qlen = 10;
KmerHash* kh = &bwt->kmer_hash;
kh->ke10 = (KmerEntryArr*)malloc((1 << (10 << 1)) * sizeof(KmerEntryArr));
kh->ke11 = (KmerEntry*)malloc((1 << (11 << 1)) * sizeof(KmerEntry));
kh->ke12 = (KmerEntry*)malloc((1 << (12 << 1)) * sizeof(KmerEntry));
kh->ke13 = (KmerEntry*)malloc((1 << (13 << 1)) * sizeof(KmerEntry));
kh->ke14 = (KmerEntry*)malloc((1 << (14 << 1)) * sizeof(KmerEntry));
// 长度为10的kmer
gen_kmer_bits(&seq_arr, &kmer_arr_size, 10);
for (j = 0; j < kmer_arr_size; ++j) {
uint8_t* q = &seq_arr[j * 10];
uint8_t* mem_addr = kh->ke10[j].intv_arr;
bwt_set_intv(bwt, q[0], ik);
kmer_setval_at(mem_addr, ik, 0);
// 每次扩展一个碱基
for (i = 1; i < qlen; ++i) {
c1 = 3 - q[i];
bwt_extend(bwt, &ik, ok, 0);
ik = ok[c1];
kmer_setval_at(mem_addr, ik, i);
}
}
fprintf(stderr, "Kmer idx for length 10 done.\n");
#define GEN_KMER_FOR_LEN(len) \
do { \
gen_kmer_bits(&seq_arr, &kmer_arr_size, (len)); \
for (j = 0; j < kmer_arr_size; ++j) { \
bwtintv_t p = bwt_forward_search(bwt, &seq_arr[j * (len)], (len)); \
kmer_setval_at(bwt->kmer_hash.ke##len[j].intv_arr, p, 0); \
} \
fprintf(stderr, "Kmer idx for length %d done.\n", len); \
} while (0)
GEN_KMER_FOR_LEN(11); // 长度11的kmer
GEN_KMER_FOR_LEN(12);
GEN_KMER_FOR_LEN(13);
GEN_KMER_FOR_LEN(14);
free(seq_arr);
}
int bwa_bwt2kmer(int argc, char* argv[]) {
bwt_t* bwt;
if (optind + 2 > argc) {
fprintf(stderr, "Usage: bwa extractbytesa <in.bwt> <out.kmer>\n");
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
bwt_create_kmer_idx(bwt);
bwt_dump_kmer_idx(argv[optind + 1], &bwt->kmer_hash);
return 0;
}
int bwa_bwt2hyb(int argc, char* argv[]) {}
// 创建正向的kmer
inline uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) {
uint64_t qbit = 0, i;
qlen = qlen < kmer_len ? qlen : kmer_len;
for (i = 0; i < qlen; ++i) {
if (q[i] > 3)
break; // 要考虑碱基是N
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
}
*base_consumed = i;
return qbit;
}
int hyb_test(int argc, char* argv[]) {
bwt_t* bwt;
if (optind + 1 > argc) {
fprintf(stderr, "Usage: bwa hybtest <idx_prefix>\n");
return 1;
}
char fn[MAX_PATH];
snprintf(fn, MAX_PATH, "%s.bwt", argv[optind]);
bwt = bwt_restore_bwt(fn);
snprintf(fn, MAX_PATH, "%s.hyb.kmer", argv[optind]);
bwt->kmer_hash = bwt_restore_kmer_idx(fn);
uint8_t test_seq[14] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1}; // ACGTACGTACGTAC
bwtintv_t ok1, ok2;
ok1 = bwt_forward_search(bwt, test_seq, 14);
int base_consumed = 0;
uint64_t qbit = build_forward_kmer(test_seq, 14, 14, &base_consumed);
bwt_kmer_get(&bwt->kmer_hash, &ok2, qbit, 13);
fprintf(stderr, "forward search: [%lu, %lu, %lu), kmer idx: [%lu, %lu, %lu)\n", ok1.x[0], ok1.x[1], ok1.x[2], ok2.x[0], ok2.x[1], ok2.x[2]);
return 0;
}

11
hyb_idx.h 100644
View File

@ -0,0 +1,11 @@
#pragma once
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include "debug.h"
#include "kvec.h"
#include "utils.h"

0
hyb_seeding_1.c 100644
View File

0
hyb_seeding_2.c 100644
View File

0
hyb_seeding_3.c 100644
View File

0
hyb_utils.c 100644
View File

47
main.c
View File

@ -54,8 +54,12 @@ int main_pemerge(int argc, char *argv[]);
int main_maxk(int argc, char *argv[]);
int bwa_bwt2kmer(int argc, char* argv[]); // create kmer-index from bwt
int bwa_bwt2bytesa(int argc, char* argv[]); // create byte-based Suffix-Array
int bwa_bwt2fullbytesa(int argc, char* argv[]); // create full byte-based Suffix-Array
int bwa_bwt2hyb(int argc, char* argv[]); // create hybrid-index
int bwa_extract_sa(int argc, char* argv[]); // extract suffix array from non-sampled suffix array
int bwa_extract_byte_sa(int argc, char* argv[]); // extract suffix array from non-sampled suffix array
int hyb_test(int argc, char* argv[]); // for test
static int usage()
{
@ -64,24 +68,26 @@ static int usage()
fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
fprintf(stderr, "Contact: Heng Li <hli@ds.dfci.harvard.edu>\n\n");
fprintf(stderr, "Usage: bwa <command> [options]\n\n");
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
fprintf(stderr, " mem BWA-MEM algorithm\n");
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
fprintf(stderr, " aln gapped/ungapped alignment\n");
fprintf(stderr, " samse generate alignment (single ended)\n");
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries (DEPRECATED)\n");
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
fprintf(stderr, " mem BWA-MEM algorithm\n");
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
fprintf(stderr, " aln gapped/ungapped alignment\n");
fprintf(stderr, " samse generate alignment (single ended)\n");
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries (DEPRECATED)\n");
fprintf(stderr, "\n");
fprintf(stderr, " shm manage indices in shared memory\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
fprintf(stderr, " bwt2bytesa generate SA(using byte array) from BWT and Occ\n");
fprintf(stderr, " bwt2kmer generate kmer hash index from bwt to accelarate the first 14 bases in seeding process.\n");
fprintf(stderr, " bwt2hyb generate hybrid index from BWT\n");
fprintf(stderr, " shm manage indices in shared memory\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
fprintf(stderr, " bwt2fullbytesa generate SA(using byte array) from BWT and Occ\n");
fprintf(stderr, " bwt2kmer generate kmer hash index from bwt to accelarate the first 14 bases in seeding process.\n");
fprintf(stderr, " bwt2hyb generate hybrid index from BWT\n");
fprintf(stderr, " extractsa generate sa from full byte suffix array\n");
fprintf(stderr, " extractbytesa generate byte sa from full byte suffix array\n");
fprintf(stderr, "\n");
fprintf(stderr,
"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
@ -119,9 +125,12 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
else if (strcmp(argv[1], "maxk") == 0) ret = main_maxk(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc - 1, argv + 1);
else if (strcmp(argv[1], "bwt2fullbytesa") == 0) ret = bwa_bwt2fullbytesa(argc - 1, argv + 1);
else if (strcmp(argv[1], "bwt2kmer") == 0) ret = bwa_bwt2kmer(argc - 1, argv + 1);
else if (strcmp(argv[1], "bwt2hyb") == 0) ret = bwa_bwt2hyb(argc - 1, argv + 1);
else if (strcmp(argv[1], "extractsa") == 0) ret = bwa_extract_sa(argc - 1, argv + 1);
else if (strcmp(argv[1], "extractbytesa") == 0) ret = bwa_extract_byte_sa(argc - 1, argv + 1);
else if (strcmp(argv[1], "hybtest") == 0) ret = hyb_test(argc - 1, argv + 1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;

221
profiling.c 100644
View File

@ -0,0 +1,221 @@
/*
Description: profiling related data
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2024/04/06
*/
#include <stdio.h>
#include "utils.h"
#include "profiling.h"
#include "debug.h"
#ifdef SHOW_PERF
uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD] = {0};
uint64_t proc_freq = 1000;
uint64_t gprof[LIM_GLOBAL_PROF_TYPE] = {0};
#endif
#ifdef SHOW_DATA_PERF
/*
tdat[0]: read nums
tdat[1]: seed-1 full match nums
*/
int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
int64_t t_sd[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0};
int64_t gd1[LIM_GLOBAL_DATA_TYPE] = {0};
#endif
int64_t sum(int64_t *a, int len) {
int64_t res = 0;
int i = 0;
for (i=0; i<len; ++i) {
res += a[i];
}
return res;
}
int find_opt(uint64_t *a, int len, double *max, double *min, double *avg)
{
int i = 0;
uint64_t umax = 0, umin = UINT64_MAX, uavg = 0;
for (i = 0; i < len; i++)
{
if (a[i] > umax) umax = a[i];
if (a[i] < umin) umin = a[i];
uavg += a[i];
}
*avg = uavg * 1.0 / len / proc_freq;
*max = umax * 1.0 / proc_freq;
*min = umin * 1.0 / proc_freq;
return 1;
}
uint64_t get_sum(uint64_t *a, int len) {
int i = 0;
uint64_t all = 0;
for (i = 0; i < len; i++) {
all += a[i];
}
return all;
}
int display_stats(int nthreads)
{
#ifdef SHOW_PERF
double avg, max, min;
int i;
fprintf(stderr, "[steps in main_mem]\n");
fprintf(stderr, "time_parse_arg: %0.2lf s\n", gprof[G_PREPARE] * 1.0 / proc_freq);
fprintf(stderr, "time_load_idx: %0.2lf s\n", gprof[G_LOAD_IDX] * 1.0 / proc_freq);
fprintf(stderr, "time_pipeline: %0.2lf s\n", gprof[G_PIPELINE] * 1.0 / proc_freq);
fprintf(stderr, "time_all: %0.2lf s\n", gprof[G_ALL] * 1.0 / proc_freq);
fprintf(stderr, "\n[steps in pipeline]\n");
fprintf(stderr, "time_read: %0.2lf s\n", gprof[G_READ] * 1.0 / proc_freq);
fprintf(stderr, "time_compute: %0.2lf s\n", gprof[G_COMPUTE] * 1.0 / proc_freq);
fprintf(stderr, "time_write: %0.2lf s\n", gprof[G_WRITE] * 1.0 / proc_freq);
fprintf(stderr, "\n[steps in mem_process_seqs]\n");
fprintf(stderr, "time_mem_prepare: %0.2lf s\n", gprof[G_MEM_PREPARE] * 1.0 / proc_freq);
fprintf(stderr, "time_mem_kernel: %0.2lf s\n", gprof[G_MEM_KERNEL] * 1.0 / proc_freq);
fprintf(stderr, "time_mem_pestat: %0.2lf s\n", gprof[G_MEM_PESTAT] * 1.0 / proc_freq);
fprintf(stderr, "time_mem_sam: %0.2lf s\n", gprof[G_MEM_SAM] * 1.0 / proc_freq);
fprintf(stderr, "\n[steps in kernel]\n");
find_opt(tprof[T_SEED_ALL], nthreads, &max, &min, &avg);
fprintf(stderr, "time_seed_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min);
find_opt(tprof[T_CHAIN_ALL], nthreads, &max, &min, &avg);
fprintf(stderr, "time_chain_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min);
find_opt(tprof[T_ALN_ALL], nthreads, &max, &min, &avg);
fprintf(stderr, "time_aln_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min);
find_opt(tprof[T_INS_SIZE], nthreads, &max, &min, &avg);
fprintf(stderr, "time_ins_size_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min);
fprintf(stderr, "\n[steps in seeding]\n");
find_opt(tprof[T_SEED_1], nthreads, &max, &min, &avg);
fprintf(stderr, "time_seed_1: %0.2lf s %0.2lf s %0.2lf s\n", max, min, avg);
find_opt(tprof[T_SEED_2], nthreads, &max, &min, &avg);
fprintf(stderr, "time_seed_2: %0.2lf s\n", avg);
find_opt(tprof[T_SEED_3], nthreads, &max, &min, &avg);
fprintf(stderr, "time_seed_3: %0.2lf s\n", avg);
fprintf(stderr, "\n[steps in chain]\n");
find_opt(tprof[T_GEN_CHAIN], nthreads, &max, &min, &avg);
fprintf(stderr, "time_gen_chain: %0.2lf s\n", avg);
find_opt(tprof[T_FLT_CHAIN], nthreads, &max, &min, &avg);
fprintf(stderr, "time_flt_chain: %0.2lf s\n", avg);
find_opt(tprof[T_FLT_CHANNED_SEEDS], nthreads, &max, &min, &avg);
fprintf(stderr, "time_flt_chained_seeds: %0.2lf s\n", avg);
find_opt(tprof[T_SAL], nthreads, &max, &min, &avg);
fprintf(stderr, "time_sal: %0.2lf s\n", avg);
find_opt(tprof[T_BSW], nthreads, &max, &min, &avg);
fprintf(stderr, "time_bsw: %0.2lf s\n", avg);
fprintf(stderr, "\n[steps in gen sam]\n");
find_opt(tprof[T_SAM_MATESW], nthreads, &max, &min, &avg);
fprintf(stderr, "time_mate_sw: %0.2lf s\n", avg);
find_opt(tprof[T_KSW_ALIGN2], nthreads, &max, &min, &avg);
fprintf(stderr, "time_ksw_align2: %0.2lf s\n", avg);
find_opt(tprof[T_KSW_LOOP], nthreads, &max, &min, &avg);
fprintf(stderr, "time_ksw_loop: %0.2lf s\n", avg);
find_opt(tprof[T_KSW_REVERSE], nthreads, &max, &min, &avg);
fprintf(stderr, "time_ksw_reverse: %0.2lf s\n", avg);
find_opt(tprof[T_SAM_REG2ALN], nthreads, &max, &min, &avg);
fprintf(stderr, "time_reg2aln: %0.2lf s\n", avg);
fprintf(stderr, "time_ksw_loop: %0.2lf s\n", gprof[G_KSW_LOOP] * 1.0 / proc_freq);
fprintf(stderr, "time_ksw_end_loop: %0.2lf s\n", gprof[G_KSW_END_LOOP] * 1.0 / proc_freq);
fprintf(stderr, "seq num: %ld\n", gdat[0]);
fprintf(stderr, "full num: %ld\n", gdat[1]);
fprintf(stderr, "percent: %0.2lf%c\n", (double)gdat[1] / gdat[0] * 100, '%');
fprintf(stderr, "all_match_len: %ld\n", all_match_len);
fprintf(stderr, "all_seq_num: %ld\n", all_seq_num);
fprintf(stderr, "all_type_hits: %ld\n", all_type_hits);
fprintf(stderr, "seed_time: %f\n", seed_time);
fprintf(stderr, "all_match_len: %ld\n", get_sum(tprof[T_SEED_LEN], nthreads));
#define PRINT_SEED_TIME(mark) \
find_opt(tprof[T_SEED_##mark], nthreads, &max, &min, &avg); \
fprintf(stderr, "time_seed_%s: %0.2lf s %0.2lf s %0.2lf s\n", #mark, max, min, avg);
#if 1
// PRINT_SEED_TIME(1_ALL);
// PRINT_SEED_TIME(1_0);
// PRINT_SEED_TIME(1_1);
// PRINT_SEED_TIME(1_2);
// PRINT_SEED_TIME(1_3);
PRINT_SEED_TIME(1_3_1);
// PRINT_SEED_TIME(1_3_2);
// PRINT_SEED_TIME(1_3_3);
// PRINT_SEED_TIME(1_3_4);
// PRINT_SEED_TIME(1_3_5);
// PRINT_SEED_TIME(1_3_6);
// PRINT_SEED_TIME(1_3_7);
#endif
#if 1
// PRINT_SEED_TIME(2_ALL);
// PRINT_SEED_TIME(2_0);
// PRINT_SEED_TIME(2_1);
// PRINT_SEED_TIME(2_2);
PRINT_SEED_TIME(2_2_0);
// PRINT_SEED_TIME(2_2_1);
// PRINT_SEED_TIME(2_2_2);
// PRINT_SEED_TIME(2_2_3);
#endif
#if 1
// PRINT_SEED_TIME(3_ALL);
// PRINT_SEED_TIME(3_0);
// PRINT_SEED_TIME(3_1);
// PRINT_SEED_TIME(3_2);
// PRINT_SEED_TIME(3_3);
PRINT_SEED_TIME(3_3_0);
// PRINT_SEED_TIME(3_3_1);
// PRINT_SEED_TIME(3_3_2);
#endif
double all = 0;
for (i = 0; i < 50; ++i) {
//all += sum(tdat[i], nthreads);
// fprintf(stderr, "sum %d: %ld\n", i, sum(tdat[i], nthreads));
}
for (i = 0; i < 50; ++i) {
//all += sum(tdat[i], nthreads);
// fprintf(stderr, "%d: %f\n", i, sum(tdat[i], nthreads) * 100 / all);
}
#if 0
uint64_t b64 = 0, u64 = 0;
for (i = 0; i < 256; ++i) {
uint64_t s = sum(t_sd[i], nthreads);
if (i < 64)
b64 += s;
else
u64 += s;
fprintf(stderr, "addr %d: %ld\n", i, s);
}
fprintf(stderr, "b64 %ld; u64 %ld\n", b64, u64);
#endif
// fprintf(stderr, "sum 0: %ld\n", sum(tdat[TD_SEED_1_0], nthreads));
// fprintf(stderr, "sum 1: %ld\n", sum(tdat[TD_SEED_1_1], nthreads));
// fprintf(stderr, "sum 2: %ld\n", sum(tdat[TD_SEED_1_2], nthreads));
// fprintf(stderr, "sum 3: %ld\n", sum(tdat[TD_SEED_1_3], nthreads));
// fprintf(stderr, "sum 4: %ld\n", sum(tdat[TD_SEED_1_4], nthreads));
// fprintf(stderr, "sum 5: %ld\n", sum(tdat[TD_SEED_1_5], nthreads));
// int i;
// for (i=0; i<LIM_THREAD_DATA_TYPE; ++i) {
// for (i = 1; i <= 132; ++i) {
// fprintf(stderr, "len: %d, sum: %ld\n", i, sum(tdat[i], nthreads));
// fprintf(stderr, "%ld,\n", sum(tdat[i], nthreads));
// }
fprintf(stderr, "\n");
#endif
return 1;
}

130
profiling.h 100644
View File

@ -0,0 +1,130 @@
/*
Description: profiling related data
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2024/04/06
*/
#ifndef PROFILING_H_
#define PROFILING_H_
#include <stdint.h>
#define USE_RDTSC 1
#define LIM_THREAD 128
#define LIM_THREAD_PROF_TYPE 128
#define LIM_GLOBAL_PROF_TYPE 128
#define LIM_THREAD_DATA_TYPE 256
#define LIM_GLOBAL_DATA_TYPE 256
#ifdef SHOW_PERF
extern uint64_t proc_freq;
extern uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD];
extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
#endif
#ifdef SHOW_DATA_PERF
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD];
extern int64_t t_sd[LIM_THREAD_DATA_TYPE][LIM_THREAD];
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE];
extern int64_t gd1[LIM_GLOBAL_DATA_TYPE];
#endif
#ifdef SHOW_PERF
#if USE_RDTSC
#define PROF_START(tmp_time) const uint64_t prof_tmp_##tmp_time = __rdtsc()
#define PROF_END(result, tmp_time) result += __rdtsc() - prof_tmp_##tmp_time
#else
#define PROF_START(tmp_time) uint64_t prof_tmp_##tmp_time = realtime_msec()
#define PROF_END(result, tmp_time) result += realtime_msec() - prof_tmp_##tmp_time
#endif
#else
#define PROF_START(tmp_time)
#define PROF_END(result, tmp_time)
#endif
// tdat
enum {
TD_SEED_1_0 = 0,
TD_SEED_1_1,
TD_SEED_1_2,
TD_SEED_1_3,
TD_SEED_1_4,
TD_SEED_1_5,
};
// GLOBAL
enum {
G_ALL = 0,
G_PIPELINE,
G_READ,
G_WRITE,
G_COMPUTE,
G_PREPARE,
G_LOAD_IDX,
G_MEM_PREPARE,
G_MEM_KERNEL,
G_MEM_PESTAT,
G_MEM_SAM,
G_KSW_LOOP,
G_KSW_END_LOOP,
G_read_seq
};
// THREAD
enum {
T_SEED_ALL = 0,
T_CHAIN_ALL,
T_ALN_ALL,
T_INS_SIZE,
T_SEED_1,
T_SEED_2,
T_SEED_3,
T_SAL,
T_GEN_CHAIN,
T_FLT_CHAIN,
T_FLT_CHANNED_SEEDS,
T_READ_SA,
T_BSW,
T_BSW_ALL,
T_SAM_MATESW,
T_KSW_ALIGN2,
T_KSW_REVERSE,
T_SAM_REG2ALN,
T_KSW_LOOP,
T_SEED_LEN,
T_SEED_1_ALL,
T_SEED_1_0,
T_SEED_1_1,
T_SEED_1_2,
T_SEED_1_3,
T_SEED_1_3_1,
T_SEED_1_3_2,
T_SEED_1_3_3,
T_SEED_1_3_4,
T_SEED_1_3_5,
T_SEED_1_3_6,
T_SEED_1_3_7,
T_SEED_2_ALL,
T_SEED_2_0,
T_SEED_2_1,
T_SEED_2_2,
T_SEED_2_2_0,
T_SEED_2_2_1,
T_SEED_2_2_2,
T_SEED_2_2_3,
T_SEED_3_ALL,
T_SEED_3_0,
T_SEED_3_1,
T_SEED_3_2,
T_SEED_3_3,
T_SEED_3_3_0,
T_SEED_3_3_1,
T_SEED_3_3_2
};
int display_stats(int);
#endif

4
rle.h
View File

@ -4,10 +4,14 @@
#include <stdint.h>
#ifdef __GNUC__
#ifndef LIKELY
#define LIKELY(x) __builtin_expect((x),1)
#endif
#else
#ifndef LIKELY
#define LIKELY(x) (x)
#endif
#endif
#ifdef __cplusplus
extern "C" {

67
utils.h
View File

@ -27,8 +27,10 @@
#ifndef LH3_UTILS_H
#define LH3_UTILS_H
#include <getopt.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>
#include <zlib.h>
#ifdef __GNUC__
@ -47,6 +49,71 @@
#define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg)
#define PROG_NAME "hybalign"
///////////// added for hyb-align /////////////
#undef MAX
#undef MIN
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#undef MAX_PATH
#define MAX_PATH 2048
#ifdef __GNUC__
#ifndef LIKELY
#define LIKELY(x) __builtin_expect((x), 1)
#endif
#ifndef UNLIKELY
#define UNLIKELY(x) __builtin_expect((x), 0)
#endif
#else
#ifndef LIKELY
#define LIKELY(x) (x)
#endif
#ifndef UNLIKELY
#define UNLIKELY(x) (x)
#endif
#endif
#define err_check_true(ret_code, right_val) \
if ((ret_code) != (right_val)) \
err_fatal("err_check", "Value not right: True-Val %d\n", right_val)
#define err_check_false(ret_code, err_val) \
if ((ret_code) == (err_val)) \
err_fatal("err_check", "Value not right: False-Val %d\n", err_val)
#define _load_file_to_data(fn, data) \
do { \
FILE* fp = NULL; \
struct stat st; \
err_check_true(stat(fn, &st), 0); \
fp = xopen(fn, "r"); \
data = (uint8_t*)malloc(st.st_size); \
err_fread_noeof(data, 1, st.st_size, fp); \
err_fclose(fp); \
} while (0)
#define _load_file_to_data_ps(prefix, suffix, data) \
do { \
char fn[MAX_PATH]; \
sprintf(fn, "%s%s", prefix, suffix); \
_load_file_to_data(fn, data); \
} while (0)
#define _store_data_to_file(fn, data, ele_size, nums) \
do { \
FILE* fp; \
fp = xopen(fn, "wb"); \
err_fwrite(data, ele_size, nums, fp); \
err_fflush(fp); \
err_fclose(fp); \
} while (0)
///////////// end of added for hyb-align /////////////
typedef struct {
uint64_t x, y;
} pair64_t;