From 70979c1b60d1b50590f69c5048f9cff4d052c349 Mon Sep 17 00:00:00 2001 From: zzh Date: Sun, 16 Nov 2025 01:37:21 +0800 Subject: [PATCH] =?UTF-8?q?hybrid-index=E5=92=8Cbwt=E7=BB=93=E6=9E=9C?= =?UTF-8?q?=E4=B8=80=E8=87=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .vscode/launch.json | 33 +- .vscode/settings.json | 3 +- Makefile | 7 +- bntseq.c | 2 + bwa.c | 236 ++++++++++-- bwa.h | 23 +- bwamem.c | 683 ++++++++++++++++++++++++++--------- bwamem.h | 72 +++- bwamem_extra.c | 8 +- bwamem_pair.c | 61 ++-- bwtindex.c | 12 +- bwtsw2_aux.c | 13 +- debug.h | 2 +- fastmap.c | 447 +++++++++++++++++------ hyb_bwa.c | 145 +++++++- hyb_idx.h | 2 +- hyb_seeding_1.c | 314 ++++++++++++++++ hyb_seeding_2.c | 208 +++++++++++ hyb_seeding_3.c | 203 +++++++++++ hyb_utils.c | 817 ++++++++++++++++++++++++++++++++++++++++++ kseq.h | 10 +- ksw.h | 7 +- ksw_extend2_avx2.c | 816 +++++++++++++++++++++++++++++++++++++++++ ksw_extend2_avx2_u8.c | 454 +++++++++++++++++++++++ main.c | 8 +- pemerge.c | 13 +- profiling.c | 2 + profiling.h | 2 + share_mem.c | 167 +++++++++ share_mem.h | 20 ++ utils.c | 115 +++++- utils.h | 14 +- yarn.c | 398 ++++++++++++++++++++ yarn.h | 139 +++++++ 35 files changed, 5047 insertions(+), 410 deletions(-) create mode 100644 ksw_extend2_avx2.c create mode 100644 ksw_extend2_avx2_u8.c create mode 100644 share_mem.c create mode 100644 share_mem.h create mode 100644 yarn.c create mode 100644 yarn.h diff --git a/.gitignore b/.gitignore index 6f553bd..b4a6f13 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ bwamem-lite test_index/ index/ orig_index/ +output/ run.sh debug.sh hybalign diff --git a/.vscode/launch.json b/.vscode/launch.json index 834cd67..bfc37a4 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -9,7 +9,7 @@ "preLaunchTask": "Build", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/hbwa", + "program": "${workspaceRoot}/hybalign", "args": [ "mem", "-t", 
@@ -17,9 +17,9 @@ "-M", "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", - "~/data/fmt_ref/human_g1k_v37_decoy.fasta", - "./b1.fq", - "./b2.fq", + "/home/zzh/work/bioinfo/hyb-align/index/human_g1k_v37_decoy.fasta", + //"./b1.fq", + //"./b2.fq", //"./b1.fq", //"~/data/dataset/real/D1/n1.fq", //"~/data/dataset/real/D1/n2.fq", @@ -29,11 +29,11 @@ //"~/data/dataset/real/D3/n2.fq", //"~/data/dataset/real/D1/n1.fq.gz", //"~/data/dataset/real/D1/n2.fq.gz", - //"~/data/dataset/real/D3/1w1.fq", - //"~/data/dataset/real/D3/1w2.fq", + "~/data/dataset/real/D3/1w1.fq", + "~/data/dataset/real/D3/1w2.fq", "-o", "/dev/null", - //"-Z", + // "-g", ], "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 }, @@ -42,7 +42,7 @@ "preLaunchTask": "Build", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/hbwa", + "program": "${workspaceRoot}/hybalign", "args": [ "index", "~/data/reference/human_g1k_v37_decoy.fasta" @@ -54,7 +54,7 @@ "preLaunchTask": "Build", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/hbwa", + "program": "${workspaceRoot}/hybalign", "args": [ "buildkmer", "~/data/reference/human_g1k_v37_decoy.fasta.256.64.fmt", @@ -67,7 +67,7 @@ "preLaunchTask": "Build", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/hbwa", + "program": "${workspaceRoot}/hybalign", "args": [ "shm", "-Z", @@ -80,7 +80,7 @@ "preLaunchTask": "Build", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/hbwa", + "program": "${workspaceRoot}/hybalign", "args": [ "pac2bref", "~/data1/fmt_ref/human_g1k_v37_decoy.fasta" @@ -102,18 +102,15 @@ "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 }, { - "name": "train hybrid index", + "name": "fa2pac", "preLaunchTask": "Build", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/hbwa", + "program": "${workspaceRoot}/hybalign", "args": [ - "trainhybrid", - "-t", - "1", + "fa2pac", + "-f", "~/data/fmt_ref/human_g1k_v37_decoy.fasta", - 
"~/data/dataset/real/D1/n1.fq.gz", - "~/data/dataset/real/D1/n2.fq.gz" ], "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 }, diff --git a/.vscode/settings.json b/.vscode/settings.json index 21f83f3..e2a362b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -70,6 +70,7 @@ "share_mem.h": "c", "kseq.h": "c", "ostream": "c", - "streambuf": "c" + "streambuf": "c", + "kbtree.h": "c" } } \ No newline at end of file diff --git a/Makefile b/Makefile index 03bd762..626a936 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,11 @@ CC= gcc #CC= clang --analyze -CFLAGS= -g -Wall -Wno-unused-function -O3 +CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O3 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar -DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -HYBOBJS= hyb_bwa.o hyb_utils.o hyb_seeding_1.o hyb_seeding_2.o hyb_seeding_3.o hyb_create_idx.o debug.o profiling.o +DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -DUSE_AVX2_EXT -DSHOW_PERF -DDEBUG_FILE_OUTPUT +HYBOBJS= hyb_bwa.o hyb_utils.o hyb_seeding_1.o hyb_seeding_2.o hyb_seeding_3.o hyb_create_idx.o debug.o profiling.o share_mem.o yarn.o \ + ksw_extend2_avx2.o ksw_extend2_avx2_u8.o LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \ QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ diff --git a/bntseq.c b/bntseq.c index 6380689..f932da9 100644 --- a/bntseq.c +++ b/bntseq.c @@ -346,7 +346,9 @@ int bwa_fa2pac(int argc, char *argv[]) return 1; } fp = xzopen(argv[optind], "r"); + start_async_read(fp); bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only); + stop_async_read(fp); err_gzclose(fp); return 0; } diff --git a/bwa.c b/bwa.c index e7b571c..23e3a72 100644 --- a/bwa.c +++ b/bwa.c @@ -24,16 +24,19 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include -#include -#include -#include -#include "bntseq.h" #include "bwa.h" -#include "ksw.h" -#include "utils.h" + +#include +#include +#include +#include +#include + +#include "bntseq.h" #include "kstring.h" +#include "ksw.h" #include "kvec.h" +#include "utils.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -57,28 +60,193 @@ static inline void trim_readno(kstring_t *s) s->l -= 2, s->s[s->l] = 0; } -static inline char *dupkstring(const kstring_t *str, int dupempty) -{ - char *s = (str->l > 0 || dupempty)? malloc(str->l + 1) : NULL; - if (!s) return NULL; - - memcpy(s, str->s, str->l); - s[str->l] = '\0'; - return s; +static inline void dupkstring(const kstring_t* str, int dupempty, char** dstp, int* sm) { + if (!dupempty && str->l == 0) { + if (*dstp) free(*dstp); + *dstp = 0; *sm = 0; + } else if (*dstp == 0 || *sm < str->l) { + *sm = str->l; + *dstp = (char*)realloc(*dstp, str->l + 1); + } + char* s = *dstp; + if (!s) return; + memcpy(s, str->s, str->l); + s[str->l] = '\0'; } -static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s, int copy_comment) { // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = dupkstring(&ks->name, 1); - s->comment = dupkstring(&ks->comment, 0); - s->seq = dupkstring(&ks->seq, 1); - s->qual = dupkstring(&ks->qual, 0); - s->l_seq = ks->seq.l; + dupkstring(&ks->name, 1, &s->name, &s->m_name); + if (copy_comment) dupkstring(&ks->comment, 0, &s->comment, &s->m_comment); + dupkstring(&ks->seq, 1, &s->seq, &s->m_seq); + dupkstring(&ks->qual, 0, &s->qual, &s->m_qual); + s->l_seq = ks->seq.l; } -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) -{ - kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; +typedef struct { + kseq_t* ks; + bseq1_t* seq; + int start_pos; + int n_bound; + int copy_comment; + int ret_n; + int ret_size; + int ret_status; + int chunk_size; +} read_data_t; + 
+static void* thread_bseq_read(void* data) { + read_data_t* d = (read_data_t*)data; + kseq_t* ks = d->ks; + bseq1_t* seqs = d->seq; + int copy_comment = d->copy_comment; + int chunk_size = d->chunk_size; + int cur_n = 0, cur_pos = d->start_pos, size = 0; + int ret_status = 1; + + while (cur_n < d->n_bound && (ret_status = kseq_read(ks)) >= 0) { + trim_readno(&ks->name); + kseq2bseq1(ks, seqs + cur_pos, copy_comment); + seqs[cur_pos].id = cur_pos; + size += seqs[cur_pos].l_seq; + cur_pos += 2; cur_n += 1; + if (size >= chunk_size) break; + } + d->ret_n = cur_n; d->ret_size = size; d->ret_status = ret_status; + return 0; +} + +#define READ_ONE_SEQ(ksin) \ + trim_readno(&(ksin)->name); \ + kseq2bseq1(ksin, &seqs[n], copy_comment); \ + seqs[n].id = n; \ + size += seqs[n++].l_seq; + +// multi thread reading input seqs +void bseq_read_pe_mt(int chunk_size, int* n_, void* ks1_, void* ks2_, int copy_comment, int64_t* size_, int* m_, bseq1_t** seqs_ptr) { + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m = *m_, n = 0; + bseq1_t* seqs = *seqs_ptr; + read_data_t d[2]; + pthread_t tid[2]; + const int chunk_size_narrow = 4 * 1024 * 1024; + const int init_n_reads = 20; + if (m == 0) { // 还没开辟空间,要初始化 + seqs = (bseq1_t*)calloc(init_n_reads, + sizeof(bseq1_t)); // 先读取20个reads,根据reads的长度和chunk size决定要读取多少条reads +#if 1 + int ks1_ret = 0, ks2_ret = 0; + int i = init_n_reads >> 1; + while (i-- > 0) { + ks1_ret = kseq_read(ks); + if (ks1_ret < 0) + break; + ks2_ret = kseq_read(ks2); + if (ks2_ret < 0) { + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + READ_ONE_SEQ(ks); + READ_ONE_SEQ(ks2); + } + if (ks1_ret < 0 || ks2_ret < 0) { + if (size == 0 && kseq_read(ks2) >= 0) { // test if the 2nd file is finished + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; + *seqs_ptr = seqs; + *size_ = size; + *m_ = n; + return; + } + m = (chunk_size + size / init_n_reads - 1) / (size / init_n_reads); 
+#else + m = 50000; +#endif + seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t)); + memset(seqs + n, 0, sizeof(bseq1_t) * (m - n)); + } + + d[0].copy_comment = copy_comment; + d[1].copy_comment = copy_comment; + d[0].ks = ks; + d[0].seq = &seqs[0]; + d[0].n_bound = (m >> 1) - (n >> 1); + d[0].start_pos = n; + d[1].ks = ks2; + d[1].seq = &seqs[0]; + d[1].n_bound = (m >> 1) - (n >> 1); + d[1].start_pos = n + 1; + d[0].chunk_size = d[1].chunk_size = (chunk_size - chunk_size_narrow - size) >> 1; + + pthread_create(&tid[0], 0, thread_bseq_read, &d[0]); + pthread_create(&tid[1], 0, thread_bseq_read, &d[1]); + pthread_join(tid[0], 0); + pthread_join(tid[1], 0); + + size += d[0].ret_size + d[1].ret_size; + + // 如果两个线程读入的reads数量不一致 + if (d[0].ret_n < d[1].ret_n) { + int num_to_read = d[1].ret_n - d[0].ret_n; + int offset = n + d[0].ret_n * 2; + while (num_to_read-- > 0 && kseq_read(ks) >= 0) { + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[offset], copy_comment); + seqs[offset].id = offset; + size += seqs[offset].l_seq; + offset += 2; + } + d[0].ret_n = d[1].ret_n; + } else if (d[1].ret_n < d[0].ret_n) { + int num_to_read = d[0].ret_n - d[1].ret_n; + int offset = n + 1 + d[1].ret_n * 2; + while (num_to_read-- > 0 && kseq_read(ks2) >= 0) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[offset], copy_comment); + seqs[offset].id = offset; + size += seqs[offset].l_seq; + offset += 2; + } + d[1].ret_n = d[0].ret_n; + } + + n += d[0].ret_n + d[1].ret_n; + + if (size == 0 && kseq_read(ks2) >= 0) { // test if the 2nd file is finished + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } else if (size < chunk_size && d[0].ret_status > 0 && d[1].ret_status > 0) { + while (kseq_read(ks) >= 0) { + if (kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m ? 
m << 1 : 256; + seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t)); + memset(seqs + n, 0, (m - n) * sizeof(bseq1_t)); + } + READ_ONE_SEQ(ks); + READ_ONE_SEQ(ks2); + if (size >= chunk_size && (n & 1) == 0) + break; + } + if (size == 0) { // test if the 2nd file is finished + if (kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + } + *n_ = n; + *size_ = size; + if (m > *m_) + *m_ = m; + *seqs_ptr = seqs; +} + +void bseq_read(int chunk_size, int* n_, void* ks1_, void* ks2_, int copy_comment, int64_t* size_, int* m_, bseq1_t** seqs_ptr) { + // using multi-thread reading + if (ks2_) return bseq_read_pe_mt(chunk_size, n_, ks1_, ks2_, copy_comment, size_, m_, seqs_ptr); + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; int size = 0, m, n; bseq1_t *seqs; m = n = 0; seqs = 0; @@ -91,24 +259,20 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) m = m? m<<1 : 256; seqs = realloc(seqs, m * sizeof(bseq1_t)); } - trim_readno(&ks->name); - kseq2bseq1(ks, &seqs[n]); - seqs[n].id = n; - size += seqs[n++].l_seq; - if (ks2) { - trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n]); - seqs[n].id = n; - size += seqs[n++].l_seq; - } + READ_ONE_SEQ(ks); + if (ks2) { + READ_ONE_SEQ(ks2); + } if (size >= chunk_size && (n&1) == 0) break; } if (size == 0) { // test if the 2nd file is finished if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } - *n_ = n; - return seqs; + *n_ = n; + *size_ = size; + if (m > *m_) *m_ = m; + *seqs_ptr = seqs; } void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]) diff --git a/bwa.h b/bwa.h index 95c324b..91c2663 100644 --- a/bwa.h +++ b/bwa.h @@ -28,8 +28,11 @@ #define BWA_H_ #include + #include "bntseq.h" #include "bwt.h" +#include "kstring.h" +#include "hyb_idx.h" #define BWA_IDX_BWT 0x1 #define BWA_IDX_BNS 0x2 @@ -49,17 +52,24 @@ typedef struct { bwt_t *bwt; // FM-index bntseq_t *bns; // information on the reference 
sequences uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base + HybridIndex *hyb; // Hybrid index - int is_shm; - int64_t l_mem; + int is_shm; + int64_t l_mem; uint8_t *mem; } bwaidx_t; typedef struct { - int l_seq, id; - char *name, *comment, *seq, *qual, *sam; + int l_seq, id; + int m_name, m_comment, m_seq, m_qual; + char *name, *comment, *seq, *qual; + kstring_t sam; } bseq1_t; +typedef struct { + kstring_t sam; +} seq_sam_t; + extern int bwa_verbose, bwa_dbg; extern char bwa_rg_id[256]; @@ -67,8 +77,9 @@ extern char bwa_rg_id[256]; extern "C" { #endif - bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]); + // bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + void bseq_read(int chunk_size, int* n_, void* ks1_, void* ks2_, int copy_comment, int64_t* size_, int* m_, bseq1_t** seqs_ptr); + void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]); void bwa_fill_scmat(int a, int b, int8_t mat[25]); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); diff --git a/bwamem.c b/bwamem.c index bf1f925..c69ba8a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -41,6 +41,7 @@ #include "kvec.h" #include "ksort.h" #include "utils.h" +#include "hyb_idx.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -110,7 +111,7 @@ mem_opt_t *mem_opt_init() o->use_bwt = 0; o->skip_entire_match = 0; o->batch_size = 256; - + return o; } @@ -121,17 +122,23 @@ mem_opt_t *mem_opt_init() #define intv_lt(a, b) ((a).info < (b).info) KSORT_INIT(mem_intv, bwtintv_t, intv_lt) -typedef struct { - bwtintv_v mem, mem1, *tmpv[2]; -} smem_aux_t; - static smem_aux_t *smem_aux_init() { smem_aux_t *a; a = calloc(1, sizeof(smem_aux_t)); a->tmpv[0] = calloc(1, sizeof(bwtintv_v)); a->tmpv[1] = calloc(1, 
sizeof(bwtintv_v)); - return a; + a->sw_buf = (buf_t*)calloc(1, sizeof(buf_t)); + a->seq_buf = (buf_t*)calloc(1, sizeof(buf_t)); + a->byte_seq = (byte_v*)calloc(1, sizeof(byte_v)); + a->reverse_seq = (byte_v*)calloc(1, sizeof(byte_v)); + a->for_bits = (byte_v*)calloc(1, sizeof(byte_v)); + a->back_bits = (byte_v*)calloc(1, sizeof(byte_v)); + kv_resize(uint8_t, *a->byte_seq, HYB_MAX_SEQ_LEN); + kv_resize(uint8_t, *a->reverse_seq, HYB_MAX_SEQ_LEN); + kv_resize(uint8_t, *a->for_bits, HYB_MAX_SEQ_LEN); + kv_resize(uint8_t, *a->back_bits, HYB_MAX_SEQ_LEN); + return a; } static void smem_aux_destroy(smem_aux_t *a) @@ -142,13 +149,41 @@ static void smem_aux_destroy(smem_aux_t *a) free(a); } -static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a) -{ - int i, k, x = 0, old_n; +// 初始化线程需要的数据 +mem_worker_t* init_mem_worker(const mem_opt_t* opt, const bwt_t *bwt, const HybridIndex* hyb, const bntseq_t* bns, const uint8_t* pac) { + int i = opt->n_threads, j; + mem_worker_t *w = (mem_worker_t *)calloc(1, sizeof(mem_worker_t)); + w->opt = opt; w->bwt = bwt; w->hyb = hyb; w->bns = bns; w->pac = pac; + w->calc_isize = 0; w->n = 0; w->regs = 0; + w->aux = (smem_aux_t**)malloc(i * sizeof(smem_aux_t*)); + w->smem_arr = (smem_v **)malloc(i * sizeof(smem_v *)); + w->chain_arr = (mem_chain_v **)malloc(i * sizeof(mem_chain_v *)); + w->isize_arr = (uint64_v **)malloc(i * sizeof(uint64_v *)); + w->seed_arr = (HybSeedArr **)malloc(i * sizeof(HybSeedArr*)); + + for (i = 0; i < opt->n_threads; ++i) { + w->aux[i] = smem_aux_init(); + w->smem_arr[i] = (smem_v*)malloc(opt->batch_size * sizeof(smem_v)); + w->chain_arr[i] = (mem_chain_v*)malloc(opt->batch_size * sizeof(mem_chain_v)); + w->isize_arr[i] = (uint64_v *)calloc(4, sizeof(uint64_v)); + w->seed_arr[i] = (HybSeedArr *)malloc(opt->batch_size * sizeof(HybSeedArr)); + for (j = 0; j < opt->batch_size; ++j) { + kv_init(w->smem_arr[i][j].mem); + kv_init(w->smem_arr[i][j].pos_arr); + 
kv_init(w->chain_arr[i][j]); + kv_init(w->seed_arr[i][j]); + } + } + return w; +} + +// seeding +static void mem_collect_intv(const mem_opt_t* opt, const bwt_t* bwt, int len, const uint8_t* seq, smem_v* smem, smem_aux_t* a, int tid) { + int i, k, x = 0, old_n; int start_width = 1; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); - a->mem.n = 0; - // first pass: find all SMEMs + smem->mem.n = 0; + // first pass: find all SMEMs while (x < len) { if (seq[x] < 4) { x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); @@ -156,21 +191,21 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length if (slen >= opt->min_seed_len) - kv_push(bwtintv_t, a->mem, *p); - } + kv_push(bwtintv_t, smem->mem, *p); + } } else ++x; } // second pass: find MEMs inside a long SMEM - old_n = a->mem.n; - for (k = 0; k < old_n; ++k) { - bwtintv_t *p = &a->mem.a[k]; + old_n = smem->mem.n; + for (k = 0; k < old_n; ++k) { + bwtintv_t *p = &smem->mem.a[k]; int start = p->info>>32, end = (int32_t)p->info; if (end - start < split_len || p->x[2] > opt->split_width) continue; bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len) - kv_push(bwtintv_t, a->mem, a->mem1.a[i]); - } + kv_push(bwtintv_t, smem->mem, a->mem1.a[i]); + } // third pass: LAST-like if (opt->max_mem_intv > 0) { x = 0; @@ -179,39 +214,120 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co if (1) { bwtintv_t m; x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); - if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); + if (m.x[2] > 0) kv_push(bwtintv_t, smem->mem, m); } else { // for now, we never come to this block which is slower x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, 
&a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) - kv_push(bwtintv_t, a->mem, a->mem1.a[i]); + kv_push(bwtintv_t, smem->mem, a->mem1.a[i]); } } else ++x; } } // sort - ks_introsort(mem_intv, a->mem.n, a->mem.a); + ks_introsort(mem_intv, smem->mem.n, smem->mem.a); +} + +void find_smem(const mem_opt_t* opt, const bwt_t* bwt, int len, const uint8_t* seq, smem_aux_t* aux, smem_v* smemv, int tid) { + if (len < opt->min_seed_len) + return; // if the query is shorter than the seed length, no match + mem_collect_intv(opt, bwt, len, seq, smemv, aux, tid); + smemv->pos_arr.n = 0; +} + +// hybrid-index-based seeding +#define hyb_seed_lt(a, b) ((a).seed_start == (b).seed_start ? (a).seed_end < (b).seed_end : (a).seed_start < (b).seed_start) +KSORT_INIT(hyb_seed, HybSeed, hyb_seed_lt) + +static void hyb_seeding(const mem_opt_t* opt, const HybridIndex* hyb, ReadSeq* read_seq, RangeArr* read_ranges, RangeArr* seeds_ranges, + HybSeedArr* seeds, uint64_t seq_id, int tid) { + int i = 0; + int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + seeds->n = 0; + + // fprintf(stderr, "seq-id: %ld\n", seq_id); + + if (seq_id == 4) { + fprintf(stderr, "seq-id: %ld\n", seq_id); + } + + // 1. seeding-1: find all SMEMs + PROF_START(seed_1); + for (i = 0; i < read_ranges->n; ++i) { + Range range = kv_A(*read_ranges, i); + if (range.len < opt->min_seed_len) + continue; + seeds_ranges->a[i].start = seeds->n; + hyb_first_seeding(hyb, read_seq, &range, opt->min_seed_len, seeds, tid); + seeds_ranges->a[i].end = seeds->n; + } + + tprof[T_SEED_LEN][tid] += seeds->n; + PROF_END(tprof[T_SEED_1][tid], seed_1); +#if 1 + // 2. 
seeding-2: find MEMs inside a long SMEM + PROF_START(seed_2); + int pre_pivot = 0; + int old_n = seeds->n; + int pre_start = old_n, pre_end = old_n, pre_n = old_n; + + for (i = 0; i < old_n; ++i) { + HybSeed* seed = &kv_A(*seeds, i); + int start = seed->seed_start, end = seed->seed_end; + if (end - start < split_len || seed->ref_pos_arr.n > opt->split_width) + continue; + pre_n = seeds->n; + if (seed->ref_pos_arr.n == 1) { + pre_pivot = hyb_second_seeding(hyb, read_seq, start, end, seed->read_start, seed->read_end, seed->ref_pos_arr.a[0], + seed->ref_pos_arr.n + 1, pre_pivot, pre_start, pre_end, opt->min_seed_len, seeds, tid); + pre_start = pre_n; + pre_end = seeds->n; + } else { + hyb_second_seeding(hyb, read_seq, start, end, seed->read_start, seed->read_end, seed->ref_pos_arr.a[0], seed->ref_pos_arr.n + 1, 0, 0, 0, + opt->min_seed_len, seeds, tid); + } + } + PROF_END(tprof[T_SEED_2][tid], seed_2); +#endif + +#if 1 + // 3. seeding-3: LAST-like + old_n = seeds->n; + PROF_START(seed_3); + if (opt->max_mem_intv > 0) { + for (i = 0; i < read_ranges->n; ++i) { + Range range = kv_A(*read_ranges, i); + if (range.len < opt->min_seed_len) + continue; + Range seeds_range = kv_A(*seeds_ranges, i); + hyb_third_seeding(hyb, read_seq, &range, &seeds_range, opt->min_seed_len, opt->max_mem_intv, seeds, tid); + } + } + PROF_END(tprof[T_SEED_3][tid], seed_3); +#endif +#if 0 + { + FILE *fp = gf[1]; + int j; + // fprintf(fp, "%ld ", seq_id); + for (i = 0; i < seeds->n; ++i) { + HybSeed *seed = &kv_A(*seeds, i); + fprintf(fp, "s:%d e:%d n:%ld ", seed->seed_start, seed->seed_end, seed->ref_pos_arr.n); + for (j = 0; j < seed->ref_pos_arr.n; ++j) { + fprintf(fp, "%ld ", seed->ref_pos_arr.a[j]); + } + fprintf(fp, "\n"); + } + fprintf(fp, "\n"); + // fprintf(fp, "seq_id:%ld\n", seq_id); + } +#endif + ks_introsort(hyb_seed, kv_size(*seeds), seeds->a); } /************ * Chaining * ************/ -typedef struct { - int64_t rbeg; - int32_t qbeg, len; - int score; -} mem_seed_t; // unaligned 
memory - -typedef struct { - int n, m, first, rid; - uint32_t w:29, kept:2, is_alt:1; - float frac_rep; - int64_t pos; - mem_seed_t *seeds; -} mem_chain_t; - -typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; - #include "kbtree.h" #define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) @@ -279,30 +395,25 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) } } -mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf) -{ - int i, b, e, l_rep; +void generate_chain(const mem_opt_t* opt, const bwt_t* bwt, const bntseq_t* bns, int len, const uint8_t* seq, bwtintv_v mem, mem_chain_v* chain, int tid) { + int i, b, e, l_rep; int64_t l_pac = bns->l_pac; - mem_chain_v chain; kbtree_t(chn) *tree; - smem_aux_t *aux; + chain->n = 0; - kv_init(chain); - if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match + if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); - aux = buf? (smem_aux_t*)buf : smem_aux_init(); - mem_collect_intv(opt, bwt, len, seq, aux); - for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep - bwtintv_t *p = &aux->mem.a[i]; + for (i = 0, b = e = l_rep = 0; i < mem.n; ++i) { // compute frac_rep + bwtintv_t *p = &mem.a[i]; int sb = (p->info>>32), se = (uint32_t)p->info; if (p->x[2] <= opt->max_occ) continue; if (sb > e) l_rep += e - b, b = sb, e = se; else e = e > se? 
e : se; } l_rep += e - b; - for (i = 0; i < aux->mem.n; ++i) { - bwtintv_t *p = &aux->mem.a[i]; + for (i = 0; i < mem.n; ++i) { + bwtintv_t *p = &mem.a[i]; int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive @@ -330,19 +441,78 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn } } } - if (buf == 0) smem_aux_destroy(aux); + if (chain->m < kb_size(tree)) { + kv_resize(mem_chain_t, *chain, kb_size(tree)); + } - kv_resize(mem_chain_t, chain, kb_size(tree)); +#define traverse_func(p_) (chain->a[chain->n++] = *(p_)) + __kb_traverse(mem_chain_t, tree, traverse_func); +#undef traverse_func + for (i = 0; i < chain->n; ++i) chain->a[i].frac_rep = (float)l_rep / len; + if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); + kb_destroy(chn, tree); +} - #define traverse_func(p_) (chain.a[chain.n++] = *(p_)) - __kb_traverse(mem_chain_t, tree, traverse_func); - #undef traverse_func +void hyb_generate_chain(const mem_opt_t *opt, const HybridIndex *hyb, const bntseq_t *bns, int len, const uint8_t *seq, + HybSeedArr *seeds, mem_chain_v *chain, int tid) { + int i, b, e, l_rep; + int64_t l_pac = bns->l_pac; + kbtree_t(chn) * tree; + chain->n = 0; + if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match + tree = kb_init(chn, KB_DEFAULT_SIZE); + for (i = 0, b = e = l_rep = 0; i < seeds->n; ++i) { // compute frac_rep + HybSeed *seed = &kv_A(*seeds, i); + int sb = seed->seed_start, se = seed->seed_end; + if (seed->ref_pos_arr.n <= opt->max_occ) continue; + if (sb > e) l_rep += e - b, b = sb, e = se; + else e = e > se ? e : se; + } + l_rep += e - b; + for (i = 0; i < seeds->n; ++i) { + HybSeed *seed = &kv_A(*seeds, i); + int step, count; // seed length + int64_t k; + step = seed->ref_pos_arr.n > opt->max_occ ? 
seed->ref_pos_arr.n / opt->max_occ : 1; + for (k = count = 0; k < seed->ref_pos_arr.n && count < opt->max_occ; k += step, ++count) { + mem_chain_t tmp, *lower, *upper; + mem_seed_t s; + int rid, to_add = 0; + s.rbeg = tmp.pos = seed->ref_pos_arr.a[k]; + s.qbeg = seed->seed_start; + s.len = seed->seed_end - seed->seed_start; + s.score = s.len; + rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); + if (rid < 0) + continue; // bridging multiple reference sequences or1 the forward-reverse boundary; TODO: split the seed; + // don't discard it!!! + if (kb_size(tree)) { + kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain + if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) + to_add = 1; + } else + to_add = 1; + if (to_add) { // add the seed as a new chain + tmp.n = 1; + tmp.m = 4; + tmp.seeds = (mem_seed_t *)calloc(tmp.m, sizeof(mem_seed_t)); + tmp.seeds[0] = s; + tmp.rid = rid; + tmp.is_alt = !!bns->anns[rid].is_alt; + kb_putp(chn, tree, &tmp); + } + } + } + if (chain->m < kb_size(tree)) { + kv_resize(mem_chain_t, *chain, kb_size(tree)); + } +#define traverse_func(p_) (chain->a[chain->n++] = *(p_)) + __kb_traverse(mem_chain_t, tree, traverse_func); +#undef traverse_func - for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len; - if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); - - kb_destroy(chn, tree); - return chain; + for (i = 0; i < chain->n; ++i) chain->a[i].frac_rep = (float)l_rep / len; + if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); + kb_destroy(chn, tree); } /******************** @@ -660,15 +830,16 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) #define MAX_BAND_TRY 2 -void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int 
l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av, void *buf, int tid) { int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; uint64_t *srt; + smem_aux_t* aux = (smem_aux_t*)buf; - if (c->n == 0) return; + if (c->n == 0) return; // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -744,23 +915,30 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name); if (s->qbeg) { // left extension - uint8_t *rs, *qs; int qle, tle, gtle, gscore; - qs = malloc(s->qbeg); - for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; - rs = malloc(tmp); - for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; +#ifndef USE_AVX2_EXT + uint8_t *rs, *qs; + qs = malloc(s->qbeg); + for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; + rs = malloc(tmp); + for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; +#endif for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[0] = opt->w << i; if (bwa_verbose >= 4) { int j; - printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n'); - printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n'); + printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rseq[tmp - 1 - j]]); putchar('\n'); + printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)query[s->qbeg - 1 - j]]); putchar('\n'); } - a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); - if (bwa_verbose >= 4) { printf("*** Left 
extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } +#ifndef USE_AVX2_EXT + a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); +#else + a->score = ksw_extend2_avx2(s->qbeg, query, tmp, rseq, 1, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, + aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0], aux->sw_buf); +#endif + if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query @@ -771,7 +949,9 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac a->qb = 0, a->rb = s->rbeg - gtle; a->truesc = gscore; } - free(qs); free(rs); +#ifndef USE_AVX2_EXT + free(qs); free(rs); +#endif } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension @@ -787,7 +967,11 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n'); printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n'); } - a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); +#ifndef USE_AVX2_EXT + a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, 
opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); +#else + a->score = ksw_extend2_avx2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 0, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1], aux->sw_buf); +#endif if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } @@ -1035,10 +1219,9 @@ void mem_reorder_primary5(int T, mem_alnreg_v *a) } // TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible -void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) -{ - extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); - kstring_t str; +void mem_reg2sam(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, bseq1_t* s, mem_alnreg_v* a, int extra_flag, const mem_aln_t* m, seq_sam_t* ss) { + extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); + // kstring_t str; kvec_t(mem_aln_t) aa; int k, l; char **XA = 0; @@ -1046,8 +1229,9 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, if (!(opt->flag & MEM_F_ALL)) XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); kv_init(aa); - str.l = str.m = 0; str.s = 0; - for (k = l = 0; k < a->n; ++k) { + // str.l = str.m = 0; str.s = 0; + ss->sam.l = 0; + for (k = l = 0; k < a->n; ++k) { mem_alnreg_t *p = &a->a[k]; mem_aln_t *q; if (p->score < opt->T) continue; @@ -1069,58 +1253,19 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const 
uint8_t *pac, mem_aln_t t; t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); t.flag |= extra_flag; - mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m); - } else { - for (k = 0; k < aa.n; ++k) - mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m); - for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); + mem_aln2sam(opt, bns, &ss->sam, s, 1, &t, 0, m); + } else { + for (k = 0; k < aa.n; ++k) mem_aln2sam(opt, bns, &ss->sam, s, aa.n, aa.a, k, m); + for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); free(aa.a); } - s->sam = str.s; + // s->sam = str.s; if (XA) { for (k = 0; k < a->n; ++k) free(XA[k]); free(XA); } } -mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf) -{ - int i; - mem_chain_v chn; - mem_alnreg_v regs; - - for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so - seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; - - chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq, buf); - chn.n = mem_chain_flt(opt, chn.n, chn.a); - mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chn.n, chn.a); - if (bwa_verbose >= 4) mem_print_chain(bns, &chn); - - kv_init(regs); - for (i = 0; i < chn.n; ++i) { - mem_chain_t *p = &chn.a[i]; - if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s); - free(chn.a[i].seeds); - } - free(chn.a); - regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a); - if (bwa_verbose >= 4) { - err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); - for (i = 0; i < regs.n; ++i) { - mem_alnreg_t *p = ®s.a[i]; - printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); - } - } - for (i = 0; i < regs.n; ++i) { - mem_alnreg_t *p = ®s.a[i]; - if (p->rid >= 0 && bns->anns[p->rid].is_alt) - p->is_alt = 1; - } - return regs; -} - mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t 
*bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; @@ -1193,77 +1338,267 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * return a; } -typedef struct { - const mem_opt_t *opt; - const bwt_t *bwt; - const bntseq_t *bns; - const uint8_t *pac; - const mem_pestat_t *pes; - smem_aux_t **aux; - bseq1_t *seqs; - mem_alnreg_v *regs; - int64_t n_processed; -} worker_t; - -static void worker1(void *data, long i, int tid) -{ - worker_t *w = (worker_t*)data; - if (!(w->opt->flag&MEM_F_PE)) { - if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); - w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]); - } else { - if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); - w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]); - if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); - w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]); - } +static inline int cal_sub(const mem_opt_t* opt, mem_alnreg_v* r) { + int j; + for (j = 1; j < r->n; ++j) { // choose unique alignment + int b_max = r->a[j].qb > r->a[0].qb ? r->a[j].qb : r->a[0].qb; + int e_min = r->a[j].qe < r->a[0].qe ? r->a[j].qe : r->a[0].qe; + if (e_min > b_max) { // have overlap + int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb ? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb; + if (e_min - b_max >= min_l * opt->mask_level) + break; // significant overlap + } + } + return j < r->n ? 
r->a[j].score : opt->min_seed_len * opt->a; } -static void worker2(void *data, long i, int tid) +static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t* dist) { + int64_t p2; + int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); + p2 = r1 == r2 ? b2 : (l_pac << 1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand + *dist = p2 > b1 ? p2 - b1 : b1 - p2; + return (r1 == r2 ? 0 : 1) ^ (p2 > b1 ? 0 : 3); +} + +// mem主要流程 +void mem_core_process(const mem_opt_t* opt, const bwt_t* bwt, const HybridIndex* hyb, const bntseq_t* bns, const uint8_t* pac, bseq1_t* seq_arr, + int nseq, smem_aux_t* aux, void* seed_arr, mem_chain_v* chain_arr, mem_alnreg_v* reg_arr, int calc_isize, int64_t l_pac, + uint64_v* isize, int tid) { + int i, j, l_seq; + mem_chain_v* chnp; + mem_alnreg_v* regp; + char* seq; + + if (opt->use_bwt) { + smem_v *smem_arr = (smem_v*)seed_arr; + // 1. seeding + PROF_START(seed_all); + for (i = 0; i < nseq; ++i) { + seq = seq_arr[i].seq; + l_seq = seq_arr[i].l_seq; + for (j = 0; j < l_seq; ++j) { + seq[j] = seq[j] < 4 ? seq[j] : nst_nt4_table[(int)seq[j]]; + } + find_smem(opt, bwt, l_seq, (uint8_t*)seq, aux, &smem_arr[i], tid); + } + PROF_END(tprof[T_SEED_ALL][tid], seed_all); + + // 2. chain + PROF_START(chain_all); + for (i = 0; i < nseq; ++i) { + seq = seq_arr[i].seq; + l_seq = seq_arr[i].l_seq; + chnp = chain_arr + i; + PROF_START(gen_chain); + generate_chain(opt, bwt, bns, l_seq, (uint8_t*)seq, smem_arr[i].mem, chnp, tid); + PROF_END(tprof[T_GEN_CHAIN][tid], gen_chain); + PROF_START(flt_chain); + chnp->n = mem_chain_flt(opt, chnp->n, chnp->a); + PROF_END(tprof[T_FLT_CHAIN][tid], flt_chain); + PROF_START(flt_chained_seeds); + mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chnp->n, chnp->a); + PROF_END(tprof[T_FLT_CHANNED_SEEDS][tid], flt_chained_seeds); + if (bwa_verbose >= 4) mem_print_chain(bns, chnp); + } + PROF_END(tprof[T_CHAIN_ALL][tid], chain_all); + } else { + HybSeedArr* seeds = (HybSeedArr*)seed_arr; + // 1. 
seeding + PROF_START(seed_all); + RangeArr read_ranges = {0}; + RangeArr seeds_ranges = {0}; + Range init_range = {0}; + for (i = 0; i < nseq; ++i) { + uint8_t* reverse_seq = aux->reverse_seq->a; + uint8_t* for_bits = aux->for_bits->a; + uint8_t* back_bits = aux->back_bits->a; + + read_ranges.n = 0; + seeds_ranges.n = 0; + int last_N = -1; + seq = seq_arr[i].seq; + l_seq = seq_arr[i].l_seq; + + for (j = 0; j < l_seq; ++j) { + seq[j] = (uint8_t)(seq[j] < 4 ? seq[j] : nst_nt4_table[(int)seq[j]]); + if (seq[j] >= 4) { // N + reverse_seq[l_seq - 1 - j] = seq[j]; + if (last_N + 1 < j) { + const Range range = {last_N + 1, j, j - last_N - 1}; + kv_push(Range, read_ranges, range); + kv_push(Range, seeds_ranges, init_range); + } + last_N = j; + } else { + reverse_seq[l_seq - 1 - j] = 3 - seq[j]; + } + } + if (last_N + 1 < j) { + const Range range = {last_N + 1, j, j - last_N - 1}; + kv_push(Range, read_ranges, range); + kv_push(Range, seeds_ranges, init_range); + } + create_seq_fb_bits((uint8_t*)seq, l_seq, for_bits, back_bits); + ReadSeq read_seq = {l_seq, (uint8_t*)seq, reverse_seq, for_bits, back_bits, aux->seq_id}; + ++aux->seq_id; + hyb_seeding(opt, hyb, &read_seq, &read_ranges, &seeds_ranges, &seeds[i], aux->seq_id, tid); + } + kv_destroy(read_ranges); + kv_destroy(seeds_ranges); + PROF_END(tprof[T_SEED_ALL][tid], seed_all); + + // 2. 
chain + PROF_START(chain_all); + for (i = 0; i < nseq; ++i) { + seq = seq_arr[i].seq; + l_seq = seq_arr[i].l_seq; + chnp = chain_arr + i; + PROF_START(gen_chain); + hyb_generate_chain(opt, hyb, bns, l_seq, (uint8_t*)seq, &seeds[i], chnp, tid); + PROF_END(tprof[T_GEN_CHAIN][tid], gen_chain); + PROF_START(flt_chain); + chnp->n = mem_chain_flt(opt, chnp->n, chnp->a); + PROF_END(tprof[T_FLT_CHAIN][tid], flt_chain); + PROF_START(flt_chained_seeds); + mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chnp->n, chnp->a); + PROF_END(tprof[T_FLT_CHANNED_SEEDS][tid], flt_chained_seeds); + if (bwa_verbose >= 4) + mem_print_chain(bns, chnp); + } + PROF_END(tprof[T_CHAIN_ALL][tid], chain_all); + } + + // 3. align + PROF_START(aln_all); + for (i = 0; i < nseq; ++i) { + seq = seq_arr[i].seq; + l_seq = seq_arr[i].l_seq; + chnp = chain_arr + i; + regp = reg_arr + i; + kv_init(*regp); + + for (j = 0; j < chnp->n; ++j) { + mem_chain_t* p = &chnp->a[j]; + if (bwa_verbose >= 4) + err_printf("* ---> Processing chain(%d) <---\n", j); + mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, regp, aux, tid); + free(chnp->a[j].seeds); + } + + free(chnp->a); + chnp->m = 0; + chnp->a = 0; + regp->n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regp->n, regp->a); + if (bwa_verbose >= 4) { + err_printf("* %ld chains remain after removing duplicated chains\n", regp->n); + for (j = 0; j < regp->n; ++j) { + mem_alnreg_t* p = ®p->a[j]; + printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); + } + } + for (j = 0; j < regp->n; ++j) { + mem_alnreg_t* p = ®p->a[j]; + if (p->rid >= 0 && bns->anns[p->rid].is_alt) + p->is_alt = 1; + } + } + PROF_END(tprof[T_ALN_ALL][tid], aln_all); + + // 4. 
calc insert size +#define MIN_RATIO 0.8 + if (calc_isize) { + PROF_START(ins_size); + for (i = 0; i < nseq >> 1; ++i) { + int dir; + int64_t is; + mem_alnreg_v* r[2]; + r[0] = (mem_alnreg_v*)®_arr[i << 1 | 0]; + r[1] = (mem_alnreg_v*)®_arr[i << 1 | 1]; + if (r[0]->n == 0 || r[1]->n == 0) + continue; + if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) + continue; + if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) + continue; + if (r[0]->a[0].rid != r[1]->a[0].rid) + continue; // not on the same chr + dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); + if (is && is <= opt->max_ins) + kv_push(uint64_t, isize[dir], is); + } + PROF_END(tprof[T_INS_SIZE][tid], ins_size); + } +} + +static void worker_smem_align(void *data, long i, int tid) { - extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); + mem_worker_t *w = (mem_worker_t*)data; + int start = i * w->opt->batch_size; + int end = MIN(start + w->opt->batch_size, w->n_reads); + mem_core_process(w->opt, w->bwt, w->hyb, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, + w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid); +} + +static void worker_sam(void *data, long i, int tid) +{ + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid); extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); - worker_t *w = (worker_t*)data; + mem_worker_t *w = (mem_worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]); - 
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); - free(w->regs[i].a); + mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, &w->sams[i]); + free(w->regs[i].a); } else { if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name); - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1], &w->sams[i<<1], tid); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } } -void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) -{ - extern void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); - worker_t w; +void mem_process_seqs(const mem_opt_t* opt, mem_worker_t* w, int64_t n_processed, int n, bseq1_t* seqs, const mem_pestat_t* pes0, seq_sam_t* sams) { + extern void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); mem_pestat_t pes[4]; double ctime, rtime; - int i; + int n_batch = (n + opt->batch_size - 1) / opt->batch_size; - ctime = cputime(); rtime = realtime(); - global_bns = bns; - w.regs = malloc(n * sizeof(mem_alnreg_v)); - w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; - w.seqs = seqs; w.n_processed = n_processed; - w.pes = &pes[0]; - w.aux = malloc(opt->n_threads * sizeof(smem_aux_t*)); - for (i = 0; i < opt->n_threads; ++i) - w.aux[i] = smem_aux_init(); - kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? 
n>>1 : n); // find mapping positions - for (i = 0; i < opt->n_threads; ++i) - smem_aux_destroy(w.aux[i]); - free(w.aux); + ctime = cputime(); rtime = realtime(); + global_bns = w->bns; + + w->opt = opt; + if (w->n < n) { + w->n = n; + w->regs = (mem_alnreg_v*)realloc(w->regs, n * sizeof(mem_alnreg_v)); + } + w->seqs = seqs; + w->n_processed = n_processed; + w->sams = sams; + w->n_reads = n; + w->pes = &pes[0]; + + if ((opt->flag & MEM_F_PE) && !pes0) { // infer insert sizes if not provided + int i, j; + w->calc_isize = 1; + for (i = 0; i < opt->n_threads; ++i) + for (j = 0; j < 4; ++j) w->isize_arr[i][j].n = 0; + } + + PROF_START(kernel); + kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions + PROF_END(gprof[G_MEM_KERNEL], kernel); + + PROF_START(pestat); if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 - else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data + else mem_pestat(opt, w->bns->l_pac, n, w->isize_arr, pes); // otherwise, infer the insert size distribution from data } - kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment - free(w.regs); + PROF_END(gprof[G_MEM_PESTAT], pestat); + + PROF_START(mem_sam); + kt_for(opt->n_threads, worker_sam, w, (opt->flag & MEM_F_PE) ? 
n >> 1 : n); // generate alignment + PROF_END(gprof[G_MEM_SAM], mem_sam); + if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); } diff --git a/bwamem.h b/bwamem.h index 6451640..3d42c39 100644 --- a/bwamem.h +++ b/bwamem.h @@ -27,9 +27,11 @@ #ifndef BWAMEM_H_ #define BWAMEM_H_ -#include "bwt.h" #include "bntseq.h" #include "bwa.h" +#include "bwt.h" +#include "hyb_idx.h" +#include "utils.h" #define MEM_MAPQ_COEF 30.0 #define MEM_MAPQ_MAX 60 @@ -126,9 +128,68 @@ typedef struct { // This struct is only used for the convenience of API. int score, sub, alt_sc; } mem_aln_t; +typedef struct { + int64_t rbeg; + int32_t qbeg, len; + int score; +} mem_seed_t; // unaligned memory + +typedef struct { + int n, m, first, rid; + uint32_t w : 29, kept : 2, is_alt : 1; + float frac_rep; + int64_t pos; + mem_seed_t* seeds; +} mem_chain_t; + +typedef struct { + size_t n, m; + mem_chain_t* a; +} mem_chain_v; + +typedef kvec_t(uint8_t) byte_v; +typedef kvec_t(byte_v) byte_vv; + +typedef struct { + bwtintv_v mem, mem1, *tmpv[2]; + buf_t *sw_buf, *seq_buf; + byte_v* byte_seq; + byte_v* reverse_seq; + byte_v* for_bits; + byte_v* back_bits; + uint64_t seq_id; +} smem_aux_t; + +typedef struct { + bwtintv_v mem; + uint64_v pos_arr; +} smem_v; + +typedef struct { + int calc_isize; + const mem_opt_t* opt; + const bwt_t* bwt; + const HybridIndex* hyb; + const bntseq_t* bns; + const uint8_t* pac; + const mem_pestat_t* pes; + smem_aux_t** aux; + bseq1_t* seqs; + seq_sam_t* sams; + smem_v** smem_arr; + HybSeedArr** seed_arr; + mem_chain_v** chain_arr; + mem_alnreg_v* regs; + uint64_v** isize_arr; + int64_t n_processed; + int64_t n; + int64_t n_reads; +} mem_worker_t; + #ifdef __cplusplus extern "C" { #endif + mem_worker_t *init_mem_worker(const mem_opt_t *opt, const bwt_t *bwt, const HybridIndex *hyb, const bntseq_t *bns, const uint8_t *pac); smem_i *smem_itr_init(const bwt_t *bwt); void 
smem_itr_destroy(smem_i *itr); @@ -161,9 +222,10 @@ extern "C" { * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. */ - void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); + // void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); + void mem_process_seqs(const mem_opt_t* opt, mem_worker_t* w, int64_t n_processed, int n, bseq1_t* seqs, const mem_pestat_t* pes0, seq_sam_t* sams); - /** + /** * Find the aligned regions for one query sequence * * Note that this routine does not generate CIGAR. CIGAR should be @@ -207,10 +269,10 @@ extern "C" { * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair * @param pes inferred insert size distribution (output) */ - void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); + void mem_pestat(const mem_opt_t* opt, int64_t l_pac, int n, uint64_v** isize_arr, mem_pestat_t pes[4]); #ifdef __cplusplus -} + } #endif #endif diff --git a/bwamem_extra.c b/bwamem_extra.c index c47b93f..e43c3a7 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -101,14 +101,14 @@ const bwtintv_v *smem_next(smem_i *itr) mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) { // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence - extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf); +// extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, 
const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf); extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); - mem_alnreg_v ar; + mem_alnreg_v ar = {0,0,0}; char *seq; seq = malloc(l_seq); memcpy(seq, seq_, l_seq); // makes a copy of seq_ - ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0); - mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); +// ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0); +// mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); free(seq); return ar; } diff --git a/bwamem_pair.c b/bwamem_pair.c index ef79521..81269cc 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -69,26 +69,19 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) return j < r->n? r->a[j].score : opt->min_seed_len * opt->a; } -void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) -{ - int i, d, max; +void mem_pestat(const mem_opt_t* opt, int64_t l_pac, int n, uint64_v** isize_arr, mem_pestat_t pes[4]) { + int i, j, d, max; uint64_v isize[4]; memset(pes, 0, 4 * sizeof(mem_pestat_t)); memset(isize, 0, sizeof(kvec_t(int)) * 4); - for (i = 0; i < n>>1; ++i) { - int dir; - int64_t is; - mem_alnreg_v *r[2]; - r[0] = (mem_alnreg_v*)®s[i<<1|0]; - r[1] = (mem_alnreg_v*)®s[i<<1|1]; - if (r[0]->n == 0 || r[1]->n == 0) continue; - if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; - if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; - if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr - dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); - if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); - } - if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); + for (i = 0; i < opt->n_threads; ++i) { + for (d = 0; d < 4; ++d) { + for (j = 0; j < isize_arr[i][d].n; 
++j) { + kv_push(uint64_t, isize[d], isize_arr[i][d].a[j]); + } + } + } + if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. mem_pestat_t *r = &pes[d]; uint64_v *q = &isize[d]; @@ -273,11 +266,11 @@ void mem_reorder_primary5(int T, mem_alnreg_v *a); #define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) -int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid) { extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); - extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); + extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, seq_sam_t *ss); extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query); int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2]; @@ -288,7 +281,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co memset(h, 0, sizeof(mem_aln_t) * 2); memset(g, 0, sizeof(mem_aln_t) * 2); n_aa[0] = n_aa[1] = 0; - if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment + PROF_START(matesw); + if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment 
mem_alnreg_v b[2]; kv_init(b[0]); kv_init(b[1]); for (i = 0; i < 2; ++i) @@ -300,7 +294,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); } - n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); + PROF_END(tprof[T_SAM_MATESW][tid], matesw); + n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); if (opt->flag & MEM_F_PRIMARY5) { mem_reorder_primary5(opt->T, &a[0]); @@ -363,8 +358,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } else XA[0] = XA[1] = 0; // write SAM for (i = 0; i < 2; ++i) { - h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]); - h[i].mapq = q_se[i]; + PROF_START(reg2aln); + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]); + PROF_END(tprof[T_SAM_REG2ALN][tid], reg2aln); + h[i].mapq = q_se[i]; h[i].flag |= 0x40<l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; } - mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); - mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); - if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); + mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41 | extra_flag, &h[1], &ss[0]); + mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81 | extra_flag, &h[0], &ss[1]); + if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); free(h[0].cigar); free(h[1].cigar); return n; } diff --git a/bwtindex.c b/bwtindex.c index bdf1a73..929e06c 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -266,6 +266,7 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s 
{ // nucleotide indexing gzFile fp = xzopen(fa, "r"); + start_async_read(fp); t = clock(); if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 0); @@ -280,8 +281,9 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s //exit(0); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - err_gzclose(fp); - } + stop_async_read(fp); + err_gzclose(fp); + } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { strcpy(str, prefix); strcat(str, ".pac"); @@ -310,11 +312,13 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s } { gzFile fp = xzopen(fa, "r"); - t = clock(); + start_async_read(fp); + t = clock(); if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 1); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - err_gzclose(fp); + stop_async_read(fp); + err_gzclose(fp); } { bwt_t *bwt; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index d225187..9af21e3 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -732,8 +732,10 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c uint8_t *pac; bsw2seq_t *_seq; bseq1_t *bseq; + int64_t seq_size = 0; + int m = 0; - pac = calloc(bns->l_pac/4+1, 1); + pac = calloc(bns->l_pac/4+1, 1); for (l = 0; l < bns->n_seqs; ++l) err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); @@ -745,13 +747,14 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c ks2 = kseq_init(fp2); is_pe = 1; } else fp2 = 0, ks2 = 0, is_pe = 0; - while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2, 1, &seq_size, &m, &bseq); + while (n > 0) { int size = 0; if (n 
> _seq->max) { _seq->max = n; kroundup32(_seq->max); - _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); - } + _seq->seq = (bsw2seq1_t*)realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); + } _seq->n = n; for (i = 0; i < n; ++i) { bseq1_t *b = &bseq[i]; @@ -761,8 +764,8 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c size += p->l; } fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); - free(bseq); process_seqs(_seq, opt, bns, pac, target, is_pe); + bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2, 1, &seq_size, &m, &bseq); } // free free(pac); diff --git a/debug.h b/debug.h index 68e8a3b..ba2edf8 100644 --- a/debug.h +++ b/debug.h @@ -10,7 +10,7 @@ ////////////////// for debug and test ////////////////////////// -#define DEBUG_FILE_OUTPUT // 打开gfp1-4文件,并记录debug信息 +// #define DEBUG_FILE_OUTPUT // 打开gfp1-4文件,并记录debug信息 // #define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度,以及最终扩展的长度 // #define GET_FULL_MATCH_READ // 获取完全匹配的reads // #define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量 diff --git a/fastmap.c b/fastmap.c index 705d3dd..ede9d4b 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,20 +24,27 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include +#include +#include +#include +#include #include -#include #include #include -#include -#include -#include +#include +#include + +#include "bntseq.h" #include "bwa.h" #include "bwamem.h" -#include "kvec.h" -#include "utils.h" -#include "bntseq.h" +#include "debug.h" +#include "hyb_idx.h" #include "kseq.h" +#include "kvec.h" +#include "profiling.h" +#include "share_mem.h" +#include "utils.h" +#include "yarn.h" KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; @@ -45,83 +52,242 @@ extern unsigned char nst_nt4_table[256]; void *kopen(const char *fn, int *_fd); int kclose(void *a); void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); +HybridIndex* bwa_hyb_idx_load_from_shm(const char* idx_prefix); +HybridIndex* bwa_hyb_idx_load_from_disk(const char* idx_prefix); typedef struct { - kseq_t *ks, *ks2; - mem_opt_t *opt; - mem_pestat_t *pes0; - int64_t n_processed; - int copy_comment, actual_chunk_size; - bwaidx_t *idx; -} ktp_aux_t; - -typedef struct { - ktp_aux_t *aux; - int n_seqs; - bseq1_t *seqs; + int n_seqs; + int n_sams; + int m_seqs; + int m_sams; + bseq1_t* seqs; + seq_sam_t* sams; } ktp_data_t; -static void *process(void *shared, int step, void *_data) -{ - ktp_aux_t *aux = (ktp_aux_t*)shared; - ktp_data_t *data = (ktp_data_t*)_data; - int i; - if (step == 0) { - ktp_data_t *ret; - int64_t size = 0; - ret = calloc(1, sizeof(ktp_data_t)); - ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2); - if (ret->seqs == 0) { - free(ret); - return 0; - } - if (!aux->copy_comment) - for (i = 0; i < ret->n_seqs; ++i) { - free(ret->seqs[i].comment); - ret->seqs[i].comment = 0; - } - for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq; - if (bwa_verbose >= 3) - fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size); - return ret; - } else if (step == 1) { - const mem_opt_t *opt = aux->opt; - const bwaidx_t *idx = aux->idx; - if 
(opt->flag & MEM_F_SMARTPE) { - bseq1_t *sep[2]; - int n_sep[2]; - mem_opt_t tmp_opt = *opt; - bseq_classify(data->n_seqs, data->seqs, n_sep, sep); - if (bwa_verbose >= 3) - fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]); - if (n_sep[0]) { - tmp_opt.flag &= ~MEM_F_PE; - mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0); - for (i = 0; i < n_sep[0]; ++i) - data->seqs[sep[0][i].id].sam = sep[0][i].sam; - } - if (n_sep[1]) { - tmp_opt.flag |= MEM_F_PE; - mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0); - for (i = 0; i < n_sep[1]; ++i) - data->seqs[sep[1][i].id].sam = sep[1][i].sam; - } - free(sep[0]); free(sep[1]); - } else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0); - aux->n_processed += data->n_seqs; - return data; - } else if (step == 2) { - for (i = 0; i < data->n_seqs; ++i) { - if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout); - free(data->seqs[i].name); free(data->seqs[i].comment); - free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam); - } - free(data->seqs); free(data); - return 0; - } - return 0; +typedef struct { + kseq_t *ks, *ks2; + mem_opt_t* opt; + mem_pestat_t* pes0; + int64_t n_processed; + int copy_comment, actual_chunk_size; + bwaidx_t* idx; + mem_worker_t* w; + int data_idx; // pingpong buffer index + ktp_data_t* data; + int wbuf_size; + char* wbuf; + volatile int read_complete; + volatile int calc_complete; + long read_idx; + long calc_idx; + long write_idx; +} ktp_aux_t; + +///////////////////// new parallel pipeline /////////////////// + +// read +static inline void* read_data(ktp_aux_t* aux, ktp_data_t* data) { + PROF_START(read); + ktp_data_t* ret = aux->data + aux->data_idx; + aux->data_idx = !aux->data_idx; + int64_t size = 0; + bseq_read(aux->actual_chunk_size, &ret->n_seqs, 
aux->ks, aux->ks2, aux->copy_comment, &size, &ret->m_seqs, &ret->seqs); + PROF_END(gprof[G_READ], read); + if (ret->n_seqs == 0) { + return 0; + } + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size); + return ret; } +// calculate +static inline void* calc_data(ktp_aux_t* aux, ktp_data_t* data) { + PROF_START(compute); + const mem_opt_t* opt = aux->opt; + if (data->n_sams != data->n_seqs) { + if (data->m_sams < data->m_seqs) { + data->m_sams = data->m_seqs; + data->sams = (seq_sam_t*)realloc(data->sams, data->m_sams * sizeof(seq_sam_t)); + memset(data->sams + data->n_sams, 0, (data->m_sams - data->n_sams) * sizeof(seq_sam_t)); + } + data->n_sams = data->n_seqs; + } + if (opt->flag & MEM_F_SMARTPE) { + // 这里应该是把pair-end数据都放在一个文件里了,需要先区分,这里没有内存优化,涉及较多的开辟和释放 + int i; + bseq1_t* sep[2]; + seq_sam_t* ss[2]; + int n_sep[2]; + mem_opt_t tmp_opt = *opt; + bseq_classify(data->n_seqs, data->seqs, n_sep, sep); + ss[0] = (seq_sam_t*)calloc(0, n_sep[0] * sizeof(seq_sam_t)); + ss[1] = (seq_sam_t*)calloc(0, n_sep[1] * sizeof(seq_sam_t)); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]); + if (n_sep[0]) { + tmp_opt.flag &= ~MEM_F_PE; + mem_process_seqs(&tmp_opt, aux->w, aux->n_processed, n_sep[0], sep[0], 0, ss[0]); + for (i = 0; i < n_sep[0]; ++i) data->sams[sep[0][i].id].sam = ss[0][i].sam; + } + if (n_sep[1]) { + tmp_opt.flag |= MEM_F_PE; + mem_process_seqs(&tmp_opt, aux->w, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0, ss[1]); + for (i = 0; i < n_sep[1]; ++i) data->sams[sep[1][i].id].sam = ss[1][i].sam; + } + free(sep[0]); + free(sep[1]); + free(ss[0]); + free(ss[1]); + } else + mem_process_seqs(opt, aux->w, aux->n_processed, data->n_seqs, data->seqs, aux->pes0, data->sams); + aux->n_processed += data->n_seqs; + PROF_END(gprof[G_COMPUTE], compute); + + return data; +} + +// write +static inline void* 
write_data(ktp_aux_t* aux, ktp_data_t* data) { + int i; + PROF_START(write); + int buf_written = 0; + for (i = 0; i < data->n_sams; ++i) { + const int slen = data->sams[i].sam.l; + if (slen && (buf_written + slen) < aux->wbuf_size) { + memcpy(&aux->wbuf[buf_written], data->sams[i].sam.s, slen); + buf_written += slen; + } else if (buf_written > 0) { + err_fwrite(aux->wbuf, 1, buf_written, stdout); + if ((buf_written + slen) >= aux->wbuf_size) { + memcpy(&aux->wbuf[0], data->sams[i].sam.s, slen); + buf_written = slen; + } else { + buf_written = 0; + } + } + } + if (buf_written > 0) { + err_fwrite(aux->wbuf, 1, buf_written, stdout); + } + PROF_END(gprof[G_WRITE], write); + return 0; +} + +// io 异步,读和写不能同时 +static void* process(void* shared, int step, void* _data) { + ktp_aux_t* aux = (ktp_aux_t*)shared; + ktp_data_t* data = (ktp_data_t*)_data; + if (step == 0) { + return read_data(aux, data); + } else if (step == 1) { + return calc_data(aux, data); + } else if (step == 2) { + return write_data(aux, data); + } + return 0; +} + +////////////// 读和写可以同时进行的pipeline +static lock_t* input_have = NULL; +static lock_t* output_have = NULL; + +static void* thread_read(void* data) { + ktp_aux_t* aux = (ktp_aux_t*)data; + while (1) { + POSSESS(input_have); + WAIT_FOR(input_have, NOT_TO_BE, 0); + RELEASE(input_have); + if (read_data(aux, aux->data) == 0) { + POSSESS(input_have); + aux->read_complete = 1; + TWIST(input_have, BY, -1); + break; + } + POSSESS(input_have); + aux->read_idx++; + TWIST(input_have, BY, -1); + } + return 0; +} + +static void* thread_calc(void* data) { + ktp_aux_t* aux = (ktp_aux_t*)data; + int d_idx = 0; + int add_idx = 0; + while (1) { + POSSESS(input_have); + WAIT_FOR(input_have, NOT_TO_BE, 2); + RELEASE(input_have); + + POSSESS(output_have); + WAIT_FOR(output_have, NOT_TO_BE, 2); + RELEASE(output_have); + + if (aux->calc_idx < aux->read_idx) { + calc_data(aux, aux->data + d_idx); + d_idx = !d_idx; + add_idx = 1; + } + if (aux->read_complete) { + 
POSSESS(output_have); + if (add_idx) + aux->calc_idx++; + aux->calc_complete = 1; + TWIST(output_have, BY, 1); // 最后要唤醒写线程 + break; // 计算完了 + } + POSSESS(output_have); + if (add_idx) + aux->calc_idx++; + TWIST(output_have, BY, 1); + + POSSESS(input_have); + TWIST(input_have, BY, 1); + } + return 0; +} + +static void* thread_write(void* data) { + ktp_aux_t* aux = (ktp_aux_t*)data; + int d_idx = 0; + while (1) { + POSSESS(output_have); + WAIT_FOR(output_have, NOT_TO_BE, 0); + RELEASE(output_have); + if (aux->write_idx < aux->calc_idx) { + write_data(aux, aux->data + d_idx); + d_idx = !d_idx; + aux->write_idx++; + } + if (aux->calc_complete) { + if (aux->write_idx < aux->calc_idx) + write_data(aux, aux->data + d_idx); + break; + } + POSSESS(output_have); + TWIST(output_have, BY, -1); + } + return 0; +} + +static void new_pipeline(ktp_aux_t* aux) { + input_have = NEW_LOCK(2); + output_have = NEW_LOCK(0); + pthread_t tid[3]; + int i; + + pthread_create(&tid[0], 0, thread_read, aux); + pthread_create(&tid[1], 0, thread_calc, aux); + pthread_create(&tid[2], 0, thread_write, aux); + + for (i = 0; i < 3; ++i) pthread_join(tid[i], 0); +} + +/////////////////////////////////////////////////////////////// + + + static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) { if (opt0->a) { // matching score is changed @@ -150,13 +316,28 @@ int main_mem(int argc, char *argv[]) mem_pestat_t pes[4]; ktp_aux_t aux; +#ifdef DEBUG_FILE_OUTPUT + open_debug_files(); +#endif + +#ifdef SHOW_PERF +#if USE_RDTSC + uint64_t tmp_time = __rdtsc(); + sleep(1); + proc_freq = __rdtsc() - tmp_time; +#else + proc_freq = 1000; +#endif +#endif + + PROF_START(all); memset(&aux, 0, sizeof(ktp_aux_t)); memset(pes, 0, 4 * sizeof(mem_pestat_t)); for (i = 0; i < 4; ++i) pes[i].failed = 1; aux.opt = opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:b:we")) >= 0) { + while ((c = 
getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:b:ge")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == '1') no_mt_io = 1; else if (c == 'x') mode = optarg; @@ -256,7 +437,7 @@ int main_mem(int argc, char *argv[]) __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); } else if (c == 'b') opt->batch_size = atoi(optarg) >> 1 << 1, opt->batch_size = opt->batch_size > 1 ? opt->batch_size : 256; - else if (c == 'w') + else if (c == 'g') opt->use_bwt = 1; else if (c == 'e') opt->skip_entire_match = 1; else return 1; @@ -325,7 +506,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " FR orientation only. [inferred]\n"); fprintf(stderr, " -u output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n"); fprintf(stderr, " -b INT batch size of reads to process at one time [%d].\n", opt->batch_size); - fprintf(stderr, " -w Use bwt index for seeding\n"); + fprintf(stderr, " -g Use bwt index for seeding\n"); fprintf(stderr, " -e Skip the second and third seeding steps for entire matching reads.\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); @@ -334,6 +515,9 @@ int main_mem(int argc, char *argv[]) return 1; } + if (opt->n_threads < 1) opt->n_threads = 1; + if (opt->batch_size < 1) opt->batch_size = 256; + if (mode) { if (strcmp(mode, "intractg") == 0) { if (!opt0.o_del) opt->o_del = 16; @@ -366,22 +550,48 @@ int main_mem(int argc, char *argv[]) } else update_a(opt, &opt0); bwa_fill_scmat(opt->a, opt->b, opt->mat); - aux.idx = bwa_idx_load_from_shm(argv[optind]); - if (aux.idx == 0) { - if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak - } else if (bwa_verbose >= 3) - fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__); - if (ignore_alt) + PROF_START(load_idx); + if (opt->use_bwt) { + 
aux.idx = bwa_idx_load_from_shm(argv[optind]); + if (aux.idx == 0) { + if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) + return 1; // FIXME: memory leak + } else if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__); + } else { // load hybrid-index + // 加载除了hyb之外,其他的必要部分 + char fn[MAX_PATH]; + FILE* fp = NULL; + uint64_t ref_len = 0; + sprintf(fn, "%s.ref-len", argv[optind]); + fp = xopen(fn, "r"); + err_check_false(fscanf(fp, "%ld", &ref_len), EOF); + err_fclose(fp); + + aux.idx = bwa_idx_load(argv[optind], BWA_IDX_BNS | BWA_IDX_PAC); + ////////////////////////////// + aux.idx->hyb = bwa_hyb_idx_load_from_shm(argv[optind]); + if (aux.idx->hyb == 0) { + aux.idx->hyb = bwa_hyb_idx_load_from_disk(argv[optind]); + } else { + aux.idx->is_shm = 1; + } + aux.idx->hyb->ref_len = ref_len; + } + + if (ignore_alt) for (i = 0; i < aux.idx->bns->n_seqs; ++i) aux.idx->bns->anns[i].is_alt = 0; + PROF_END(gprof[G_LOAD_IDX], load_idx); - ko = kopen(argv[optind + 1], &fd); + ko = kopen(argv[optind + 1], &fd); if (ko == 0) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]); return 1; } fp = gzdopen(fd, "r"); - aux.ks = kseq_init(fp); + start_async_read(fp); // 采用双buffer技术将读取和解压overlap + aux.ks = kseq_init(fp); if (optind + 2 < argc) { if (opt->flag&MEM_F_PE) { if (bwa_verbose >= 2) @@ -393,23 +603,52 @@ int main_mem(int argc, char *argv[]) return 1; } fp2 = gzdopen(fd2, "r"); - aux.ks2 = kseq_init(fp2); + start_async_read(fp2); + aux.ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } } - bwa_print_sam_hdr(aux.idx->bns, hdr_line); + aux.w = init_mem_worker(opt, aux.idx->bwt, aux.idx->hyb, aux.idx->bns, aux.idx->pac); + aux.data = (ktp_data_t*)calloc(2, sizeof(ktp_data_t)); + // allocate write buffer + aux.wbuf_size = 16777216; + aux.wbuf = (char*)malloc(aux.wbuf_size); + + bwa_print_sam_hdr(aux.idx->bns, hdr_line); aux.actual_chunk_size = fixed_chunk_size > 0? 
fixed_chunk_size : opt->chunk_size * opt->n_threads; - kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3); - free(hdr_line); - free(opt); - bwa_idx_destroy(aux.idx); - kseq_destroy(aux.ks); - err_gzclose(fp); kclose(ko); + + PROF_START(pipeline); + if (no_mt_io) { // 不同时读写 + kt_pipeline(2, process, &aux, 3); + } else { + new_pipeline(&aux); + } + PROF_END(gprof[G_PIPELINE], pipeline); + + // no need to free these + // free(hdr_line); + // free(opt); + // bwa_idx_destroy(aux.idx); + // kseq_destroy(aux.ks); + + stop_async_read(fp); + err_gzclose(fp); kclose(ko); if (aux.ks2) { - kseq_destroy(aux.ks2); - err_gzclose(fp2); kclose(ko2); + // kseq_destroy(aux.ks2); + stop_async_read(fp2); + err_gzclose(fp2); kclose(ko2); } - return 0; + PROF_END(gprof[G_ALL], all); + +#ifdef SHOW_PERF + display_stats(opt->n_threads); +#endif + +#ifdef DEBUG_FILE_OUTPUT + close_files(); +#endif + + return 0; } int main_fastmap(int argc, char *argv[]) @@ -447,7 +686,8 @@ int main_fastmap(int argc, char *argv[]) } fp = xzopen(argv[optind + 1], "r"); - seq = kseq_init(fp); + start_async_read(fp); + seq = kseq_init(fp); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; itr = smem_itr_init(idx->bwt); smem_config(itr, min_intv, max_len, max_intv); @@ -485,6 +725,7 @@ int main_fastmap(int argc, char *argv[]) smem_itr_destroy(itr); bwa_idx_destroy(idx); kseq_destroy(seq); - err_gzclose(fp); + stop_async_read(fp); + err_gzclose(fp); return 0; } diff --git a/hyb_bwa.c b/hyb_bwa.c index 4f4c3a1..074ce9f 100644 --- a/hyb_bwa.c +++ b/hyb_bwa.c @@ -38,6 +38,7 @@ #include "utils.h" #include "kvec.h" #include "hyb_idx.h" +#include "share_mem.h" #ifdef _DIVBWT @@ -219,6 +220,61 @@ int bwa_bwt2kmer(int argc, char* argv[]) { return 0; } +// 将原始的pac转换一下,从低到高存储 +void convert_to_hyb_pac(uint8_t* old_pac, uint64_t l_pac, const char* new_pac_fn) { +#define _gp(l) ((old_pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3) + const uint64_t kPacByteNum = l_pac / 4 + 1; + uint8_t* pac = 
(uint8_t*)calloc(l_pac, 1); + FILE* pacFp = fopen(new_pac_fn, "wb"); + uint8_t byte_bases = 0; + uint64_t i = 0; + uint8_t* p1; + for (; i + 3 < l_pac; i += 4) { + p1 = pac + (i >> 2); + byte_bases = _gp(i) | (_gp(i + 1) << 2) | (_gp(i + 2) << 4) | (_gp(i + 3) << 6); + *p1 = byte_bases; + } + byte_bases = 0; + p1 = pac + (i >> 2); + for (uint32_t j = 0; i < l_pac; ++i, ++j) { + byte_bases |= _gp(i) << j * 2; + } + *p1 = byte_bases; + + fwrite(pac, 1, kPacByteNum, pacFp); + uint8_t ct = 0; + if (l_pac % 4 == 0) { + ct = 0; + err_fwrite(&ct, 1, 1, pacFp); + } + ct = l_pac % 4; + err_fwrite(&ct, 1, 1, pacFp); + fclose(pacFp); +} + +// 将原pac文件转为hyb需要的格式(翻转byte) +int bwa_pac2hybpac(int argc, char* argv[]) { + if (optind + 1 > argc) { + fprintf(stderr, "Usage: bwa pac2hybpac \n\n"); + return 1; + } + char fn[MAX_PATH]; + FILE* fp; + uint8_t* old_pac = NULL; + uint64_t l_pac = 0; + // fprintf(stderr, "here-1\n"); + snprintf(fn, MAX_PATH, "%s.pac", argv[optind]); + _load_file_to_data(fn, old_pac); + sprintf(fn, "%s.ref-len", argv[optind]); + fp = xopen(fn, "r"); + err_check_false(fscanf(fp, "%ld", &l_pac), EOF); + err_fclose(fp); + sprintf(fn, "%s.hyb.pac", argv[optind]); + // fprintf(stderr, "here-2\n"); + convert_to_hyb_pac(old_pac, l_pac, fn); + return 0; +} + // 创建hybrid index,并保存到文件 int bwa_bwt2hyb(int argc, char* argv[]) { int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix); @@ -238,7 +294,7 @@ int bwa_bwt2hyb(int argc, char* argv[]) { } } if (optind + 1 > argc || error) { - fprintf(stderr, "Usage: bwa bwt2hyb [Options] \n\n"); + fprintf(stderr, "Usage: bwa bwt2hyb [Options] \n\n"); fprintf(stderr, "Options: -t INT number of threads for hybrid index building [%d]\n", num_threads); fprintf(stderr, "\n"); return 1; @@ -254,6 +310,93 @@ int bwa_bwt2hyb(int argc, char* argv[]) { return 0; } +// 尝试从share memory中加载hybrid index +HybridIndex* bwa_hyb_idx_load_from_shm(const char* idx_prefix) { + char fn[MAX_PATH]; + uint8_t* ref_bits = 
(uint8_t*)shm_get_index(strcat(strcpy(fn, idx_prefix), HYB_PAC_SUFFIX)); + uint8_t* sa = (uint8_t*)shm_get_index(strcat(strcpy(fn, idx_prefix), HYB_SA_SUFFIX)); + uint8_t* kmer_data = (uint8_t*)shm_get_index(strcat(strcpy(fn, idx_prefix), HYB_KMER_SUFFIX)); + uint8_t* index_data = (uint8_t*)shm_get_index(strcat(strcpy(fn, idx_prefix), HYB_DATA_SUFFIX)); + if (!ref_bits || !sa || !kmer_data || !index_data) { + return NULL; + } + HybridIndex* hyb = (HybridIndex*)calloc(1, sizeof(HybridIndex)); + hyb->ref_bits = ref_bits; + hyb->sa = sa; + hyb->kmer_data = kmer_data; + hyb->index_data = index_data; + return hyb; +} + +// 从硬盘中加载hybrid index +HybridIndex* bwa_hyb_idx_load_from_disk(const char* idx_prefix) { + char fn[MAX_PATH]; + FILE* fp = NULL; + struct stat st; + double sec_time; + +#define __load_hybrid_idx_code(suffix, data) \ + sec_time = realtime(); \ + sprintf(fn, "%s%s", idx_prefix, suffix); \ + err_check_true(stat(fn, &st), 0); \ + fp = xopen(fn, "r"); \ + data = (uint8_t*)malloc(st.st_size); \ + err_fread_noeof(data, 1, st.st_size, fp); \ + err_fclose(fp); \ + fprintf(stderr, "%s, %0.2f GB, %0.2f s\n", fn, (double)st.st_size / 1024 / 1024 / 1024, realtime() - sec_time); + + HybridIndex* hyb = (HybridIndex*)calloc(1, sizeof(HybridIndex)); + + __load_hybrid_idx_code(HYB_PAC_SUFFIX, hyb->ref_bits); + // load hyb byte-sa + __load_hybrid_idx_code(HYB_SA_SUFFIX, hyb->sa); + // load hyb kmer data + __load_hybrid_idx_code(HYB_KMER_SUFFIX, hyb->kmer_data); + // load hyb index data + __load_hybrid_idx_code(HYB_DATA_SUFFIX, hyb->index_data); + + return hyb; +} + +// 在共享内存中处理hybrid index +int main_shm_hyb(int argc, char* argv[]) { + char c; + int clear_shm = 0; + int list_shm = 0; + int error = 0; + while ((c = getopt(argc, argv, "dl")) >= 0) { + switch (c) { + case 'd': + clear_shm = 1; + break; + case 'l': + list_shm = 1; + break; + default: + error = 1; + break; + } + } + + // fprintf(stderr, "%d %d\n", optind, argc); + + if ((optind == argc && !clear_shm && 
!list_shm) || error) { + fprintf(stderr, "Usage: bwa hybshm [-d|-l] [idx_prefix]\n\n"); + fprintf(stderr, "Options: -d destroy all hyb indices in shared memory\n"); + fprintf(stderr, " -l list names of indices in shared memory\n"); + fprintf(stderr, "\n"); + return 1; + } + if (list_shm) { + return list_shm_hyb_indices(); + } else if (clear_shm) { + return shm_clear_hyb(); + } + return shm_keep_hyb(argv[optind]); +} + +////////////////////////////////////////////// for test ///////////////////////////////////// + // 创建正向的kmer uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) { uint64_t qbit = 0, i; diff --git a/hyb_idx.h b/hyb_idx.h index 6d8a7db..c08d504 100644 --- a/hyb_idx.h +++ b/hyb_idx.h @@ -59,7 +59,7 @@ typedef struct { uint8_t* for_bits; // 正向序列 2-bit编码 uint8_t* back_bits; // 反向互补序列 2-bit编码 int id; // for test; - char* seqstr; + // char* seqstr; } ReadSeq; typedef kvec_t(ReadSeq) ReadSeqArr; diff --git a/hyb_seeding_1.c b/hyb_seeding_1.c index e69de29..bce7a34 100644 --- a/hyb_seeding_1.c +++ b/hyb_seeding_1.c @@ -0,0 +1,314 @@ +#include "hyb_idx.h" +#include "profiling.h" + +static int handle_hits_1(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x, + int rx, int init_match_len, uint64_t ref_pos, const int min_seed_len, HybSeedArr* seeds, + uint64_t* seeds_cap) { + int left_match = 0, right_match = 0; + both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos, &left_match, + &right_match); + if (left_match + right_match >= min_seed_len) { + ref_pos = _rev_ref(hyb, ref_pos); + __add_seed_one_pos(seed, ref_pos - right_match + 1, x - right_match + 1, x + left_match + 1); + } + return MAX(x + left_match + 1, x - right_match + 1 + min_seed_len); +} + +static int handle_hits_2(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x, + int rx, int init_match_len, uint64_t sa_pos, const 
int min_seed_len, HybSeedArr* seeds, + uint64_t* seeds_cap, int tid) { + int left_match_arr[2] = {0}, right_match_arr[2] = {0}; + Range mr_arr[2] = {0}; + uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, sa_pos), hyb_sa_to_ref_pos(hyb->sa, sa_pos + 1)}; + int i = 0; + for (i = 0; i < 2; ++i) { + both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos_arr[i], + &left_match_arr[i], &right_match_arr[i]); + _set_range(mr_arr[i], x - right_match_arr[i] + 1, x + left_match_arr[i] + 1); + } + if (_range_equal(mr_arr[0], mr_arr[1])) { // 相等 + if (mr_arr[0].end - mr_arr[0].start >= min_seed_len) { // 正向搜索确定ref_pos的先后顺序 + uint8_t type_hits = 0; + uint64_t offset = 0; + get_kmer_data(hyb, read_seq->for_bits, mr_arr[0].start, &type_hits, &offset); + if (type_hits == 2) { + ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset); + ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, offset + 1); + __add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end); + kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); + } else { // 需要反向搜索确定ref + uint32_t seq_pos = mr_arr[0].start + HYB_KMER_LEN; + uint32_t hits = type_hits; + uint64_t sa_start = 0; + uint8_t cmp_ref = 0; + // PROF_START(seed_1); + get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, + &sa_start, &cmp_ref, tid); + // PROF_END(tprof[T_SEED_1_3_1][tid], seed_1); + ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, sa_start); + ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, sa_start + 1); + __add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end); + kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); + } + } + } else if (_range_cross(mr_arr[0], mr_arr[1])) { // 交叉 + if (mr_arr[0].start < mr_arr[1].start) { + __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start, + mr_arr[0].end); + __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - 
right_match_arr[1] + 1, mr_arr[1].start, + mr_arr[1].end); + } else { + __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start, + mr_arr[1].end); + __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start, + mr_arr[0].end); + } + } else { // 包含 + if (mr_arr[0].start < mr_arr[1].start || mr_arr[0].end > mr_arr[1].end) { + __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start, + mr_arr[0].end); + } else { + __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start, + mr_arr[1].end); + } + } + return MAX(MAX(mr_arr[0].end, mr_arr[1].end), MIN(mr_arr[0].start, mr_arr[1].start) + min_seed_len); +} + +static int handle_hits_much(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, int x, + const int min_seed_len, HybSeedArr* seeds, uint64_t* seeds_cap, int tid) { + int max_reach = x + HYB_KMER_LEN; + int right_match = 0; + uint8_t type_hits = 0; + uint64_t offset = 0; + uint64_t ref_pos = 0; + uint32_t seq_pos = x + HYB_KMER_LEN; + uint32_t hits = 0; + uint64_t sa_start = 0; + uint8_t cmp_ref = 0; + int i = 0; + + get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset); + if (type_hits == 2) { + int match_end[2] = {0}; + uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)}; + for (i = 0; i < 2; ++i) { + right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, + ref_pos_arr[i], &right_match); + match_end[i] = x + right_match; + } + max_reach = MAX(match_end[0], match_end[1]); + if (max_reach - x >= min_seed_len) { + if (match_end[0] == match_end[1]) { + __add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach); + kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); + } else if (match_end[0] > match_end[1]) { + __add_seed_one_pos(seed, ref_pos_arr[0], x, 
match_end[0]); + } else { + __add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]); + } + } + } else { + hits = type_hits; + // PROF_START(seed_1); + get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start, + &cmp_ref, tid); + // PROF_END(tprof[T_SEED_1_3_1][tid], seed_1); + // tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++; + if (seq_pos == read_range->end || !cmp_ref) { + max_reach = seq_pos; + if (max_reach - x >= min_seed_len) { + __add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach); + int i = 0; + for (i = 1; i < hits; ++i) { + kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i)); + } + } + } else { + ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start); + right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos, + &right_match); + max_reach = x + right_match; + if (right_match >= min_seed_len) { + __add_seed_one_pos(seed, ref_pos, x, max_reach); + } + } + } + return MAX(max_reach, x + min_seed_len); +} + +int seeding_from_start(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len, + HybSeedArr* seeds, int tid) { + // PROF_START(seed_1); + uint64_t seeds_m = seeds->m; + uint64_t* seeds_cap = &seeds_m; // 记录当前seeds的长度, 如果扩容,则需要初始化ref_pos_arr.m, n, a 为0 + int max_reach = read_range->start + HYB_KMER_LEN; // 返回的结果,最远匹配的read的位置 + int x = read_range->start; // 从read_range的起始位置开始匹配 + uint64_t i = 0; + int right_match = 0; + uint8_t type_hits = 0; + uint64_t offset = 0; + uint64_t ref_pos = 0; + get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset); + // PROF_END(tprof[T_SEED_1_0][tid], seed_1); + if (type_hits == 0) { + // tdat[TD_SEED_1_0][tid]++; + } else if (type_hits == 1) { + // tdat[TD_SEED_1_1][tid]++; + // PROF_START(seed_1); + right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, offset, 
+ &right_match); + max_reach = x + right_match; + if (max_reach - x >= min_seed_len) { + __add_seed_one_pos(seed, offset, x, max_reach); + seed->first_len = HYB_KMER_LEN; + } + // PROF_END(tprof[T_SEED_1_1][tid], seed_1); + } else if (type_hits == 2) { + // tdat[TD_SEED_1_2][tid]++; + // PROF_START(seed_1); + int match_end[2] = {0}; + uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)}; + for (i = 0; i < 2; ++i) { + right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, + ref_pos_arr[i], &right_match); + match_end[i] = x + right_match; + } + max_reach = MAX(match_end[0], match_end[1]); + if (max_reach - x >= min_seed_len) { + if (match_end[0] == match_end[1]) { + __add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach); + kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); + seed->first_len = match_end[0]; + // seed->first_len = HYB_KMER_LEN; + } else if (match_end[0] > match_end[1]) { + __add_seed_one_pos(seed, ref_pos_arr[0], x, match_end[0]); + seed->first_len = match_end[1]; + // seed->first_len = HYB_KMER_LEN; + } else { + __add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]); + seed->first_len = match_end[0]; + // seed->first_len = HYB_KMER_LEN; + } + } + // PROF_END(tprof[T_SEED_1_2][tid], seed_1); + } else { + // tdat[TD_SEED_1_3][tid]++; + uint32_t seq_pos = x + HYB_KMER_LEN; + uint32_t hits = type_hits; + uint64_t sa_start = 0; + uint8_t cmp_ref = 0; + // PROF_START(seed_1); + get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start, + &cmp_ref, tid); + // PROF_END(tprof[T_SEED_1_3_1][tid], seed_1); + if (seq_pos == read_range->end || !cmp_ref) { + max_reach = seq_pos; + if (max_reach - x >= min_seed_len) { + __add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach); + if (hits == 1) + seed->first_len = seq_pos - x; + for (i = 1; i < hits; ++i) { + kv_push(uint64_t, 
seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i)); + } + } + } else { + ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start); + right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos, + &right_match); + max_reach = x + right_match; + if (right_match >= min_seed_len) { + __add_seed_one_pos(seed, ref_pos, x, max_reach); + seed->first_len = seq_pos - x; + } + } + // PROF_END(tprof[T_SEED_1_3][tid], seed_1); + } + // PROF_END(tprof[T_SEED_1_ALL][tid], seed_1); + // PROF_END(tprof[T_SEED_1_1][tid], seed_1); + return MAX(max_reach, x + min_seed_len); +} + +////////////// +// 用hybrid-index来寻找smem(seeding-1),要求种子 hits >= min_hits_thres(>0) +void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len, + HybSeedArr* seeds, int tid) { + int x = seeding_from_start(hyb, read_seq, read_range, min_seed_len, seeds, tid); + int rx = 0; // 对应的反向位置 + Range rr = {read_seq->len - read_range->end, read_seq->len - read_range->start}; + uint64_t seeds_m = seeds->m; + uint64_t* seeds_cap = &seeds_m; // 记录当前seeds的长度, 如果扩容,则需要初始化ref_pos_arr.m, n, a 为0 + uint8_t type_hits = 0; + uint64_t offset = 0; + int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1; + // PROF_START(seed_1); + while (x < read_range->end) { + // 反向搜索, 此时x距离start超过16 + rx = read_seq->len - x - 1; // 反向位置, 因为正向包含x, 所以这里需要减1 + // PROF_START(seed_1); + get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset); + // PROF_END(tprof[T_SEED_1_0][tid], seed_1); + if (type_hits == 0) { + x += extra_tend; + // tdat[TD_SEED_1_0][tid]++; + } else if (type_hits == 1) { + // tdat[TD_SEED_1_1][tid]++; + // PROF_START(seed_1); + x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap); + // PROF_END(tprof[T_SEED_1_1][tid], seed_1); + } else if (type_hits == 2) { + // tdat[TD_SEED_1_2][tid]++; + // PROF_START(seed_1); + x = handle_hits_2(hyb, read_seq, 
read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap, tid); + // PROF_END(tprof[T_SEED_1_2][tid], seed_1); + } else { + // tdat[TD_SEED_1_3][tid]++; + // PROF_START(seed_1); + uint32_t seq_pos = rx + HYB_KMER_LEN; + uint32_t hits = type_hits; + uint64_t sa_start = 0; + uint8_t cmp_ref = 0; + get_leaf_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, &seq_pos, &hits, &sa_start, + &cmp_ref, tid); + // PROF_END(tprof[T_SEED_1_3_1][tid], seed_1); + // tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++; + // tdat[TD_SEED_1_0][tid]++; + // if (hits == 1) { + // tdat[TD_SEED_1_1][tid]++; + // } else if (hits == 2) { + // tdat[TD_SEED_1_2][tid]++; + // } else if (hits == 3) { + // tdat[TD_SEED_1_3][tid]++; + // } else if (hits == 4) { + // tdat[TD_SEED_1_4][tid]++; + // } else { + // tdat[TD_SEED_1_5][tid]++; + // } + if (seq_pos == rr.end || !cmp_ref) { + if (hits == 1) { + // PROF_START(seed_1); + x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start), + min_seed_len, seeds, seeds_cap); + // PROF_END(tprof[T_SEED_1_3_2][tid], seed_1); + } else if (hits == 2) { + // PROF_START(seed_1); + x = handle_hits_2(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, sa_start, min_seed_len, seeds, + seeds_cap, tid); + // PROF_END(tprof[T_SEED_1_3_3][tid], seed_1); + } else { + // PROF_START(seed_1); + x = handle_hits_much(hyb, read_seq, read_range, x + rx - seq_pos + 1, min_seed_len, seeds, seeds_cap, tid); + // PROF_END(tprof[T_SEED_1_3_4][tid], seed_1); + } + } else { // hits == 1 + // PROF_START(seed_1); + x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start), + min_seed_len, seeds, seeds_cap); + // PROF_END(tprof[T_SEED_1_3_5][tid], seed_1); + } + // PROF_END(tprof[T_SEED_1_3][tid], seed_1); + } + } + // PROF_END(tprof[T_SEED_1_ALL][tid], seed_1); + // PROF_END(tprof[T_SEED_1_0][tid], seed_1); +} diff --git 
a/hyb_seeding_2.c b/hyb_seeding_2.c index e69de29..a3c4f74 100644 --- a/hyb_seeding_2.c +++ b/hyb_seeding_2.c @@ -0,0 +1,208 @@ +#include "hyb_idx.h" +#include "profiling.h" + +#define CALC_STAT 0 +// 需要给定初始化的hits和seq_pos +static void get_min_hits_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits, + uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) { + uint8_t cmp_ref_val = 0; + int is_head_node = 1; + uint8_t* cmp_ref = &cmp_ref_val; + uint8_t* prev_addr = idata; + uint32_t prev_seq_pos = *seq_pos_p; + uint32_t prev_hits = *hits_p; + uint64_t prev_sa_start = *sa_start_p; + uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); +#if CALC_STAT + if (next_addr != NULL) { + // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); + uint64_t dist = next_addr - prev_addr; + if (dist < 32) + gdat[0]++; + else if (dist < 64) + gdat[1]++; + else if (dist < 128) + gdat[2]++; + else + gdat[3]++; + } +#endif + while (next_addr != NULL && *hits_p >= min_hits && *seq_pos_p < seq_end) { + prev_addr = next_addr; + prev_seq_pos = *seq_pos_p; + prev_hits = *hits_p; + prev_sa_start = *sa_start_p; + next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); + is_head_node = 0; +#if CALC_STAT + if (next_addr != NULL) { + // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); + uint64_t dist = next_addr - prev_addr; + if (dist < 32) + gdat[0]++; + else if (dist < 64) + gdat[1]++; + else if (dist < 128) + gdat[2]++; + else + gdat[3]++; + } +#endif + } + if (*hits_p < min_hits) { + *seq_pos_p = prev_seq_pos; + *hits_p = prev_hits; + *sa_start_p = prev_sa_start; + next_addr = prev_addr; + parse_one_hyb_node_min_hits(next_addr, seq_bits, seq_bp, seq_end, min_hits, is_head_node, seq_pos_p, sa_start_p, hits_p, + tid); + } +} + +// for seeding-2 , 先反向,后正向 +int hyb_second_seeding(const HybridIndex* 
hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start, + int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end, + const int min_seed_len, HybSeedArr* seeds, int tid) { + uint64_t seeds_m = seeds->m; + uint64_t* seeds_cap = &seeds_m; + int pivot = (seed_start + seed_end) >> 1; + int x = MAX(MAX(pivot, read_start + min_seed_len - 1), pre_pivot); + int rx = 0; + Range fr = {read_start, read_end}; + Range rr = {read_seq->len - read_end, read_seq->len - read_start}; + Range* read_range = &fr; + uint8_t type_hits = 0; + uint64_t offset = 0; + int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1; + int next_pivot = x; + int cur_left = 0; + int old_n = seeds->n; + int i = 0; + // PROF_START(seed_2); +#if 1 + if (pre_end > pre_start && seeds->a[pre_end - 1].seed_end > pivot) { + for (i = pre_start; i < pre_end; ++i) { + HybSeed* seed = &kv_A(*seeds, i); + if (seed->seed_end > pivot) { + __check_add_seed(new_seed); + seed = &kv_A(*seeds, i); + __copy_seed(*seed, *new_seed); + } + } + } + // PROF_END(tprof[T_SEED_2_0][tid], seed_2); +#endif + while (cur_left <= pivot && x < fr.end) { + next_pivot = x; + rx = read_seq->len - x - 1; // 反向位置, 因为正向包含x, 所以这里需要减1 + // PROF_START(seed_2); + get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset); + // PROF_END(tprof[T_SEED_2_0][tid], seed_2); + if (type_hits == 0) { + cur_left = x - HYB_KMER_LEN + 2; + x += extra_tend; + } else if (type_hits == 1) { // min_hits肯定大于1 + cur_left = x - HYB_KMER_LEN + 2; + x += extra_tend; + } else if (type_hits == 2) { + // PROF_START(seed_2); + if (min_hits > 2) { + cur_left = x - HYB_KMER_LEN + 2; + x += extra_tend; + } else { + uint64_t ref_pos_arr[2]; + int left_match_arr[2] = {0}, right_match_arr[2] = {0}; + Range mr_arr[2] = {0}; + int new_x = x - HYB_KMER_LEN + 1; + get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset); + ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset); + ref_pos_arr[1] = 
hyb_sa_to_ref_pos(hyb->sa, offset + 1); + for (i = 0; i < 2; ++i) { + if (first_ref + new_x - seed_start == ref_pos_arr[i]) { + left_match_arr[i] = new_x - seed_start; + right_match_arr[i] = seed_end - new_x; + } else { + both_end_match(hyb, read_seq->len, &fr, read_seq->for_bits, read_seq->back_bits, new_x, HYB_KMER_LEN, + ref_pos_arr[i], &left_match_arr[i], &right_match_arr[i]); + } + _set_range(mr_arr[i], new_x - left_match_arr[i], new_x + right_match_arr[i]); + } + Range sr = {MAX(mr_arr[0].start, mr_arr[1].start), MIN(mr_arr[0].end, mr_arr[1].end)}; + if (sr.end - sr.start >= min_seed_len && sr.start <= pivot) { + __add_seed_one_pos(seed, ref_pos_arr[0] - new_x + sr.start, sr.start, sr.end); + kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1] - new_x + sr.start); + } + cur_left = sr.start; + x = MAX(x + 1, MAX(sr.end, cur_left + min_seed_len)); + } + // PROF_END(tprof[T_SEED_2_1][tid], seed_2); + } else { + // PROF_START(seed_2); + if (type_hits <= HYB_HIT_THRESH && type_hits < min_hits) { + cur_left = x - HYB_KMER_LEN + 2; + x += extra_tend; + } else { + uint32_t seq_pos = rx + HYB_KMER_LEN; + uint32_t hits = type_hits; + uint64_t sa_start = 0; + // PROF_START(seed_2); + get_min_hits_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, min_hits, &seq_pos, + &hits, &sa_start, tid); + // PROF_END(tprof[T_SEED_2_2_0][tid], seed_2); + // tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++; + // forward search + int new_x = x - (seq_pos - rx) + 1; + if (new_x <= pivot) { + // PROF_START(seed_2); + get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset); + // PROF_END(tprof[T_SEED_2_2_1][tid], seed_2); + if (type_hits == 2) { + // PROF_START(seed_2); + int right_match = 0; + int match_end[2] = {0}; + uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)}; + for (i = 0; i < 2; ++i) { + if (first_ref + new_x - seed_start == ref_pos_arr[i]) { + right_match = seed_end - new_x; + } else { 
+ right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, new_x, + HYB_KMER_LEN, ref_pos_arr[i], &right_match); + } + match_end[i] = new_x + right_match; + } + seq_pos = MIN(match_end[0], match_end[1]); + if (seq_pos - new_x >= min_seed_len) { + __add_seed_one_pos(seed, ref_pos_arr[0], new_x, seq_pos); + kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); + } + // PROF_END(tprof[T_SEED_2_2_2][tid], seed_2); + } else { + hits = type_hits; + seq_pos = new_x + HYB_KMER_LEN; + // PROF_START(seed_2); + get_min_hits_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, fr.end, min_hits, + &seq_pos, &hits, &sa_start, tid); + // PROF_END(tprof[T_SEED_2_2_0][tid], seed_2); + // tdat[(seq_pos - new_x - HYB_KMER_LEN + 2) / 3][tid]++; + if (seq_pos - new_x >= min_seed_len) { + __add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), new_x, seq_pos); + for (i = 1; i < hits; ++i) { + kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i)); + } + } + // PROF_END(tprof[T_SEED_2_2_3][tid], seed_2); + } + } + cur_left = new_x; + x = MAX(seq_pos, cur_left + min_seed_len); + // x = seq_pos; + } + // PROF_END(tprof[T_SEED_2_2][tid], seed_2); + } + } + if (old_n < seeds->n) { + next_pivot = seeds->a[seeds->n - 1].seed_end; + } + // PROF_END(tprof[T_SEED_2_ALL][tid], seed_2); + return next_pivot; +} diff --git a/hyb_seeding_3.c b/hyb_seeding_3.c index e69de29..e926b94 100644 --- a/hyb_seeding_3.c +++ b/hyb_seeding_3.c @@ -0,0 +1,203 @@ +#include "hyb_idx.h" +#include "profiling.h" + +#define CALC_STAT 0 + +static void get_seed_end_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int seed_end, + uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) { + uint8_t cmp_ref_val = 0; + int is_head_node = 1; + uint8_t* cmp_ref = &cmp_ref_val; + uint8_t* prev_addr = idata; + uint32_t prev_seq_pos = *seq_pos_p; + uint32_t prev_hits = *hits_p; + uint64_t prev_sa_start = 
*sa_start_p; + uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); +#if CALC_STAT + if (next_addr != NULL) { + // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); + uint64_t dist = next_addr - prev_addr; + if (dist < 32) + gdat[0]++; + else if (dist < 64) + gdat[1]++; + else if (dist < 128) + gdat[2]++; + else + gdat[3]++; + } +#endif + while (next_addr != NULL && *hits_p > 1 && (*seq_pos_p < seed_end || *hits_p >= max_hits)) { + prev_addr = next_addr; + prev_seq_pos = *seq_pos_p; + prev_hits = *hits_p; + prev_sa_start = *sa_start_p; + next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); + is_head_node = 0; +#if CALC_STAT + if (next_addr != NULL) { + // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); + uint64_t dist = next_addr - prev_addr; + if (dist < 32) + gdat[0]++; + else if (dist < 64) + gdat[1]++; + else if (dist < 128) + gdat[2]++; + else + gdat[3]++; + } +#endif + } + uint32_t hold_seq_pos = *seq_pos_p; + uint32_t hold_hits = *hits_p; + uint64_t hold_sa_start = *sa_start_p; + + if (*seq_pos_p > seed_end && *hits_p < max_hits) { + // 检查前一个节点 + *seq_pos_p = prev_seq_pos; + *hits_p = prev_hits; + *sa_start_p = prev_sa_start; + next_addr = prev_addr; + parse_one_hyb_node_max_hits(next_addr, seq_bits, seq_bp, seq_end, max_hits, seed_end - prev_seq_pos, is_head_node, + seq_pos_p, sa_start_p, hits_p, tid); + if (*hits_p >= max_hits) { + *seq_pos_p = hold_seq_pos; + *hits_p = hold_hits; + *sa_start_p = hold_sa_start; + } + } +} + +// assume max_hits > 2 +void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range, + const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid) { + if (seeds_range->start == seeds_range->end) { + return; + } + uint64_t seeds_m = seeds->m; + uint64_t* seeds_cap = &seeds_m; + int new_seed_len = min_seed_len + 1; 
+ + int i = 0; + int right_match_arr[2] = {0}; + Range ff = *read_range; + uint8_t type_hits = 0; + uint64_t offset = 0; + + int seeds_i = seeds_range->start; + int x = read_range->start; + int x_end = x + new_seed_len; + int flag_found_x_end = 0; + int flag_i = 0; + // PROF_START(seed_3); + HybSeed s = kv_A(*seeds, seeds_i); + if (s.first_len > 0 && s.first_len < new_seed_len && s.seed_start == x && s.seed_end >= x_end && s.ref_pos_arr.n <= 2) { + __add_seed_one_pos(seed, s.ref_pos_arr.a[0], x, x_end); + if (s.ref_pos_arr.n == 2) + kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1]); + x = x_end; + } + while (x + min_seed_len < read_range->end) { + while (seeds_i < seeds_range->end && kv_A(*seeds, seeds_i).seed_end < x) ++seeds_i; + if (seeds_i == seeds_range->end) + break; + if (seeds->a[seeds_i].seed_start > x) { + x += new_seed_len; + continue; + } + x_end = x + new_seed_len; + flag_found_x_end = 0; + flag_i = 0; + for (i = seeds_i; i < seeds_range->end; ++i) { + HybSeed* s = &kv_A(*seeds, i); + if (s->seed_start >= x_end) + break; + if (s->seed_start <= x && s->seed_end >= x_end) { + flag_found_x_end = 1; // x_end点存在seed + flag_i = i; + break; + } + } + if (!flag_found_x_end) { + x = x_end; + continue; + } + + // PROF_START(seed_3); + get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset); + // PROF_END(tprof[T_SEED_3_0][tid], seed_3); + if (type_hits == 0) { + x += new_seed_len; + } else if (type_hits == 1) { + // PROF_START(seed_3); + __add_seed_one_pos(seed, offset, x, x_end); + x = x_end; + // PROF_END(tprof[T_SEED_3_1][tid], seed_3); + } else if (type_hits == 2) { + // PROF_START(seed_3); + HybSeed s = kv_A(*seeds, flag_i); + if (s.ref_pos_arr.n == 2) { + __add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end); + kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1] + x - s.seed_start); + } else { // 只有一个ref_pos + ff.end = x_end; + uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, 
offset + 1)}; + for (i = 0; i < 2; ++i) { + if (s.ref_pos_arr.a[0] + x - s.seed_start == ref_pos_arr[i]) { + right_match_arr[i] = MIN(s.seed_end - x, new_seed_len); + } else { + right_end_match(hyb, read_seq->len, &ff, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, + ref_pos_arr[i], &right_match_arr[i]); + } + } + if (right_match_arr[0] == right_match_arr[1]) { + if (right_match_arr[0] == new_seed_len) { + __add_seed_one_pos(seed, ref_pos_arr[0], x, x_end); + kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); + } + } else { + if (right_match_arr[0] == new_seed_len) { + __add_seed_one_pos(seed, ref_pos_arr[0], x, x_end); + } else if (right_match_arr[1] == new_seed_len) { + __add_seed_one_pos(seed, ref_pos_arr[1], x, x_end); + } + } + } + x = x_end; + // PROF_END(tprof[T_SEED_3_2][tid], seed_3); + } else { + uint32_t seq_pos = x + HYB_KMER_LEN; + uint32_t hits = type_hits; + uint64_t sa_start = 0; + // PROF_START(seed_3); + get_seed_end_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, max_hits, + x + new_seed_len, &seq_pos, &hits, &sa_start, tid); + // PROF_END(tprof[T_SEED_3_3_0][tid], seed_3); + // tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++; + if (seq_pos - x < new_seed_len) { + // PROF_START(seed_3); + HybSeed s = kv_A(*seeds, flag_i); + __add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end); + x = x_end; + // PROF_END(tprof[T_SEED_3_3_1][tid], seed_3); + } else { + // PROF_START(seed_3); + if (hits < max_hits) { + __add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, seq_pos); + int i = 0; + for (i = 1; i < hits; ++i) { + kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i)); + } + x = seq_pos; + } else { + x = seq_pos + 1; + } + // PROF_END(tprof[T_SEED_3_3_2][tid], seed_3); + } + + // PROF_END(tprof[T_SEED_3_3][tid], seed_3); + } + } + // PROF_END(tprof[T_SEED_3_ALL][tid], seed_3); +} diff --git a/hyb_utils.c b/hyb_utils.c index e69de29..821cd38 100644 
--- a/hyb_utils.c +++ b/hyb_utils.c @@ -0,0 +1,817 @@ +#include +#include +#include + +#include "hyb_idx.h" +#include "share_mem.h" +#include "utils.h" + +///////////////////////////////////////////////////// +// 使用hybrid-index的工具函数 + +// 加载hybrid index +HybridIndex* load_hybrid_idx(const char* prefix) { + HybridIndex* hyb = NULL; + hyb = (HybridIndex*)calloc(1, sizeof(HybridIndex)); + + // return hyb; + + int prefix_len = strlen(prefix); + char* fn = (char*)malloc(prefix_len + 30); + FILE* fp = NULL; + struct stat st; + +#define __load_hybrid_idx_code(suffix, data) \ + sprintf(fn, "%s%s", prefix, suffix); \ + err_check_true(stat(fn, &st), 0); \ + fp = xopen(fn, "r"); \ + data = (uint8_t*)malloc(st.st_size); \ + err_fread_noeof(data, 1, st.st_size, fp); \ + err_fclose(fp); + + // load ref-len + sprintf(fn, "%s.ref-len", prefix); + // fprintf(stderr, "fn: %s\n", fn); + fp = xopen(fn, "r"); + err_check_false(fscanf(fp, "%ld", &hyb->ref_len), EOF); + err_fclose(fp); + // fprintf(stderr, "ref-len: %ld\n", hyb->ref_len); + + char* kmer_suffix = ".hybrid.kmer"; + char* data_suffix = ".hybrid.data"; + // char *kmer_suffix = ".hybrid.learned.kmer"; + // char *data_suffix = ".hybrid.learned.data"; + +#if 0 + // shm_clear_hyb(); + // load 2-bit ref + __load_hybrid_idx_code(".hybrid.pac", hyb->ref_bits); + // load hyb byte-sa + __load_hybrid_idx_code(".hybrid.sa", hyb->sa); + // load hyb kmer data + __load_hybrid_idx_code(kmer_suffix, hyb->kmer_data); + // load hyb index data + __load_hybrid_idx_code(data_suffix, hyb->index_data); +#else + shm_keep_hyb(prefix); + // load 2-bit ref + hyb->ref_bits = (uint8_t*)shm_get_index(strcat(strcpy(fn, prefix), ".hybrid.pac")); + // load hyb byte-sa + hyb->sa = (uint8_t*)shm_get_index(strcat(strcpy(fn, prefix), ".hybrid.sa")); + // load hyb kmer data + hyb->kmer_data = (uint8_t*)shm_get_index(strcat(strcpy(fn, prefix), kmer_suffix)); + // load hyb index data + hyb->index_data = (uint8_t*)shm_get_index(strcat(strcpy(fn, prefix), 
data_suffix)); +#endif + // fprintf(stderr, "文件大小为: %ld 字节, %.2f GB\n", st.st_size, (double)st.st_size / (1024 * 1024 * 1024)); + return hyb; +} + +// 创建正向反向互补bits +void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* fs, uint8_t* rs) { + if (len > 0) { + uint8_t fbp = 0, rbp = 0; + int i = 0, j = 0, idxf = 0, idxr = 0; + for (; i + 3 < len; i += 4) { + fbp = (bs[i] & 3) | (bs[i + 1] & 3) << 2 | (bs[i + 2] & 3) << 4 | (bs[i + 3] & 3) << 6; + rbp = (3 - (bs[len - i - 1] & 3)) | (3 - (bs[len - i - 2] & 3)) << 2 | (3 - (bs[len - i - 3] & 3)) << 4 | + (3 - (bs[len - i - 4] & 3)) << 6; + fs[idxf++] = fbp; + rs[idxr++] = rbp; + } + fbp = 0; + rbp = 0; + for (; i < len; ++i, ++j) { + fbp |= (bs[i] & 3) << j * 2; + rbp |= (3 - (bs[len - i - 1] & 3)) << j * 2; + } + fs[idxf++] = fbp; + rs[idxr++] = rbp; + } +} + +// 将seq和ref正向比对,看最多有多少匹配的bp,seq和ref都是2-bit编码的 +inline int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len) { + if (seq_pos >= seq_end) + return 0; + int64_t max_match_len = MIN(ref_len - ref_pos, seq_end - seq_pos); + + int ref_odd = ref_pos & 3; + int seq_odd = seq_pos & 3; + int64_t i = seq_pos; + int64_t j = ref_pos; + int match_len = 0; +///////////// +#define __forward_match_code(first_len, first_ref, first_seq, ref_bits, seq_bits) \ + uint64_t bp32ref = first_ref; \ + uint64_t bp32seq = first_seq; \ + uint64_t cmp = bp32ref ^ bp32seq; \ + if (cmp > 0) \ + return MIN(__builtin_ctzll(cmp) >> 1, max_match_len); \ + int first_cmp_len = first_len; \ + match_len = MIN(first_cmp_len, max_match_len); \ + i += first_cmp_len; \ + j += first_cmp_len; \ + seq_odd = i & 3; \ + ref_odd = j & 3; \ + for (; i + 31 < seq_end; i += 32, j += 32, match_len += 32) { \ + bp32ref = ref_bits; \ + bp32seq = seq_bits; \ + cmp = bp32ref ^ bp32seq; \ + if (cmp > 0) \ + return MIN(match_len + (__builtin_ctzll(cmp) >> 1), max_match_len); \ + } \ + if (i < seq_end) { \ + bp32ref = ref_bits; \ + bp32seq = seq_bits; \ + cmp = 
bp32ref ^ bp32seq; \ + if (cmp > 0) \ + return MIN(match_len + MIN(__builtin_ctzll(cmp) >> 1, seq_end - i), max_match_len); \ + match_len = max_match_len; /*match_len += seq_end - i;*/ \ + } + ///////// + if (seq_odd < ref_odd) { // 调整到ref的整数字节 + __forward_match_code(32 - ref_odd, (*(uint64_t*)&ref[j >> 2]) >> (ref_odd << 1), + (*(uint64_t*)&seq[i >> 2]) << ((ref_odd - seq_odd) << 1) >> (ref_odd << 1), + (*(uint64_t*)&ref[j >> 2]), + seq[i >> 2] >> (seq_odd << 1) | (*(uint64_t*)&seq[(i >> 2) + 1]) << ((4 - seq_odd) << 1)); + } else if (seq_odd > ref_odd) { // 调整到seq的整数字节 + __forward_match_code(32 - seq_odd, (*(uint64_t*)&ref[j >> 2]) << ((seq_odd - ref_odd) << 1) >> (seq_odd << 1), + (*(uint64_t*)&seq[i >> 2]) >> (seq_odd << 1), + ref[j >> 2] >> (ref_odd << 1) | (*(uint64_t*)&ref[(j >> 2) + 1]) << ((4 - ref_odd) << 1), + (*(uint64_t*)&seq[i >> 2])); + } else { // 可以调整到相同的64位地址进行比较了 + __forward_match_code(32 - seq_odd, (*(uint64_t*)&ref[j >> 2]) >> (seq_odd << 1), + (*(uint64_t*)&seq[i >> 2]) >> (seq_odd << 1), (*(uint64_t*)&ref[j >> 2]), + (*(uint64_t*)&seq[i >> 2])); + } + + return MIN(match_len, max_match_len); +} + +// 将seq和ref反向比对,看最多有多少匹配的bp +inline int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos) { + if (seq_pos < seq_start) + return 0; + int64_t max_match_len = MIN(ref_pos + 1, seq_pos - seq_start + 1); + int64_t i = seq_pos; + int64_t j = ref_pos; + int seq_odd = 3 - (i & 3); + int ref_odd = 3 - (j & 3); + int match_len = 0; +///////////// +#define __backward_tail_code(last_code) \ + int ext_bp = (7 - (i >> 2)) << 2; \ + uint64_t bp32ref = *(uint64_t*)(ref + (j >> 2) - 8) >> ((4 - ref_odd) << 1) | (uint64_t)ref[j >> 2] \ + << ((ref_odd + 28) << 1); \ + uint64_t bp32seq = (*(uint64_t*)seq) << ((seq_odd + ext_bp) << 1); \ + uint64_t cmp = bp32ref ^ bp32seq; \ + if (cmp > 0) \ + return MIN(match_len + MIN(__builtin_clzll(cmp) >> 1, (int)i + 1 - seq_start), max_match_len); \ + last_code + +#define 
__backward_match_code(first_len, first_ref, first_seq, ref_bits, seq_bits) \ + uint64_t bp32ref = first_ref; \ + uint64_t bp32seq = first_seq; \ + uint64_t cmp = bp32ref ^ bp32seq; \ + if (cmp > 0) \ + return MIN(MIN(__builtin_clzll(cmp) >> 1, (int)i + 1 - seq_start), max_match_len); \ + int first_cmp_len = first_len; \ + match_len = MIN(first_cmp_len, max_match_len); \ + i -= first_cmp_len; \ + j -= first_cmp_len; \ + seq_odd = 3 - (i & 3); \ + ref_odd = 3 - (j & 3); \ + for (; i - 31 >= 0; i -= 32, j -= 32, match_len += 32) { \ + bp32ref = ref_bits; \ + bp32seq = seq_bits; \ + cmp = bp32ref ^ bp32seq; \ + if (cmp > 0) \ + return MIN(match_len + (__builtin_clzll(cmp) >> 1), max_match_len); \ + } \ + if (i >= seq_start) { \ + __backward_tail_code(match_len = max_match_len); \ + } + //////////// + if (i < 32) { // 只需要一次比较 + __backward_tail_code(return max_match_len); + } + if (seq_odd < ref_odd) { // 调整到ref的整数字节 + __backward_match_code( + 32 - ref_odd, (*(uint64_t*)&ref[(j >> 2) - 7]) << (ref_odd << 1), + (*(uint64_t*)&seq[(i >> 2) - 7]) >> ((ref_odd - seq_odd) << 1) << (ref_odd << 1), (*(uint64_t*)&ref[(j >> 2) - 7]), + (*(uint64_t*)&seq[(i >> 2) - 8] >> ((4 - seq_odd) << 1)) | ((uint64_t)seq[(i >> 2)] << ((seq_odd + 28) << 1))); + } else if (seq_odd > ref_odd) { // 调整到seq的整数字节 + __backward_match_code( + 32 - seq_odd, (*(uint64_t*)&ref[(j >> 2) - 7]) >> ((seq_odd - ref_odd) << 1) << (seq_odd << 1), + (*(uint64_t*)&seq[(i >> 2) - 7]) << (seq_odd << 1), + (*(uint64_t*)&ref[(j >> 2) - 8] >> ((4 - ref_odd) << 1)) | ((uint64_t)ref[(j >> 2)] << ((ref_odd + 28) << 1)), + (*(uint64_t*)&seq[(i >> 2) - 7])); + } else { // 可以调整到相同的64位地址进行比较了 + __backward_match_code(32 - seq_odd, (*(uint64_t*)&ref[(j >> 2) - 7]) << (seq_odd << 1), + (*(uint64_t*)&seq[(i >> 2) - 7]) << (seq_odd << 1), (*(uint64_t*)&ref[(j >> 2) - 7]), + (*(uint64_t*)&seq[(i >> 2) - 7])); + } + + return MIN(match_len, max_match_len); +} + +// 根据sa的行获取对应的ref position(小端模式) +uint64_t hyb_sa_to_ref_pos(uint8_t* 
sa_arr, uint64_t row) { + const uint64_t start_byte = ((row << 5) + row) >> 3; // 存储这个sa数据的起始字节 + uint64_t val = *(uint64_t*)(sa_arr + start_byte); + val = (val >> (row & 7)) & 8589934591ULL; // 33-bits mask + return val; +} + +#define __parse_node_start_no_addr(idata) \ + *cmp_ref = 1; \ + uint32_t seq_pos = *seq_pos_p; \ + uint8_t header = *idata; \ + idata += 1; \ + uint8_t node_type = (header >> 6) & 3; \ + uint8_t hits_neq = header >> 5 & 1; \ + uint32_t hits_bytes = ((header >> 3) & 3) + 1; \ + uint32_t off_bytes = header & 7; \ + uint32_t child_ptr_bytes = hits_bytes + off_bytes; + +// 解析hyb node初始化变量信息 +#define __parse_node_start_code(idata) \ + uint8_t* addr = NULL; \ + __parse_node_start_no_addr(idata) + +// 解析单一碱基序列节点 +#define __parse_path_node_code(path_len) \ + uint32_t path_len = (header & 1); \ + path_len = path_len << 8 | *idata; \ + idata += 1; \ + int match_len = forward_match_len(seq_bits, seq_pos, seq_end, idata, 0, path_len); \ + *seq_pos_p = seq_pos + match_len; \ + if (match_len == (int)path_len) { \ + addr = idata + (((path_len << 1) + 7) >> 3); \ + if (hits_neq) { \ + *sa_start_p += 1; \ + *hits_p -= 1; \ + } \ + } else \ + *cmp_ref = 0; + +// 解析正常hyb节点 +#define __parse_child_node_code(kmer_len, mark_bytes, int_type, kmer_code, bits_count, one) \ + uint8_t kmer = kmer_code; \ + int_type mark = *(int_type*)idata; \ + int_type child_num = mark & (one << kmer); \ + if (child_num) { \ + *seq_pos_p += kmer_len; \ + uint32_t nth_child = bits_count(mark & ((one << kmer) - 1)); \ + uint8_t has_next_child = bits_count(mark >> kmer >> 1); \ + if (*seq_pos_p >= HYB_MAX_SEQ_LEN) { \ + *cmp_ref = 0; \ + } \ + if (off_bytes == HYB_LEAF_NODE) { \ + *hits_p -= nth_child + hits_neq + has_next_child; \ + *sa_start_p += nth_child + hits_neq; \ + } else { \ + if (nth_child == 0) { \ + idata += mark_bytes; \ + uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; \ + addr = idata + has_next_child * child_ptr_bytes; \ + *hits_p = hits_start - 
hits_neq; \ + *sa_start_p += hits_neq; \ + } else { \ + idata += mark_bytes + (nth_child - 1) * child_ptr_bytes; \ + uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; \ + uint32_t child_offset = *(uint32_t*)(idata + hits_bytes) & ga_hybOffMask[off_bytes]; \ + addr = idata + child_offset + (has_next_child + 1) * child_ptr_bytes; \ + if (has_next_child) { \ + *hits_p = (*(uint32_t*)(idata + child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start; \ + } else { \ + *hits_p -= hits_start + hits_neq; \ + } \ + *sa_start_p += hits_start; \ + } \ + } \ + } else { \ + *cmp_ref = 0; \ + } + +// 当节点不能完全匹配时候,检查是否能匹配该节点包含的部分碱基序列 +#define __parse_part_node_code(kmer_len, mark_bytes, int_type, kmer_base_code, bits_range, bits_count, one) \ + uint8_t kmer_base = kmer_base_code; \ + int_type mark = *(int_type*)idata; \ + int_type kmer_mask = ((one << bits_range) - 1) << kmer_base; \ + int_type child_num = mark & kmer_mask; \ + if (child_num) { \ + *seq_pos_p += kmer_len; \ + int_type kmer_pre_mask = (one << kmer_base) - 1; \ + uint32_t nth_child = bits_count(mark & kmer_pre_mask); \ + uint8_t has_next_child = bits_count(mark >> kmer_base >> bits_range); \ + if (off_bytes == HYB_LEAF_NODE) { \ + *hits_p -= nth_child + hits_neq + has_next_child; \ + *sa_start_p += nth_child + hits_neq; \ + } else { \ + if (nth_child == 0) { \ + idata += mark_bytes; \ + uint32_t hits_start = hits_neq; \ + if (has_next_child) { \ + child_num = bits_count(child_num); \ + *hits_p = \ + (*(uint32_t*)(idata + (child_num - 1) * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start; \ + } else { \ + *hits_p -= hits_start; \ + } \ + *sa_start_p += hits_start; \ + } else { \ + idata += mark_bytes + (nth_child - 1) * child_ptr_bytes; \ + uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; \ + *sa_start_p += hits_start; \ + if (has_next_child) { \ + child_num = bits_count(child_num); \ + *hits_p = (*(uint32_t*)(idata + child_num * child_ptr_bytes) & 
ga_hybHitsMask[hits_bytes]) - hits_start; \ + } else { \ + *hits_p -= hits_start + hits_neq; \ + } \ + } \ + } \ + } \ + *cmp_ref = 0; + +// 解析节点主要代码 +#define __parse_hyb_node_code(return_code) \ + if (node_type == HYB_BP_PATH) { \ + __parse_path_node_code(path_len); \ + } else if (node_type == HYB_BP_1) { \ + __parse_child_node_code(1, 1, uint8_t, seq_bp[seq_pos], __builtin_popcount, 1); \ + } else if (node_type == HYB_BP_2) { \ + if (seq_pos + 1 < seq_end) { \ + __parse_child_node_code(2, 2, uint16_t, seq_bp[seq_pos] << 2 | seq_bp[seq_pos + 1], __builtin_popcount, 1); \ + if (!child_num) { \ + __parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); \ + } \ + } else { \ + __parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); \ + } \ + } else { \ + if (seq_pos + 2 < seq_end) { \ + __parse_child_node_code(3, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2], \ + __builtin_popcountll, 1ULL); \ + if (!child_num) { \ + __parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, \ + __builtin_popcountll, 1ULL); \ + if (!child_num) { \ + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); \ + } \ + } \ + } else if (seq_pos + 1 < seq_end) { \ + __parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll, \ + 1ULL); \ + if (!child_num) { \ + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); \ + } \ + } else { \ + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); \ + } \ + } \ + /*__builtin_prefetch(addr, 0, 3); */ \ + return_code +///////// + +// 解析第一个节点, 返回后续对应的节点地址 +uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, + uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid) { + if 
(*seq_pos_p == seq_end) + return NULL; + __parse_node_start_code(idata); + *sa_start_p = (*(uint64_t*)idata) & HYB_NODE_SA_MASK; + idata += 5; + if (*hits_p > HYB_HIT_THRESH) { // 更新hits + *hits_p = *((uint32_t*)idata) & ga_hybHitsMask[hits_bytes]; // hits数量 + idata += hits_bytes; + } + // __parse_hyb_node_code(return addr); + if (node_type == HYB_BP_PATH) { + __parse_path_node_code(path_len); + } else if (node_type == HYB_BP_1) { + __parse_child_node_code(1, 1, uint8_t, seq_bp[seq_pos], __builtin_popcount, 1); + } else if (node_type == HYB_BP_2) { + if (seq_pos + 1 < seq_end) { + __parse_child_node_code(2, 2, uint16_t, seq_bp[seq_pos] << 2 | seq_bp[seq_pos + 1], __builtin_popcount, 1); + if (!child_num) { + __parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); + } + } else { + __parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); + } + } else { + if (seq_pos + 2 < seq_end) { + //__parse_child_node_code(3, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2], + // __builtin_popcountll, 1ULL); + uint8_t kmer = seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2]; + uint64_t mark = *(uint64_t*)idata; + uint64_t child_num = mark & (1ULL << kmer); + if (child_num) { + *seq_pos_p += 3; + uint32_t nth_child = __builtin_popcountll(mark & ((1ULL << kmer) - 1)); + uint8_t has_next_child = __builtin_popcountll(mark >> kmer >> 1); + if (*seq_pos_p >= HYB_MAX_SEQ_LEN) { + *cmp_ref = 0; + } + if (off_bytes == HYB_LEAF_NODE) { + *hits_p -= nth_child + hits_neq + has_next_child; + *sa_start_p += nth_child + hits_neq; + } else { + if (nth_child == 0) { + idata += 8; + uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; + addr = idata + has_next_child * child_ptr_bytes; + *hits_p = hits_start - hits_neq; + *sa_start_p += hits_neq; + } else { + idata += 8 + (nth_child - 1) * child_ptr_bytes; + uint32_t hits_start = *(uint32_t*)idata & 
ga_hybHitsMask[hits_bytes]; + uint32_t child_offset = *(uint32_t*)(idata + hits_bytes) & ga_hybOffMask[off_bytes]; + addr = idata + child_offset + (has_next_child + 1) * child_ptr_bytes; + if (has_next_child) { + *hits_p = (*(uint32_t*)(idata + child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start; + } else { + *hits_p -= hits_start + hits_neq; + } + *sa_start_p += hits_start; + } + } + } else { + *cmp_ref = 0; + } + + if (!child_num) { + //__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, + //__builtin_popcountll, + // 1ULL); + uint8_t kmer_base = seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2; + uint64_t mark = *(uint64_t*)idata; + uint64_t kmer_mask = ((1ULL << 4) - 1) << kmer_base; + uint64_t child_num = mark & kmer_mask; + if (child_num) { + *seq_pos_p += 2; + uint64_t kmer_pre_mask = (1ULL << kmer_base) - 1; + uint32_t nth_child = __builtin_popcountll(mark & kmer_pre_mask); + uint8_t has_next_child = __builtin_popcountll(mark >> kmer_base >> 4); + if (off_bytes == HYB_LEAF_NODE) { + *hits_p -= nth_child + hits_neq + has_next_child; + *sa_start_p += nth_child + hits_neq; + } else { + if (nth_child == 0) { + idata += 8; + uint32_t hits_start = hits_neq; + if (has_next_child) { + child_num = __builtin_popcountll(child_num); + *hits_p = + (*(uint32_t*)(idata + (child_num - 1) * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - + hits_start; + } else { + *hits_p -= hits_start; + } + *sa_start_p += hits_start; + } else { + idata += 8 + (nth_child - 1) * child_ptr_bytes; + uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; + *sa_start_p += hits_start; + if (has_next_child) { + child_num = __builtin_popcountll(child_num); + *hits_p = (*(uint32_t*)(idata + child_num * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - + hits_start; + } else { + *hits_p -= hits_start + hits_neq; + } + } + } + } + *cmp_ref = 0; + if (!child_num) { + // __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, 
__builtin_popcountll, 1ULL); + uint8_t kmer_base = seq_bp[seq_pos] << 4; + uint64_t mark = *(uint64_t*)idata; + uint64_t kmer_mask = ((1ULL << 16) - 1) << kmer_base; + uint64_t child_num = mark & kmer_mask; + if (child_num) { + *seq_pos_p += 1; + uint64_t kmer_pre_mask = (1ULL << kmer_base) - 1; + uint32_t nth_child = __builtin_popcountll(mark & kmer_pre_mask); + uint8_t has_next_child = __builtin_popcountll(mark >> kmer_base >> 16); + if (off_bytes == HYB_LEAF_NODE) { + *hits_p -= nth_child + hits_neq + has_next_child; + *sa_start_p += nth_child + hits_neq; + } else { + if (nth_child == 0) { + idata += 8; + uint32_t hits_start = hits_neq; + if (has_next_child) { + child_num = __builtin_popcountll(child_num); + *hits_p = + (*(uint32_t*)(idata + (child_num - 1) * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - + hits_start; + } else { + *hits_p -= hits_start; + } + *sa_start_p += hits_start; + } else { + idata += 8 + (nth_child - 1) * child_ptr_bytes; + uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; + *sa_start_p += hits_start; + if (has_next_child) { + child_num = __builtin_popcountll(child_num); + *hits_p = (*(uint32_t*)(idata + child_num * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - + hits_start; + } else { + *hits_p -= hits_start + hits_neq; + } + } + } + } + *cmp_ref = 0; + } + } + } else if (seq_pos + 1 < seq_end) { + __parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll, + 1ULL); + if (!child_num) { + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); + } + } else { + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); + } + } + return addr; +} + +// 解析后续的正常节点 +uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, + uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid) { + if (*seq_pos_p == seq_end) + return NULL; 
+ __parse_node_start_code(idata); + // __parse_hyb_node_code(return addr); + + if (node_type == HYB_BP_PATH) { + __parse_path_node_code(path_len); + } else if (node_type == HYB_BP_1) { + __parse_child_node_code(1, 1, uint8_t, seq_bp[seq_pos], __builtin_popcount, 1); + } else if (node_type == HYB_BP_2) { + if (seq_pos + 1 < seq_end) { + __parse_child_node_code(2, 2, uint16_t, seq_bp[seq_pos] << 2 | seq_bp[seq_pos + 1], __builtin_popcount, 1); + if (!child_num) { + __parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); + } + } else { + __parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); + } + } else { + if (seq_pos + 2 < seq_end) { + //__parse_child_node_code(3, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2], + // __builtin_popcountll, 1ULL); + uint8_t kmer = seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2]; + uint64_t mark = *(uint64_t*)idata; + uint64_t child_num = mark & (1ULL << kmer); + if (child_num) { + *seq_pos_p += 3; + uint32_t nth_child = __builtin_popcountll(mark & ((1ULL << kmer) - 1)); + uint8_t has_next_child = __builtin_popcountll(mark >> kmer >> 1); + if (*seq_pos_p >= HYB_MAX_SEQ_LEN) { + *cmp_ref = 0; + } + if (off_bytes == HYB_LEAF_NODE) { + *hits_p -= nth_child + hits_neq + has_next_child; + *sa_start_p += nth_child + hits_neq; + } else { + if (nth_child == 0) { + idata += 8; + uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; + addr = idata + has_next_child * child_ptr_bytes; + *hits_p = hits_start - hits_neq; + *sa_start_p += hits_neq; + } else { + idata += 8 + (nth_child - 1) * child_ptr_bytes; + uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; + uint32_t child_offset = *(uint32_t*)(idata + hits_bytes) & ga_hybOffMask[off_bytes]; + addr = idata + child_offset + (has_next_child + 1) * child_ptr_bytes; + if (has_next_child) { + *hits_p = (*(uint32_t*)(idata + child_ptr_bytes) & 
ga_hybHitsMask[hits_bytes]) - hits_start; + } else { + *hits_p -= hits_start + hits_neq; + } + *sa_start_p += hits_start; + } + } + } else { + *cmp_ref = 0; + } + + if (!child_num) { + __parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll, + 1ULL); + if (!child_num) { + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); + } + } + } else if (seq_pos + 1 < seq_end) { + __parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll, + 1ULL); + if (!child_num) { + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); + } + } else { + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); + } + } + return addr; +} + +void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits, + int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid) { + if (*seq_pos_p == seq_end) + return; + uint8_t cmp_ref_val = 0; + uint8_t* cmp_ref = &cmp_ref_val; + __parse_node_start_no_addr(idata); + + if (is_head) { + *sa_start_p = (*(uint64_t*)idata) & HYB_NODE_SA_MASK; + idata += 5; + if (*hits_p > HYB_HIT_THRESH) { // 更新hits + *hits_p = *((uint32_t*)idata) & ga_hybHitsMask[hits_bytes]; // hits数量 + idata += hits_bytes; + } + } + + uint8_t* prev_idata = idata; + uint32_t prev_seq_pos = *seq_pos_p; + uint32_t prev_hits = *hits_p; + uint64_t prev_sa_start = *sa_start_p; + if (node_type == HYB_BP_2) { + __parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); + if (*hits_p < min_hits) { + *seq_pos_p = prev_seq_pos; + *hits_p = prev_hits; + *sa_start_p = prev_sa_start; + } + } else if (node_type == HYB_BP_3) { + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); + if (*hits_p < min_hits) { + *seq_pos_p = prev_seq_pos; + 
*hits_p = prev_hits; + *sa_start_p = prev_sa_start; + } else if (seq_pos + 1 < seq_end) { + uint32_t pp_seq_pos = prev_seq_pos; + uint32_t pp_hits = prev_hits; + uint64_t pp_sa_start = prev_sa_start; + prev_seq_pos = *seq_pos_p; + prev_hits = *hits_p; + prev_sa_start = *sa_start_p; + *seq_pos_p = pp_seq_pos; + *hits_p = pp_hits; + *sa_start_p = pp_sa_start; + + idata = prev_idata; // 恢复到上一个节点 + __parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll, + 1ULL); + if (*hits_p < min_hits) { + *seq_pos_p = prev_seq_pos; + *hits_p = prev_hits; + *sa_start_p = prev_sa_start; + } + } + } +} + +void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp, + int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid) { + if (*seq_pos_p == seq_end) + return; + uint8_t cmp_ref_val = 0; + uint8_t* cmp_ref = &cmp_ref_val; + __parse_node_start_no_addr(idata); + + if (is_head) { + *sa_start_p = (*(uint64_t*)idata) & HYB_NODE_SA_MASK; + idata += 5; + if (*hits_p > HYB_HIT_THRESH) { // 更新hits + *hits_p = *((uint32_t*)idata) & ga_hybHitsMask[hits_bytes]; // hits数量 + idata += hits_bytes; + } + } + + uint8_t* prev_idata = idata; + uint32_t prev_seq_pos = *seq_pos_p; + uint32_t prev_hits = *hits_p; + uint64_t prev_sa_start = *sa_start_p; + if (node_type == HYB_BP_2) { + __parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); + } else if (node_type == HYB_BP_3) { + if (min_bp == 2) { + __parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll, + 1ULL); + } else { + __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); + if (*hits_p >= max_hits) { + *seq_pos_p = prev_seq_pos; + *hits_p = prev_hits; + *sa_start_p = prev_sa_start; + idata = prev_idata; // 恢复到上一个节点 + __parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 
4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll, + 1ULL); + } + } + } else { // path node + if (min_bp > 0) + *seq_pos_p += min_bp; + } +} + +// 需要给定初始化的hits和seq_pos +#define CALC_STAT 0 +void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p, + uint64_t* sa_start_p, uint8_t* cmp_ref, int tid) { + uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); +#if CALC_STAT + uint8_t* prev_addr = idata; + if (next_addr != NULL) { + // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); + uint64_t dist = next_addr - prev_addr; + if (dist < 32) + gdat[0]++; + else if (dist < 64) + gdat[1]++; + else if (dist < 128) + gdat[2]++; + else + gdat[3]++; + } +#endif + while (next_addr != NULL && *hits_p > 1) { +#if CALC_STAT + prev_addr = next_addr; +#endif + next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); +#if CALC_STAT + if (next_addr != NULL) { + // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); + uint64_t dist = next_addr - prev_addr; + if (dist < 32) + gdat[0]++; + else if (dist < 64) + gdat[1]++; + else if (dist < 128) + gdat[2]++; + else + gdat[3]++; + } +#endif + } +} + +void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset) { + uint64_t kmer = _kmer_from_pos(seq_bits, kmer_pos); + uint8_t* kmer_data_addr = hyb->kmer_data + kmer * HYB_KMER_DATA_BYTES; + *type_hits = *kmer_data_addr & HYB_KMER_DATA_TYPE_MASK; + *offset = (*(uint64_t*)kmer_data_addr & HYB_KMER_DATA_MASK) >> HYB_KMER_DATA_TYPE_BITS; +} + +void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits, + int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match) { + if (ref_pos < hyb->ref_len) { + *right_match = forward_match_len(for_bits, kmer_start 
+ init_match_len, read_range->end, hyb->ref_bits, + ref_pos + init_match_len, hyb->ref_len); + } else { + ref_pos = (hyb->ref_len << 1) - 1 - ref_pos; + *right_match = backward_match_len(back_bits, seq_len - kmer_start - init_match_len - 1, seq_len - read_range->end, + hyb->ref_bits, ref_pos - init_match_len); + } + *right_match += init_match_len; // 包括kmer的长度 +} + +void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits, + int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match) { + if (ref_pos < hyb->ref_len) { + *left_match = backward_match_len(for_bits, kmer_start - 1, read_range->start, hyb->ref_bits, ref_pos - 1); + } else { + ref_pos = (hyb->ref_len << 1) - 1 - ref_pos; + *left_match = forward_match_len(back_bits, seq_len - kmer_start, seq_len - read_range->start, hyb->ref_bits, + ref_pos + 1, hyb->ref_len); + } +} + +void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits, + int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match) { + if (ref_pos < hyb->ref_len) { + *right_match = forward_match_len(for_bits, kmer_start + init_match_len, read_range->end, hyb->ref_bits, + ref_pos + init_match_len, hyb->ref_len); + *left_match = backward_match_len(for_bits, kmer_start - 1, read_range->start, hyb->ref_bits, ref_pos - 1); + } else { + ref_pos = (hyb->ref_len << 1) - 1 - ref_pos; + *right_match = backward_match_len(back_bits, seq_len - kmer_start - init_match_len - 1, seq_len - read_range->end, + hyb->ref_bits, ref_pos - init_match_len); + *left_match = forward_match_len(back_bits, seq_len - kmer_start, seq_len - read_range->start, hyb->ref_bits, + ref_pos + 1, hyb->ref_len); + } + *right_match += init_match_len; // 包括kmer的长度 +} diff --git a/kseq.h b/kseq.h index f3862c6..a2425e5 100644 --- a/kseq.h +++ b/kseq.h @@ -221,11 +221,11 @@ typedef struct __kstring_t { kstream_t *f; \ } 
kseq_t; -#define KSEQ_INIT2(SCOPE, type_t, __read) \ - KSTREAM_INIT(type_t, __read, 16384) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(SCOPE, type_t) \ - __KSEQ_READ(SCOPE) +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16777216) /* 16384 */ \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) diff --git a/ksw.h b/ksw.h index 5d45a67..f5626c5 100644 --- a/ksw.h +++ b/ksw.h @@ -3,6 +3,8 @@ #include +#include "utils.h" + #define KSW_XBYTE 0x10000 #define KSW_XSTOP 0x20000 #define KSW_XSUBO 0x40000 @@ -106,9 +108,12 @@ extern "C" { */ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend2_avx2(int qlen, const uint8_t* query, int tlen, const uint8_t* target, int is_left, int m, const int8_t* mat, int o_del, int e_del, + int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int* _qle, int* _tle, int* _gtle, int* _gscore, + int* _max_off, buf_t* buf); #ifdef __cplusplus -} + } #endif #endif diff --git a/ksw_extend2_avx2.c b/ksw_extend2_avx2.c new file mode 100644 index 0000000..4531cae --- /dev/null +++ b/ksw_extend2_avx2.c @@ -0,0 +1,816 @@ +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "debug.h" + +#define ELIMINATE_DIFF_1 +// #define ELIMINATE_DIFF_3 + +#define NO_VAL -1 + +#define SIMD_WIDTH 16 + +extern int ksw_extend2_avx2_u8(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, 
int e_del, + int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf); + +int ksw_extend2_origin(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, + int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); + +static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { + {0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 
0xffff, 0xffff, 0xffff, 0xffff} +}; + +#define permute_mask _MM_SHUFFLE(0, 1, 2, 3) + +// 初始化变量 +#define SIMD_INIT \ + int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ + __m256i zero_vec; \ + __m256i max_vec; \ + __m256i oe_del_vec; \ + __m256i oe_ins_vec; \ + __m256i e_del_vec; \ + __m256i e_ins_vec; \ + __m256i h_vec_mask[SIMD_WIDTH]; \ + zero_vec = _mm256_setzero_si256(); \ + oe_del_vec = _mm256_set1_epi16(-oe_del); \ + oe_ins_vec = _mm256_set1_epi16(-oe_ins); \ + e_del_vec = _mm256_set1_epi16(-e_del); \ + e_ins_vec = _mm256_set1_epi16(-e_ins); \ + __m256i match_sc_vec = _mm256_set1_epi16(a); \ + __m256i mis_sc_vec = _mm256_set1_epi16(-b); \ + __m256i amb_sc_vec = _mm256_set1_epi16(-1); \ + __m256i amb_vec = _mm256_set1_epi16(4); \ + for (i=0; i 0 && m >= max) { \ + for(j=beg, i=iend; j<=end; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { \ + __m256i h2_vec = _mm256_loadu_si256((__m256i*) (&hA2[j])); \ + __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \ + uint32_t mask = _mm256_movemask_epi8(vcmp); \ + if (mask > 0) { \ + int pos = SIMD_WIDTH - 1 - (( __builtin_clz(mask)) >> 1); \ + mj = j - 1 + pos; \ + mi = i - 1 - pos; \ + /*if (m >= max) fprintf(stderr, "%d %d %d %d %d %d %d\n", iend, beg, mi, mj, mask, pos, m);*/ \ + } \ + } \ + } + +// 每轮迭代后,交换数组 +#define SWAP_DATA_POINTER \ + int16_t * tmp=hA0; \ + hA0 = hA1; hA1 = hA2; hA2 = tmp; \ + tmp = eA1; eA1 = eA2; eA2 = tmp; \ + tmp = fA1; fA1 = fA2; fA2 = tmp; \ + tmp = mA1; mA1 = mA2; mA2 = tmp; + +static void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum) +{ +#ifdef DEBUG_FILE_OUTPUT + // 写到三个文件里,query.fa,target.fa,每行一个序列,info.txt,包含前缀得分h0,和长度信息qlen,tlen + FILE *query_f = gfq[fnum], + *target_f = gft[fnum], + *info_f = gfi[fnum]; + const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'}; + int i; + // 处理query + for (i = 0; i < qlen; ++i) + fprintf(query_f, "%c", seq_map[query[i]]); + fprintf(query_f, "\n"); + // 处理target + for (i = 0; i < tlen; ++i) + 
fprintf(target_f, "%c", seq_map[target[i]]); + fprintf(target_f, "\n"); + // 处理其他信息 + fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0); +#endif +} + +int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 + const uint8_t *query, // read碱基序列 + int tlen, // target length reference的长度 + const uint8_t *target, // reference序列 + int is_left, // 是不是向左扩展 + int m, // 碱基种类 (5) + const int8_t *mat, // 每个位置的query和target的匹配得分 m*m + int o_del, // deletion 错配开始的惩罚系数 + int e_del, // deletion extension的惩罚系数 + int o_ins, // insertion 错配开始的惩罚系数 + int e_ins, // insertion extension的惩罚系数SIMD_BTYES + int a, // 碱基match时的分数 + int b, // 碱基mismatch时的惩罚分数(正数) + int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 + int end_bonus, + int zdrop, + int h0, // 该seed的初始得分(完全匹配query的碱基数) + int *_qle, // 匹配得到全局最大得分的碱基在query的位置 + int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 + int *_gtle, // query全部匹配上的target的长度 + int *_gscore, // query的端到端匹配得分 + int *_max_off, // 取得最大得分时在query和reference上位置差的 最大值 + buf_t *buf) // 之前已经开辟过的缓存 +{ + // return ksw_extend2_origin(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off); + +#ifdef DEBUG_FILE_OUTPUT + //fprintf(gf[0], "%d\n", qlen); +#ifdef GET_DIFFERENT_EXTENSION_LENGTH + if (qlen <= 30) { + write_query_target_sequence(qlen, query, tlen, target, h0, 0); + } else if (qlen < 60) { + write_query_target_sequence(qlen, query, tlen, target, h0, 1); + } else if (qlen < 90) { + write_query_target_sequence(qlen, query, tlen, target, h0, 2); + } else { + write_query_target_sequence(qlen, query, tlen, target, h0, 3); + } +#endif +#endif + + if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf); + + int16_t *mA,*hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M + int16_t *seq, *ref; + uint8_t *mem; + int16_t 
*qtmem, *vmem; + int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; + int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int Dloop = tlen + qlen; // 循环跳出条件 + int span, beg1, end1; // 边界条件计算 + int col_size = qlen + 2 + SIMD_WIDTH; + int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍 + int mem_size = (seq_size + ref_size) * 2 + val_mem_size; + + SIMD_INIT; // 初始化simd用的数据 + + assert(h0 > 0); + + // allocate memory + //mem = malloc(mem_size); + + if (buf->m < mem_size) { + buf->m = mem_size; + buf->addr = (uint8_t *)realloc(buf->addr, mem_size); + } + mem = buf->addr; + + qtmem = (int16_t *)&mem[0]; + seq=&qtmem[0]; ref=&qtmem[seq_size]; + if (is_left) { + for (i=0; i>1); i+=SIMD_WIDTH) { + _mm256_storeu_si256((__m256i*)&vmem[i], zero_vec); + } + hA = &vmem[0]; + mA = &vmem[col_size * 3]; + eA = &vmem[col_size * 5]; + fA = &vmem[col_size * 7]; + + hA0 = &hA[0]; hA1 = &hA[col_size]; hA2 = &hA1[col_size]; + mA1 = &mA[0]; mA2 = &mA[col_size]; + eA1 = &eA[0]; eA2 = &eA[col_size]; + fA1 = &fA[0]; fA2 = &fA[col_size]; + + // adjust $w if it is too large + k = m * m; + // get the max score + for (i = 0, max = 0; i < k; ++i) max = max > mat[i]? max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); + max_ins = max_ins > 1? max_ins : 1; + w = w < max_ins? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1? max_del : 1; + w = w < max_del? w : max_del; // TODO: is this necessary? 
+ if (tlen < qlen) w = MIN(tlen - 1, w); + + // DP loop + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;; + max_off = 0; + beg = 1; end = qlen; + // init h0 + hA0[0] = h0; // 左上角 + + if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况 + if (w >= qlen) { max_ie = 0; gscore = 0; } + + int m_last=0; + int iend; + +#ifdef ELIMINATE_DIFF_1 + int midx = 1, icheck = 0, checkspecial = 1; + int m3 = 0, m2 = 0, m1 = 0; + //int marr[10] = {0}; + //int marr[b]; memset(marr, 0, 4 * b); +#endif + + //int print_flag = 0; //(qlen == 64 && tlen == 123); +#ifdef DEBUG_SW_EXTEND + int dii, djj; + int16_t ins[tlen + 1][qlen + 2]; + int16_t del[tlen + 1][qlen + 2]; + int16_t score[tlen + 1][qlen + 2]; + for (dii = 0; dii <= tlen; ++dii) + { + for (djj = 0; djj <= qlen; ++djj) + { + ins[dii][djj] = del[dii][djj] = score[dii][djj] = NO_VAL; + } + } + for (dii = 1; dii <= tlen; ++dii) + { + del[dii][0] = MAX(0, h0 - o_del - e_del * dii); + score[dii][0] = del[dii][0]; + } + for (djj = 1; djj <= qlen; ++djj) + { + ins[0][djj] = MAX(0, h0 - o_ins - e_ins * djj); + score[0][djj] = ins[0][djj]; + } + ins[0][0] = del[0][0] = score[0][0] = h0; +#endif + + for (D = 1; LIKELY(D < Dloop); ++D) { + // 边界条件一定要注意! 
tlen 大于,等于,小于 qlen时的情况 + if (D > tlen) { + span = MIN(Dloop-D, w); + beg1 = MAX(D-tlen+1, ((D-w) / 2) + 1); + } else { + span = MIN(D-1, w); + beg1 = MAX(1, ((D-w) / 2) + 1); + } + end1 = MIN(qlen, beg1+span); + + if (beg < beg1) beg = beg1; + if (end > end1) end = end1; + if (beg > end) break; // 不用计算了,直接跳出,否则hA2没有被赋值,里边是上一轮hA0的值,会出bug + + iend = D - (beg - 1); // ref开始计算的位置,倒序 + span = end - beg; + ibeg = iend - span - 1; // 0开始的ref索引位置 + + // 每一轮需要记录的数据 + int m = 0, mj = -1, mi = -1; + max_vec = zero_vec; + //if (print_flag) + //{ + //fprintf(stderr, "D: %d, iend: %d, jbeg: %d\n", D, iend, beg); + //} + // 要处理边界 + // 左边界 处理f (insert) + if (ibeg == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); m = hA1[end];} + // 上边界 + if (beg == 1) { hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); } + else if (D & 1) { + hA1[beg - 1] = 0; + hA2[beg - 1] = 0; + } + + for (j=beg, i=iend; j<=end+1-SIMD_WIDTH; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { + // 取数据 + SIMD_LOAD; + // 比对seq,计算罚分 + SIMD_CMP_SEQ; + // 计算 + SIMD_COMPUTE; + // 存储结果 + SIMD_STORE; + } + // 剩下的计算单元 + if (j <= end) { + // 取数据 + SIMD_LOAD; + // 比对seq,计算罚分 + SIMD_CMP_SEQ; + // 计算 + SIMD_COMPUTE; + // 去除多余计算的部分 + SIMD_REMOVE_EXTRA; + // 存储结果 + SIMD_STORE; + } + + SIMD_FIND_MAX; + +#ifdef ELIMINATE_DIFF_1 +// 用来解决与BSW结果不一样的第一种情况(左边界) +#if 0 + if (hA1[0] < b && checkspecial) { + int mi; + if (hA1[0] == b - 1) { + icheck = iend + 1; + } + for (mi = 0; mi < b - 1; ++mi) { + if (midx - mi > 0) + marr[mi] = MAX(marr[mi], hA2[midx - mi]); + } + midx += 1; + if (ibeg > icheck) + { + int stopCalc = 0; + for (mi = 0; mi < b - 1; ++mi) + { + stopCalc |= !marr[mi]; + } + if (stopCalc) + break; + else + checkspecial = 0; + } + } +#else + if (hA1[0] < 4 && checkspecial) { // b == 4 + if (hA1[0] == 3) { + icheck = iend + 1; + } else if (midx == 2) { + m2 = MAX(m2, hA2[midx - 1]); + } else { + m2 = MAX(m2, hA2[midx - 1]); + m1 = MAX(m1, hA2[midx - 2]); + } + m3 = MAX(m3, hA2[midx]); + midx += 1; + if (ibeg > icheck) + { + if (!m1 || !m2 
|| !m3) + break; + else + checkspecial = 0; + } + + //if (print_flag) { + //fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA2[midx + 1], hA2[midx + 2], hA2[midx + 3], midx); + //if (midx > 2) fprintf(stderr, "%d, %d, %d\n", hA2[midx-1], hA2[midx-2], hA2[midx-3]); + //fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, hA1: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA1[0], m1, m2, m3, midx); + //} + } +#endif +#endif + +#ifdef DEBUG_SW_EXTEND + for (djj = beg; djj <= end; ++djj) + { + dii = D - djj + 1; + ins[dii][djj] = fA2[djj]; + del[dii][djj] = eA2[djj]; + score[dii][djj] = hA2[djj]; + } + //if (print_flag) + //{ + //fprintf(stderr, "score: %d %d %d\n", hA2[beg], hA2[beg+1], hA2[beg+2]); + //} +#endif + + // 注意最后跳出循环j的值 + j = end + 1; + + if (j == qlen + 1) { + max_ie = gscore > hA2[qlen] ? max_ie : ibeg; + gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; + } + if (m == 0 && m_last==0) break; // 一定要注意,斜对角遍历和按列遍历的不同点 + //if (m == 0 && m_last < 2) break; + if (m > max) { + max = m, max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); + } else if (m == max && max_i >= mi && mj > max_j) { + max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); + } + else if (zdrop > 0 && mi > -1) { + if (mi - max_i > mj - max_j) { + if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break; + } else { + if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) break; + } + } + + // 调整计算的边界 + for (j = beg; LIKELY(j <= end); ++j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; } + beg = j; + for (j = end+1; LIKELY(j >= beg); --j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; else hA0[j-1]=0; } + end = j + 1 <= qlen? 
j + 1 : qlen; + + m_last = m; + // swap m, h, e, f + SWAP_DATA_POINTER; + } + +#ifdef DEBUG_FILE_OUTPUT +#ifdef DEBUG_SW_EXTEND + fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); + for (djj = 0; djj < qlen; ++djj) { + fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]); + } + fprintf(gf[0], "\n"); + fprintf(gf[1], "\n"); + fprintf(gf[2], "\n"); + for (dii = 0; dii <= tlen; ++dii) + { + if (dii > 0) { + fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]); + } else { + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); + } + for (djj = 0; djj <= qlen; ++djj) + { + fprintf(gf[0], "%-4d", score[dii][djj]); + fprintf(gf[1], "%-4d", ins[dii][djj]); + fprintf(gf[2], "%-4d", del[dii][djj]); + } + fprintf(gf[0], "\n"); + fprintf(gf[1], "\n"); + fprintf(gf[2], "\n"); + } +#endif +#endif + // free(mem); + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; + return max; +} + +typedef struct { + int32_t h, e; +} eh_t; + +int ksw_extend2_origin(int qlen, // 
query length 待匹配段碱基的query长度 + const uint8_t *query, // read碱基序列 + int tlen, // target length reference的长度 + const uint8_t *target, // reference序列 + int is_left, // 是不是向左扩展 + int m, // 碱基种类 (5) + const int8_t *mat, // 每个位置的query和target的匹配得分 m*m + int o_del, // deletion 错配开始的惩罚系数 + int e_del, // deletion extension的惩罚系数 + int o_ins, // insertion 错配开始的惩罚系数 + int e_ins, // insertion extension的惩罚系数 + int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 + int end_bonus, + int zdrop, + int h0, // 该seed的初始得分(完全匹配query的碱基数) + int *_qle, // 匹配得到全局最大得分的碱基在query的位置 + int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 + int *_gtle, // query全部匹配上的target的长度 + int *_gscore, // query的端到端匹配得分 + int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 +{ + eh_t *eh; // score array + int8_t *qp; // query profile + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + uint8_t *qmem, *ref, *seq; + assert(h0 > 0); + // allocate memory + qp = (int8_t *)malloc(qlen * m); + eh = (eh_t *)calloc(qlen + 1, 8); + qmem = (uint8_t *)malloc(qlen + tlen); + seq=(uint8_t*)&qmem[0]; ref=(uint8_t*)&qmem[qlen]; + if (is_left) { + for (i=0; i oe_ins? h0 - oe_ins : 0; + for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j) + eh[j].h = eh[j-1].h - e_ins; + // adjust $w if it is too large + k = m * m; + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i]? max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); + max_ins = max_ins > 1? max_ins : 1; + w = w < max_ins? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1? max_del : 1; + w = w < max_del? w : max_del; // TODO: is this necessary? 
+ //fprintf(stderr, "%d\n", w); + // DP loop + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; + max_off = 0; + beg = 0, end = qlen; + + //int print_flag = 0; //(qlen == 116 && tlen == 241); + //fprintf(stderr, "%d %d %d\n", print_flag, qlen, tlen); +#ifdef DEBUG_SW_EXTEND + int dii, djj; + int16_t ins[tlen + 1][qlen + 2]; + int16_t del[tlen + 1][qlen + 2]; + int16_t score[tlen + 1][qlen + 2]; + for (dii = 0; dii <= tlen; ++dii) + { + for (djj = 0; djj <= qlen; ++djj) + { + ins[dii][djj] = del[dii][djj] = score[dii][djj] = NO_VAL; + } + } + for (dii = 1; dii <= tlen; ++dii) + { + del[dii][0] = MAX(0, h0 - o_del - e_del * dii); + score[dii][0] = del[dii][0]; + } + for (djj = 1; djj <= qlen; ++djj) + { + ins[0][djj] = MAX(0, h0 - o_ins - e_ins * djj); + score[0][djj] = ins[0][djj]; + } + ins[0][0] = del[0][0] = score[0][0] = h0; +#endif + +#ifdef DEBUG_FILE_OUTPUT +#ifdef COUNT_CALC_NUM + int bsw_cal_num = 0; + int real_cal_num = 0; + for (i = 0; i < tlen; ++i) + { + int beg = MAX(0, i - w); + int end = MIN(qlen, i + w + 1); + if (beg >= end) break; + bsw_cal_num += end - beg; + } + fprintf(gf[0], "start\n%d\n", bsw_cal_num); +#endif +#endif + +#ifdef ELIMINATE_DIFF_3 + int prun_end = qlen; // for test diff_3 +#endif + + for (i = 0; LIKELY(i < tlen); ++i) { + int t, f = 0, h1, m = 0, mj = -1; + int8_t *q = &qp[ref[i] * qlen]; + // apply the band and the constraint (if provided) + if (beg < i - w) beg = i - w; + if (end > i + w + 1) end = i + w + 1; + if (end > qlen) end = qlen; // 没用 + // compute the first column + if (beg == 0) { + h1 = h0 - (o_del + e_del * (i + 1)); + if (h1 < 0) h1 = 0; + } else h1 = 0; + //m = h1; // 用来解决和VP-BSW结果不一样的第一种情况(左边界) + for (j = beg; LIKELY(j < end); ++j) { + +#ifdef DEBUG_FILE_OUTPUT +#ifdef COUNT_CALC_NUM + real_cal_num++; +#endif +#endif + +#ifdef DEBUG_SW_EXTEND + ins[i+1][j+1] = f; +#endif + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are 
computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape + eh_t *p = &eh[j]; + int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) + p->h = h1; // set H(i,j-1) for the next row + M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M",保证分值不小于0,sw和nw的区别 + h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 + h = h > f? h : f; +#ifdef ELIMINATE_DIFF_3 + if (j >= prun_end && h==0) break; // for test diff_3 +#endif + h1 = h; // save H(i,j) to h1 for the next column + +#ifdef DEBUG_SW_EXTEND + score[i+1][j+1] = h; +#endif + mj = m > h? mj : j; // record the position where max score is achieved + m = m > h? m : h; // m is stored at eh[mj+1] + t = M - oe_del; + t = t > 0? t : 0; + e -= e_del; +#ifdef DEBUG_SW_EXTEND + del[i + 1][j + 1] = e; +#endif + e = e > t? e : t; // computed E(i+1,j) + +#ifdef DEBUG_SW_EXTEND +// del[i+1][j+1] = e; +#endif + p->e = e; // save E(i+1,j) for the next row + t = M - oe_ins; + t = t > 0? t : 0; + f -= e_ins; + f = f > t? f : t; // computed F(i,j+1) + } + eh[end].h = h1; eh[end].e = 0; + if (j == qlen) { + max_ie = gscore > h1? max_ie : i; + gscore = gscore > h1? gscore : h1; + } + if (m == 0) break; + if (m > max) { + max = m, max_i = i, max_j = mj; + max_off = max_off > abs(mj - i)? max_off : abs(mj - i); + //fprintf(stderr, "%d %d %d %d\n", i, mj, max_off, m); + } else if (zdrop > 0) { + if (i - max_i > mj - max_j) { + if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break; + } else { + if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break; + } + } + // update beg and end for the next round + for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); // 这里为什么不考虑f(insert score) + beg = j; + for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j); +#ifdef ELIMINATE_DIFF_3 + prun_end = j + 2 < qlen ? 
j + 2 : qlen; end = qlen; // for test diff_3 +#else + end = j + 2 < qlen? j + 2 : qlen; +#endif + // beg = 0; end = qlen; // uncomment this line for debugging + // if (print_flag) { + // fprintf(stderr, "beg: %d; end: %d\n", beg, end); + // } + } +#ifdef DEBUG_FILE_OUTPUT +#ifdef DEBUG_SW_EXTEND + fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); + for (djj = 0; djj < qlen; ++djj) + { + fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]); + } + fprintf(gf[0], "\n"); + fprintf(gf[1], "\n"); + fprintf(gf[2], "\n"); + for (dii = 0; dii <= tlen; ++dii) + { + if (dii > 0) + { + fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]); + } + else + { + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); + } + for (djj = 0; djj <= qlen; ++djj) + { + fprintf(gf[0], "%-4d", score[dii][djj]); + fprintf(gf[1], "%-4d", ins[dii][djj]); + fprintf(gf[2], "%-4d", del[dii][djj]); + } + fprintf(gf[0], "\n"); + fprintf(gf[1], "\n"); + fprintf(gf[2], "\n"); + } +#endif +#endif + +#ifdef DEBUG_FILE_OUTPUT +#ifdef COUNT_CALC_NUM + fprintf(gf[0], "%d\nend\n", real_cal_num); +#endif 
+#endif + + free(eh); free(qp); free(qmem); + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; + return max; +} diff --git a/ksw_extend2_avx2_u8.c b/ksw_extend2_avx2_u8.c new file mode 100644 index 0000000..1b64e8b --- /dev/null +++ b/ksw_extend2_avx2_u8.c @@ -0,0 +1,454 @@ +#include +#include +#include +#include +#include +#include +#include "utils.h" + +#define ELIMINATE_DIFF_1 + +#define SIMD_WIDTH 32 + +static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { + {0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} +}; + +//static const uint8_t reverse_mask[SIMD_WIDTH] = {7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}; +#define permute_mask _MM_SHUFFLE(0, 1, 2, 3) +//const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3); +// 初始化变量 +#define SIMD_INIT \ + int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ + __m256i zero_vec; \ + __m256i max_vec; \ + __m256i oe_del_vec; \ + __m256i oe_ins_vec; \ + __m256i e_del_vec; \ + __m256i e_ins_vec; \ + __m256i h_vec_mask[SIMD_WIDTH]; \ + zero_vec = _mm256_setzero_si256(); \ + oe_del_vec = _mm256_set1_epi8(oe_del); \ + oe_ins_vec = _mm256_set1_epi8(oe_ins); \ + 
e_del_vec = _mm256_set1_epi8(e_del); \ + e_ins_vec = _mm256_set1_epi8(e_ins); \ + __m256i match_sc_vec = _mm256_set1_epi8(a); \ + __m256i mis_sc_vec = _mm256_set1_epi8(b); \ + __m256i amb_sc_vec = _mm256_set1_epi8(1); \ + __m256i amb_vec = _mm256_set1_epi8(4); \ + for (i = 0; i < SIMD_WIDTH; ++i) h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i])); + +/* + * e 表示当前ref的碱基被删除 + * f 表示当前seq的碱基插入 + * m 表示当前碱基匹配(可以相等,也可以不想等) + * h 表示最大值 + */ +// load向量化数据 +#define SIMD_LOAD \ + __m256i m1 = _mm256_loadu_si256((__m256i*) (&mA1[j])); \ + __m256i e1 = _mm256_loadu_si256((__m256i*) (&eA1[j])); \ + __m256i m1j1 = _mm256_loadu_si256((__m256i*) (&mA1[j-1])); \ + __m256i f1j1 = _mm256_loadu_si256((__m256i*) (&fA1[j-1])); \ + __m256i h0j1 = _mm256_loadu_si256((__m256i*) (&hA0[j-1])); \ + __m256i qs_vec = _mm256_loadu_si256((__m256i*) (&seq[j-1])); \ + __m256i ts_vec = _mm256_loadu_si256((__m256i*) (&ref[tlen - i])); + +// 比对ref和seq的序列,计算罚分 +#define SIMD_CMP_SEQ \ + __m256i match_mask_vec = _mm256_cmpeq_epi8(qs_vec, ts_vec); \ + __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \ + __m256i match_score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \ + __m256i q_amb_mask_vec = _mm256_cmpeq_epi8(qs_vec, amb_vec); \ + __m256i t_amb_mask_vec = _mm256_cmpeq_epi8(ts_vec, amb_vec); \ + __m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \ + __m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \ + mis_score_vec = _mm256_andnot_si256(amb_mask_vec, mis_score_vec); \ + mis_score_vec = _mm256_or_si256(amb_score_vec, mis_score_vec); \ + match_score_vec = _mm256_andnot_si256(amb_mask_vec, match_score_vec); + +// 向量化计算h, e, f, m +#define SIMD_COMPUTE \ + __m256i en_vec0 = _mm256_max_epu8(m1, oe_del_vec); \ + en_vec0 = _mm256_subs_epu8(en_vec0, oe_del_vec); \ + __m256i en_vec1 = _mm256_max_epu8(e1, e_del_vec); \ + en_vec1 = _mm256_subs_epu8(en_vec1, e_del_vec); \ + __m256i en_vec = _mm256_max_epu8(en_vec0, en_vec1); 
\ + __m256i fn_vec0 = _mm256_max_epu8(m1j1, oe_ins_vec); \ + fn_vec0 = _mm256_subs_epu8(fn_vec0, oe_ins_vec); \ + __m256i fn_vec1 = _mm256_max_epu8(f1j1, e_ins_vec); \ + fn_vec1 = _mm256_subs_epu8(fn_vec1, e_ins_vec); \ + __m256i fn_vec = _mm256_max_epu8(fn_vec0, fn_vec1); \ + __m256i mn_vec0 = _mm256_adds_epu8(h0j1, match_score_vec); \ + mn_vec0 = _mm256_max_epu8(mn_vec0, mis_score_vec); \ + mn_vec0 = _mm256_subs_epu8(mn_vec0, mis_score_vec); \ + __m256i mn_mask = _mm256_cmpeq_epi8(h0j1, zero_vec); \ + __m256i mn_vec = _mm256_andnot_si256(mn_mask, mn_vec0); \ + __m256i hn_vec0 = _mm256_max_epu8(en_vec, fn_vec); \ + __m256i hn_vec = _mm256_max_epu8(hn_vec0, mn_vec); + +// 存储向量化结果 +#define SIMD_STORE \ + max_vec = _mm256_max_epu8(max_vec, hn_vec); \ + _mm256_storeu_si256((__m256i*)&eA2[j], en_vec); \ + _mm256_storeu_si256((__m256i*)&fA2[j], fn_vec); \ + _mm256_storeu_si256((__m256i*)&mA2[j], mn_vec); \ + _mm256_storeu_si256((__m256i*)&hA2[j], hn_vec); + +// 去除多余的部分 +#define SIMD_REMOVE_EXTRA \ + en_vec = _mm256_and_si256(en_vec, h_vec_mask[end-j]); \ + fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end-j]); \ + mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end-j]); \ + hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end-j]); + +#define __max_32(xx) \ + do { \ + (xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 8)); \ + (xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 4)); \ + (xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 2)); \ + (xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 1)); \ + maxVal[0] = MAX(maxVal[0], maxVal[16]); \ + } while (0) + +// 找最大值和位置 +#define SIMD_FIND_MAX_NEW \ + uint8_t *maxVal = (uint8_t *)&(max_vec); \ + __max_32(max_vec); \ + m = MAX(m, maxVal[0]); \ + if (maxVal[0] > 0 && m >= max) { \ + for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) { \ + __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ + __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \ + uint32_t mask = _mm256_movemask_epi8(vcmp); \ + if 
(mask > 0) { \ + int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \ + mj = j - 1 + pos; \ + mi = i - 1 - pos; \ + } \ + } \ + } + +#define SIMD_FIND_MAX \ + uint8_t *maxVal = (uint8_t *)&max_vec; \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ + m = MAX(m, maxVal[0]); \ + if (maxVal[0] > 0 && m >= max) { \ + for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) { \ + __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ + __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \ + uint32_t mask = _mm256_movemask_epi8(vcmp); \ + if (mask > 0) { \ + int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \ + mj = j - 1 + pos; \ + mi = i - 1 - pos; \ + } \ + } \ + } + +// 每轮迭代后,交换数组 +#define SWAP_DATA_POINTER \ + uint8_t * tmp=hA0; \ + hA0 = hA1; hA1 = hA2; hA2 = tmp; \ + tmp = eA1; eA1 = eA2; eA2 = tmp; \ + tmp = fA1; fA1 = fA2; fA2 = tmp; \ + tmp = mA1; mA1 = mA2; mA2 = tmp; + + +int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长度 + const uint8_t *query, // read碱基序列 + int tlen, // target length reference的长度 + const uint8_t *target, // reference序列 + int is_left, // 是不是向左扩展 + int m, // 碱基种类 (5) + const int8_t *mat, // 每个位置的query和target的匹配得分 m*m + int o_del, // deletion 错配开始的惩罚系数 + int e_del, // deletion extension的惩罚系数 + int o_ins, // insertion 错配开始的惩罚系数 + int e_ins, // 
insertion extension的惩罚系数 + int a, // 碱基match时的分数 + int b, // 碱基mismatch时的惩罚分数(正数) + int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 + int end_bonus, + int zdrop, + int h0, // 该seed的初始得分(完全匹配query的碱基数) + int *_qle, // 匹配得到全局最大得分的碱基在query的位置 + int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 + int *_gtle, // query全部匹配上的target的长度 + int *_gscore, // query的端到端匹配得分 + int *_max_off, // 取得最大得分时在query和reference上位置差的 最大值 + buf_t *buf) // 之前已经开辟过的缓存 +{ + uint8_t *mA,*hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M + uint8_t *seq, *ref; + uint8_t *mem, *qtmem, *vmem; + int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; + int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int Dloop = tlen + qlen; // 循环跳出条件 + int span, beg1, end1; // 边界条件计算 + int col_size = qlen + 2 + SIMD_WIDTH; + int val_mem_size = (col_size * 9 + 31) >> 5 << 5; // 32字节的整数倍 + int mem_size = seq_size + ref_size + val_mem_size; + + SIMD_INIT; // 初始化simd用的数据 + + assert(h0 > 0); + + // allocate memory + //mem = malloc(mem_size); + if (buf->m < mem_size) { + buf->m = mem_size; + buf->addr = (uint8_t *)realloc(buf->addr, mem_size); + } + mem = buf->addr; + + qtmem = &mem[0]; + seq=(uint8_t*)&qtmem[0]; ref=(uint8_t*)&qtmem[seq_size]; + if (is_left) { + for (i=0; i mat[i]? max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); + max_ins = max_ins > 1? max_ins : 1; + w = w < max_ins? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1? max_del : 1; + w = w < max_del? w : max_del; // TODO: is this necessary? 
+ if (tlen < qlen) w = MIN(tlen - 1, w); + + // DP loop + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;; + max_off = 0; + beg = 1; end = qlen; + // init h0 + hA0[0] = h0; // 左上角 + + if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况 + if (w >= qlen) { max_ie = 0; gscore = 0; } + + int m_last=0; + int iend; +#ifdef ELIMINATE_DIFF_1 + int midx = 1, icheck = 0, checkspecial = 1; + int m3 = 0, m2 = 0, m1 = 0; + // int marr[10] = {0}; + // int marr[b]; memset(marr, 0, 4 * b); +#endif + for (D = 1; LIKELY(D < Dloop); ++D) { + // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 + if (D > tlen) { + span = MIN(Dloop-D, w); + beg1 = MAX(D-tlen+1, ((D-w) / 2) + 1); + } else { + span = MIN(D-1, w); + beg1 = MAX(1, ((D-w) / 2) + 1); + } + end1 = MIN(qlen, beg1+span); + + if (beg < beg1) beg = beg1; + if (end > end1) end = end1; + if (beg > end) break; // 不用计算了,直接跳出,否则hA2没有被赋值,里边是上一轮hA0的值,会出bug + + iend = D - (beg - 1); // ref开始计算的位置,倒序 + span = end - beg; + ibeg = iend - span - 1; // 0开始的ref索引位置 + + // 每一轮需要记录的数据 + int m = 0, mj = -1, mi = -1; + max_vec = zero_vec; + + // 要处理边界 + // 左边界 处理f (insert) + if (ibeg == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); m = hA1[end]; } + // 上边界 + if (beg == 1) { hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); } + else if (D & 1) { + hA1[beg - 1] = 0; + hA2[beg - 1] = 0; + } + + for (j=beg, i=iend; j<=end+1-SIMD_WIDTH; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { + // 取数据 + SIMD_LOAD; + // 比对seq,计算罚分 + SIMD_CMP_SEQ; + // 计算 + SIMD_COMPUTE; + // 存储结果 + SIMD_STORE; + } + // 剩下的计算单元 + if (j <= end) { + // 取数据 + SIMD_LOAD; + // 比对seq,计算罚分 + SIMD_CMP_SEQ; + // 计算 + SIMD_COMPUTE; + // 去除多余计算的部分 + SIMD_REMOVE_EXTRA; + // 存储结果 + SIMD_STORE; + } + + SIMD_FIND_MAX; + +#ifdef ELIMINATE_DIFF_1 +#if 0 + if (hA1[0] < b && checkspecial) { + int mi; + if (hA1[0] == b - 1) { + icheck = iend + 1; + } + for (mi = 0; mi < b - 1; ++mi) { + if (midx - mi > 0) + marr[mi] = MAX(marr[mi], hA2[midx - mi]); + } + midx += 1; + if (ibeg > icheck) + { + int stopCalc = 0; + for (mi = 0; 
mi < b - 1; ++mi) + { + stopCalc |= !marr[mi]; + } + if (stopCalc) + break; + else + checkspecial = 0; + } + } +#else + if (hA1[0] < 4 && checkspecial) + { // b == 4 + if (hA1[0] == 3) + { + icheck = iend + 1; + } + else if (midx == 2) + { + m2 = MAX(m2, hA2[midx - 1]); + } + else + { + m2 = MAX(m2, hA2[midx - 1]); + m1 = MAX(m1, hA2[midx - 2]); + } + m3 = MAX(m3, hA2[midx]); + midx += 1; + if (ibeg > icheck) + { + if (!m1 || !m2 || !m3) + break; + else + checkspecial = 0; + } + } +#endif +#endif + + // 注意最后跳出循环j的值 + j = end + 1; + + if (j == qlen + 1) { + max_ie = gscore > hA2[qlen] ? max_ie : ibeg; + gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; + } + if (m == 0 && m_last==0) break; // 一定要注意,斜对角遍历和按列遍历的不同点 + if (m > max) { + max = m, max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi)? max_off : abs(mj - mi); + } + else if (m == max && max_i >= mi && mj > max_j) { + max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); + } + else if (zdrop > 0 && mi > -1) { + if (mi - max_i > mj - max_j) { + if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break; + } else { + if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) break; + } + } + + // 调整计算的边界 + for (j = beg; LIKELY(j <= end); ++j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; } + beg = j; + for (j = end+1; LIKELY(j >= beg); --j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; else hA0[j-1]=0; } + end = j + 1 <= qlen? 
j + 1 : qlen; + + m_last = m; + // swap m, h, e, f + SWAP_DATA_POINTER; + } + + //free(mem); + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; + return max; +} diff --git a/main.c b/main.c index 33decfa..c4584c7 100644 --- a/main.c +++ b/main.c @@ -56,8 +56,10 @@ int main_maxk(int argc, char *argv[]); int bwa_bwt2kmer(int argc, char* argv[]); // create kmer-index from bwt int bwa_bwt2fullbytesa(int argc, char* argv[]); // create full byte-based Suffix-Array int bwa_bwt2hyb(int argc, char* argv[]); // create hybrid-index +int bwa_pac2hybpac(int argc, char* argv[]); // convert pac to hyb.pac int bwa_extract_sa(int argc, char* argv[]); // extract suffix array from non-sampled suffix array int bwa_extract_byte_sa(int argc, char* argv[]); // extract suffix array from non-sampled suffix array +int main_shm_hyb(int argc, char* argv[]); // manage hybrid index in shared memory int hyb_test(int argc, char* argv[]); // for test @@ -86,8 +88,10 @@ static int usage() fprintf(stderr, " bwt2fullbytesa generate SA(using byte array) from BWT and Occ\n"); fprintf(stderr, " bwt2kmer generate kmer hash index from bwt to accelarate the first 14 bases in seeding process.\n"); fprintf(stderr, " bwt2hyb generate hybrid index from BWT\n"); + fprintf(stderr, " pac2hybpac convert pac to hyb.pac\n"); fprintf(stderr, " extractsa generate sa from full byte suffix array\n"); fprintf(stderr, " extractbytesa generate byte sa from full byte suffix array\n"); + fprintf(stderr, " hybshm manage hybrid index in shared memory\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: To use BWA, you need to first index the genome with `bwa index'.\n" @@ -128,9 +132,11 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwt2fullbytesa") == 0) ret = bwa_bwt2fullbytesa(argc - 1, argv + 1); else if (strcmp(argv[1], "bwt2kmer") == 0) ret = bwa_bwt2kmer(argc - 1, argv + 1); else if 
(strcmp(argv[1], "bwt2hyb") == 0) ret = bwa_bwt2hyb(argc - 1, argv + 1); + else if (strcmp(argv[1], "pac2hybpac") == 0) ret = bwa_pac2hybpac(argc - 1, argv + 1); else if (strcmp(argv[1], "extractsa") == 0) ret = bwa_extract_sa(argc - 1, argv + 1); else if (strcmp(argv[1], "extractbytesa") == 0) ret = bwa_extract_byte_sa(argc - 1, argv + 1); - else if (strcmp(argv[1], "hybtest") == 0) ret = hyb_test(argc - 1, argv + 1); + else if (strcmp(argv[1], "hybshm") == 0) ret = main_shm_hyb(argc - 1, argv + 1); + else if (strcmp(argv[1], "hybtest") == 0) ret = hyb_test(argc - 1, argv + 1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/pemerge.c b/pemerge.c index 725885f..ef83320 100644 --- a/pemerge.c +++ b/pemerge.c @@ -222,8 +222,10 @@ int main_pemerge(int argc, char *argv[]) gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; pem_opt_t *opt; + int64_t seq_size = 0; + int m = 0; - opt = pem_opt_init(); + opt = pem_opt_init(); while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) { if (c == 'm') flag |= 1; else if (c == 'u') flag |= 2; @@ -269,10 +271,11 @@ int main_pemerge(int argc, char *argv[]) } memset(cnt, 0, 8 * (MAX_ERR+1)); - while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { - process_seqs(opt, n, bseq, cnt); - free(bseq); - } + bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2, 1, &seq_size, &m, &bseq); + while (n > 0) { + process_seqs(opt, n, bseq, cnt); + bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2, 1, &seq_size, &m, &bseq); + } fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); for (i = 1; i <= MAX_ERR; ++i) diff --git a/profiling.c b/profiling.c index b27d055..4ca819f 100644 --- a/profiling.c +++ b/profiling.c @@ -133,9 +133,11 @@ int display_stats(int nthreads) fprintf(stderr, "time_ksw_loop: %0.2lf s\n", gprof[G_KSW_LOOP] * 1.0 / proc_freq); fprintf(stderr, "time_ksw_end_loop: %0.2lf s\n", gprof[G_KSW_END_LOOP] * 1.0 / proc_freq); +#if SHOW_DATA_PERF fprintf(stderr, "seq 
num: %ld\n", gdat[0]); fprintf(stderr, "full num: %ld\n", gdat[1]); fprintf(stderr, "percent: %0.2lf%c\n", (double)gdat[1] / gdat[0] * 100, '%'); +#endif fprintf(stderr, "all_match_len: %ld\n", all_match_len); fprintf(stderr, "all_seq_num: %ld\n", all_seq_num); diff --git a/profiling.h b/profiling.h index 5a5db13..287eb5c 100644 --- a/profiling.h +++ b/profiling.h @@ -9,6 +9,8 @@ Date : 2024/04/06 #ifndef PROFILING_H_ #define PROFILING_H_ +#include +#include #include #define USE_RDTSC 1 diff --git a/share_mem.c b/share_mem.c new file mode 100644 index 0000000..1fe07fe --- /dev/null +++ b/share_mem.c @@ -0,0 +1,167 @@ +#include "share_mem.h" + +#include +#include +#include +#include + +#include "utils.h" + +#define SHM_NAME_LIST "/shm_hybbwa_name_list" +#define SHM_HYB_PREFIX "/shm_hybbwa_" + +#define SHM_NAME_LIST_SIZE 65535 + +static inline double get_GB(double bytes) { return bytes / 1024 / 1024 / 1024; } + +// 根据文件路径获取文件名 +const char* get_fn_from_path(const char* file_path) { + const char* fn = strrchr(file_path, '/'); + if (fn != NULL) + return fn + 1; + return file_path; +} + +// 将hybrid-index保存到share memrory里 +int shm_keep_hyb(const char* idx_prefix) { + char full_path[MAX_PATH]; + const char* file_name = NULL; + char share_name[MAX_PATH]; + FILE* fp = NULL; + struct stat st; + int shmid, init_shm = 0, idx_name_len; + uint8_t *shm_idx_list, *mem; + uint16_t* shm_idx_cnt; // share memory中index数量 + uint16_t* shm_byte_cnt; // 和占用的总内存数 + double sec_time; + +///////////////// +#define __shm_keep_hyb_code(suffix) \ + sec_time = realtime(); \ + strcat(strcpy(full_path, idx_prefix), suffix); \ + file_name = get_fn_from_path(full_path); \ + strcat(strcpy(share_name, SHM_HYB_PREFIX), get_fn_from_path(full_path)); \ + if ((shmid = shm_open(share_name, O_CREAT | O_RDWR | O_EXCL, 0644)) < 0) { \ + perror("shm_open()"); \ + return -1; \ + } \ + err_check_true(stat(full_path, &st), 0); \ + if (ftruncate(shmid, st.st_size) < 0) \ + return -1; \ + idx_name_len = 8 + 
strlen(file_name) + 1; \ + if (idx_name_len + *shm_byte_cnt > SHM_NAME_LIST_SIZE) \ + return -1; \ + memcpy(shm_idx_list + *shm_byte_cnt, &st.st_size, 8); \ + memcpy(shm_idx_list + *shm_byte_cnt + 8, file_name, idx_name_len - 8); \ + *shm_byte_cnt += idx_name_len; \ + *shm_idx_cnt += 1; \ + mem = (uint8_t*)mmap(0, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, shmid, 0); \ + fp = xopen(full_path, "r"); \ + err_fread_noeof(mem, 1, st.st_size, fp); \ + err_fclose(fp); \ + munmap(mem, st.st_size); \ + fprintf(stderr, "%s, %0.2f GB, %0.2f s\n", file_name, get_GB(st.st_size), realtime() - sec_time); + ////////////////////// + + // 打开保存索引名称的共享内存 + if ((shmid = shm_open(SHM_NAME_LIST, O_RDWR, 0)) < 0) { + // 之前没有创建过,那就创建并初始化 + shmid = shm_open(SHM_NAME_LIST, O_CREAT | O_RDWR | O_EXCL, 0644); + init_shm = 1; + } + if (shmid < 0 || ftruncate(shmid, SHM_NAME_LIST_SIZE) < 0) + return -1; + shm_idx_list = (uint8_t*)mmap(0, SHM_NAME_LIST_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shmid, 0); + shm_idx_cnt = (uint16_t*)shm_idx_list; + shm_byte_cnt = (uint16_t*)(shm_idx_list + 2); + + if (init_shm) { // 需要初始化share mem中的索引列表 + memset(shm_idx_list, 0, SHM_NAME_LIST_SIZE); + *shm_byte_cnt = 4; + } + + __shm_keep_hyb_code(HYB_PAC_SUFFIX); + __shm_keep_hyb_code(HYB_SA_SUFFIX); + __shm_keep_hyb_code(HYB_KMER_SUFFIX); + __shm_keep_hyb_code(HYB_DATA_SUFFIX); + return 0; +} + +// 清理所有保存在share memory中的索引数据 +int shm_clear_hyb() { + char share_name[MAX_PATH]; + int shmid; + uint16_t *shm_idx_cnt, i; + char* shm_idx_list; + char* ptr; + + if ((shmid = shm_open(SHM_NAME_LIST, O_RDONLY, 0)) < 0) + return -1; + shm_idx_list = (char*)mmap(0, SHM_NAME_LIST_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + shm_idx_cnt = (uint16_t*)shm_idx_list; + for (i = 0, ptr = shm_idx_list + 4; i < *shm_idx_cnt; ++i) { + ptr += 8; + strcat(strcpy(share_name, SHM_HYB_PREFIX), ptr); + fprintf(stderr, "clear: %s\n", ptr); + shm_unlink(share_name); + ptr += strlen(ptr) + 1; + } + munmap(shm_idx_list, SHM_NAME_LIST_SIZE); + 
shm_unlink(SHM_NAME_LIST); + return 0; +} + +// 从share mem中获取对应的索引数据 +void* shm_get_index(const char* full_path) { + char share_name[MAX_PATH]; + int shmid; + uint16_t *shm_idx_cnt, i; + char* shm_idx_list; + uint64_t idx_bytes; + char* ptr; + const char* file_name = get_fn_from_path(full_path); + + if ((shmid = shm_open(SHM_NAME_LIST, O_RDONLY, 0)) < 0) + return NULL; + shm_idx_list = (char*)mmap(0, SHM_NAME_LIST_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + shm_idx_cnt = (uint16_t*)shm_idx_list; + for (i = 0, ptr = shm_idx_list + 4; i < *shm_idx_cnt; ++i) { + memcpy(&idx_bytes, ptr, 8); + ptr += 8; + if (strcmp(ptr, file_name) == 0) + break; + ptr += strlen(ptr) + 1; + } + if (i == *shm_idx_cnt) + return NULL; + munmap(shm_idx_list, SHM_NAME_LIST_SIZE); + strcat(strcpy(share_name, SHM_HYB_PREFIX), file_name); + if ((shmid = shm_open(share_name, O_RDONLY, 0)) < 0) + return NULL; + return mmap(0, idx_bytes, PROT_READ, MAP_SHARED, shmid, 0); +} + +// 列出共享内存中的hybrid-index +int list_shm_hyb_indices() { + int shmid; + uint16_t *shm_idx_cnt, i; + char* shm_idx_list; + char* ptr; + + if ((shmid = shm_open(SHM_NAME_LIST, O_RDONLY, 0)) < 0) { + fprintf(stderr, "No shared hybrid index found.\n"); + return -1; + } + shm_idx_list = (char*)mmap(0, SHM_NAME_LIST_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + shm_idx_cnt = (uint16_t*)shm_idx_list; + fprintf(stderr, "Shared hybrid indices (%d):\n", *shm_idx_cnt); + for (i = 0, ptr = shm_idx_list + 4; i < *shm_idx_cnt; ++i) { + uint64_t idx_bytes; + memcpy(&idx_bytes, ptr, 8); + ptr += 8; + fprintf(stderr, "%s, %0.2f GB\n", ptr, get_GB(idx_bytes)); + ptr += strlen(ptr) + 1; + } + munmap(shm_idx_list, SHM_NAME_LIST_SIZE); + return 0; +} \ No newline at end of file diff --git a/share_mem.h b/share_mem.h new file mode 100644 index 0000000..6265b66 --- /dev/null +++ b/share_mem.h @@ -0,0 +1,20 @@ +#pragma once + +#include "utils.h" + +#if 0 +#define HYB_PAC_SUFFIX ".hyb.pac" +#define HYB_SA_SUFFIX ".hyb.bytesa" +#define HYB_KMER_SUFFIX 
".hyb.kmer" +#define HYB_DATA_SUFFIX ".hyb.data" +#else +#define HYB_PAC_SUFFIX ".hybrid.pac" +#define HYB_SA_SUFFIX ".hybrid.sa" +#define HYB_KMER_SUFFIX ".hybrid.kmer" +#define HYB_DATA_SUFFIX ".hybrid.data" +#endif + +int shm_keep_hyb(const char* idx_prefix); +int shm_clear_hyb(); +void* shm_get_index(const char* full_path); +int list_shm_hyb_indices(); \ No newline at end of file diff --git a/utils.c b/utils.c index 9ceb1be..a5591e8 100644 --- a/utils.c +++ b/utils.c @@ -39,9 +39,28 @@ #endif #include #include -#include "utils.h" +#include "khash.h" #include "ksort.h" +#include "kvec.h" +#include "utils.h" +#include "yarn.h" + +#define USE_ASYNC_READ + +typedef struct { + pthread_t tid; + void* buf[2]; + volatile int readSize[2]; + uint64_t getIdx; + uint64_t putIdx; + volatile int finish; + lock_t* mtx; +} FileKV; + +KHASH_MAP_INIT_INT64(fkv, FileKV); +static khash_t(fkv) * fHash = 0; + #define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) KSORT_INIT(128, pair64_t, pair64_lt) KSORT_INIT(64, uint64_t, ks_lt_generic) @@ -141,9 +160,38 @@ size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream) int err_gzread(gzFile file, void *ptr, unsigned int len) { - int ret = gzread(file, ptr, len); + int ret = 0; + PROF_START(read); +#ifdef USE_ASYNC_READ + khiter_t k = kh_get(fkv, fHash, (int64_t)file); + FileKV* val = &kh_value(fHash, k); + POSSESS(val->mtx); + WAIT_FOR(val->mtx, NOT_TO_BE, 0); // 等待有数据 + RELEASE(val->mtx); - if (ret < 0) + int curIdx = val->getIdx % 2; + if (val->finish) { + if (val->getIdx < val->putIdx) { + ret = val->readSize[curIdx]; + if (ret > 0) + memcpy(ptr, val->buf[curIdx], ret); + ++val->getIdx; + return ret; + } + return 0; + } + ret = val->readSize[curIdx]; + memcpy(ptr, val->buf[curIdx], ret); + + POSSESS(val->mtx); + ++val->getIdx; + TWIST(val->mtx, BY, -1); +#else + ret = gzread(file, ptr, len); +#endif + PROF_END(gprof[G_read_seq], read); + + if (ret < 0) { int errnum = 0; const char *msg = 
gzerror(file, &errnum); @@ -304,3 +352,64 @@ long peakrss(void) return r.ru_maxrss; #endif } + +static int64_t kBufSize = 16777216; + +static void* async_gzread(void* data) { + gzFile file = (gzFile)data; + khiter_t k = kh_get(fkv, fHash, (int64_t)file); + FileKV* val = &kh_value(fHash, k); + + int ret = 0; + while (1) { + POSSESS(val->mtx); + WAIT_FOR(val->mtx, NOT_TO_BE, 2); // 等待有数据 + RELEASE(val->mtx); + + int curIdx = val->putIdx % 2; + ret = gzread(file, val->buf[curIdx], kBufSize); + val->readSize[curIdx] = ret; + + if (ret <= 0) { + POSSESS(val->mtx); + val->finish = 1; + TWIST(val->mtx, BY, 1); + break; + } + + POSSESS(val->mtx); + val->putIdx += 1; + TWIST(val->mtx, BY, 1); + } + + return NULL; +} + +int start_async_read(gzFile file) { + int ret = 0; +#ifdef USE_ASYNC_READ + if (fHash == 0) { + fHash = kh_init(fkv); + } + khiter_t k = kh_put(fkv, fHash, (int64_t)file, &ret); + kh_key(fHash, k) = (int64_t)file; + FileKV* fv = &kh_value(fHash, k); + + fv->mtx = NEW_LOCK(0); + fv->getIdx = fv->putIdx = fv->finish = 0; + fv->readSize[0] = fv->readSize[1] = 0; + fv->buf[0] = malloc(kBufSize); + fv->buf[1] = malloc(kBufSize); + ret = pthread_create(&fv->tid, 0, async_gzread, file); +#endif + return ret; +} + +int stop_async_read(gzFile file) { +#ifdef USE_ASYNC_READ + khiter_t k = kh_get(fkv, fHash, (int64_t)file); + FileKV* val = &kh_value(fHash, k); + pthread_join(val->tid, 0); +#endif + return 0; +} \ No newline at end of file diff --git a/utils.h b/utils.h index 995c9f5..c1f2d7e 100644 --- a/utils.h +++ b/utils.h @@ -28,11 +28,15 @@ #define LH3_UTILS_H #include +#include #include #include #include #include +#include "debug.h" +#include "profiling.h" + #ifdef __GNUC__ // Tell GCC to validate printf format string and args #define ATTRIBUTE(list) __attribute__ (list) @@ -121,6 +125,11 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; +typedef struct { + size_t m; + uint8_t* addr; 
+} buf_t; + #ifdef __cplusplus extern "C" { #endif @@ -158,8 +167,11 @@ extern "C" { void ks_introsort_64 (size_t n, uint64_t *a); void ks_introsort_128(size_t n, pair64_t *a); + int start_async_read(gzFile file); + int stop_async_read(gzFile file); + #ifdef __cplusplus -} + } #endif static inline uint64_t hash_64(uint64_t key) diff --git a/yarn.c b/yarn.c new file mode 100644 index 0000000..254ba62 --- /dev/null +++ b/yarn.c @@ -0,0 +1,398 @@ +/* yarn.c -- generic thread operations implemented using pthread functions + * Copyright (C) 2008, 2011, 2012, 2015, 2018, 2019, 2020 Mark Adler + * Version 1.7 12 Apr 2020 Mark Adler + * For conditions of distribution and use, see copyright notice in yarn.h + */ + +/* Basic thread operations implemented using the POSIX pthread library. All + pthread references are isolated within this module to allow alternate + implementations with other thread libraries. See yarn.h for the description + of these operations. */ + +/* Version history: + 1.0 19 Oct 2008 First version + 1.1 26 Oct 2008 No need to set the stack size -- remove + Add yarn_abort() function for clean-up on error exit + 1.2 19 Dec 2011 (changes reversed in 1.3) + 1.3 13 Jan 2012 Add large file #define for consistency with pigz.c + Update thread portability #defines per IEEE 1003.1-2008 + Fix documentation in yarn.h for yarn_prefix + 1.4 19 Jan 2015 Allow yarn_abort() to avoid error message to stderr + Accept and do nothing for NULL argument to free_lock() + 1.5 8 May 2018 Remove destruct() to avoid use of pthread_cancel() + Normalize the code style + 1.6 3 Apr 2019 Add debugging information to fail() error messages + 1.7 12 Apr 2020 Fix use after free bug in ignition() + */ + +// For thread portability. +#define _XOPEN_SOURCE 700 +#define _POSIX_C_SOURCE 200809L +#define _THREAD_SAFE + +// Use large file functions if available. +#define _FILE_OFFSET_BITS 64 + +// External libraries and entities referenced. 
+#include // fprintf(), stderr +#include // exit(), malloc(), free(), NULL +#include // pthread_t, pthread_create(), pthread_join(), + // pthread_attr_t, pthread_attr_init(), pthread_attr_destroy(), + // PTHREAD_CREATE_JOINABLE, pthread_attr_setdetachstate(), + // pthread_self(), pthread_equal(), + // pthread_mutex_t, PTHREAD_MUTEX_INITIALIZER, pthread_mutex_init(), + // pthread_mutex_lock(), pthread_mutex_unlock(), pthread_mutex_destroy(), + // pthread_cond_t, PTHREAD_COND_INITIALIZER, pthread_cond_init(), + // pthread_cond_broadcast(), pthread_cond_wait(), pthread_cond_destroy() +#include // EPERM, ESRCH, EDEADLK, ENOMEM, EBUSY, EINVAL, EAGAIN + +// Interface definition. +#include "yarn.h" + +// Constants. +#define local static // for non-exported functions and globals + +// Error handling external globals, resettable by application. +char *yarn_prefix = (char*)"yarn"; +void (*yarn_abort)(int) = NULL; + + +// Immediately exit -- use for errors that shouldn't ever happen. +local void fail(int err, char const *file, long line, char const *func) { + fprintf(stderr, "%s: ", yarn_prefix); + switch (err) { + case EPERM: + fputs("already unlocked", stderr); + break; + case ESRCH: + fputs("no such thread", stderr); + break; + case EDEADLK: + fputs("resource deadlock", stderr); + break; + case ENOMEM: + fputs("out of memory", stderr); + break; + case EBUSY: + fputs("can't destroy locked resource", stderr); + break; + case EINVAL: + fputs("invalid request", stderr); + break; + case EAGAIN: + fputs("resource unavailable", stderr); + break; + default: + fprintf(stderr, "internal error %d", err); + } + fprintf(stderr, " (%s:%ld:%s)\n", file, line, func); + if (yarn_abort != NULL) + yarn_abort(err); + exit(err); +} + +// Memory handling routines provided by user. If none are provided, malloc() +// and free() are used, which are therefore assumed to be thread-safe. 
+typedef void *(*malloc_t)(size_t); +typedef void (*free_t)(void *); +local malloc_t my_malloc_f = malloc; +local free_t my_free = free; + +// Use user-supplied allocation routines instead of malloc() and free(). +void yarn_mem(malloc_t lease, free_t vacate) { + my_malloc_f = lease; + my_free = vacate; +} + +// Memory allocation that cannot fail (from the point of view of the caller). +local void *my_malloc(size_t size, char const *file, long line) { + void *block; + + if ((block = my_malloc_f(size)) == NULL) + fail(ENOMEM, file, line, "malloc"); + return block; +} + +// -- Lock functions -- + +struct lock_s { + pthread_mutex_t mutex; + pthread_cond_t cond; + long value; +}; + +lock_t *new_lock_(long initial, char const *file, long line) { + lock_t *bolt = (lock_t *)my_malloc(sizeof(struct lock_s), file, line); + int ret = pthread_mutex_init(&(bolt->mutex), NULL); + if (ret) + fail(ret, file, line, "mutex_init"); + ret = pthread_cond_init(&(bolt->cond), NULL); + if (ret) + fail(ret, file, line, "cond_init"); + bolt->value = initial; + return bolt; +} + +void possess_(lock_t *bolt, char const *file, long line) { + int ret = pthread_mutex_lock(&(bolt->mutex)); + if (ret) + fail(ret, file, line, "mutex_lock"); +} + +void release_(lock_t *bolt, char const *file, long line) { + int ret = pthread_mutex_unlock(&(bolt->mutex)); + if (ret) + fail(ret, file, line, "mutex_unlock"); +} + +void twist_(lock_t *bolt, enum twist_op op, long val, + char const *file, long line) { + if (op == TO) + bolt->value = val; + else if (op == BY) + bolt->value += val; + int ret = pthread_cond_broadcast(&(bolt->cond)); + if (ret) + fail(ret, file, line, "cond_broadcast"); + ret = pthread_mutex_unlock(&(bolt->mutex)); + if (ret) + fail(ret, file, line, "mutex_unlock"); +} + +#define until(a) while(!(a)) + +void wait_for_(lock_t *bolt, enum wait_op op, long val, + char const *file, long line) { + switch (op) { + case TO_BE: + until (bolt->value == val) { + int ret = 
pthread_cond_wait(&(bolt->cond), &(bolt->mutex)); + if (ret) + fail(ret, file, line, "cond_wait"); + } + break; + case NOT_TO_BE: + until (bolt->value != val) { + int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex)); + if (ret) + fail(ret, file, line, "cond_wait"); + } + break; + case TO_BE_MORE_THAN: + until (bolt->value > val) { + int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex)); + if (ret) + fail(ret, file, line, "cond_wait"); + } + break; + case TO_BE_LESS_THAN: + until (bolt->value < val) { + int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex)); + if (ret) + fail(ret, file, line, "cond_wait"); + } + } +} + +long peek_lock(lock_t *bolt) { + return bolt->value; +} + +void free_lock_(lock_t *bolt, char const *file, long line) { + if (bolt == NULL) + return; + int ret = pthread_cond_destroy(&(bolt->cond)); + if (ret) + fail(ret, file, line, "cond_destroy"); + ret = pthread_mutex_destroy(&(bolt->mutex)); + if (ret) + fail(ret, file, line, "mutex_destroy"); + my_free(bolt); +} + +// -- Thread functions (uses the lock_t functions above) -- + +struct thread_s { + pthread_t id; + int done; // true if this thread has exited + thread *next; // for list of all launched threads +}; + +// List of threads launched but not joined, count of threads exited but not +// joined (incremented by ignition() just before exiting). +local lock_t threads_lock = { + PTHREAD_MUTEX_INITIALIZER, + PTHREAD_COND_INITIALIZER, + 0 // number of threads exited but not joined +}; +local thread *threads = NULL; // list of extant threads + +// Structure in which to pass the probe and its payload to ignition(). +struct capsule { + void (*probe)(void *); + void *payload; + char const *file; + long line; +}; + +// Mark the calling thread as done and alert join_all(). 
+local void reenter(void *arg) { + struct capsule *capsule = (struct capsule *)arg; + + // find this thread in the threads list by matching the thread id + pthread_t me = pthread_self(); + possess_(&(threads_lock), capsule->file, capsule->line); + thread **prior = &(threads); + thread *match; + while ((match = *prior) != NULL) { + if (pthread_equal(match->id, me)) + break; + prior = &(match->next); + } + if (match == NULL) + fail(ESRCH, capsule->file, capsule->line, "reenter lost"); + + // mark this thread as done and move it to the head of the list + match->done = 1; + if (threads != match) { + *prior = match->next; + match->next = threads; + threads = match; + } + + // update the count of threads to be joined and alert join_all() + twist_(&(threads_lock), BY, +1, capsule->file, capsule->line); + + // free the capsule resource, even if the thread is cancelled (though yarn + // doesn't use pthread_cancel() -- you never know) + my_free(capsule); +} + +// All threads go through this routine. Just before a thread exits, it marks +// itself as done in the threads list and alerts join_all() so that the thread +// resources can be released. Use a cleanup stack so that the marking occurs +// even if the thread is cancelled. +local void *ignition(void *arg) { + struct capsule *capsule = (struct capsule *)arg; + + // run reenter() before leaving + pthread_cleanup_push(reenter, arg); + + // execute the requested function with argument + capsule->probe(capsule->payload); + + // mark this thread as done, letting join_all() know, and free capsule + pthread_cleanup_pop(1); + + // exit thread + return NULL; +} + +// Not all POSIX implementations create threads as joinable by default, so that +// is made explicit here. 
+thread *launch_(void (*probe)(void *), void *payload, + char const *file, long line) { + // construct the requested call and argument for the ignition() routine + // (allocated instead of automatic so that we're sure this will still be + // there when ignition() actually starts up -- ignition() will free this + // allocation) + struct capsule *capsule = (struct capsule *)my_malloc(sizeof(struct capsule), file, line); + capsule->probe = probe; + capsule->payload = payload; + capsule->file = file; + capsule->line = line; + + // assure this thread is in the list before join_all() or ignition() looks + // for it + possess_(&(threads_lock), file, line); + + // create the thread and call ignition() from that thread + thread *th = (thread *)my_malloc(sizeof(struct thread_s), file, line); + pthread_attr_t attr; + int ret = pthread_attr_init(&attr); + if (ret) + fail(ret, file, line, "attr_init"); + ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + if (ret) + fail(ret, file, line, "attr_setdetachstate"); + ret = pthread_create(&(th->id), &attr, ignition, capsule); + if (ret) + fail(ret, file, line, "create"); + ret = pthread_attr_destroy(&attr); + if (ret) + fail(ret, file, line, "attr_destroy"); + + // put the thread in the threads list for join_all() + th->done = 0; + th->next = threads; + threads = th; + release_(&(threads_lock), file, line); + return th; +} + +void join_(thread *ally, char const *file, long line) { + // wait for thread to exit and return its resources + int ret = pthread_join(ally->id, NULL); + if (ret) + fail(ret, file, line, "join"); + + // find the thread in the threads list + possess_(&(threads_lock), file, line); + thread **prior = &(threads); + thread *match; + while ((match = *prior) != NULL) { + if (match == ally) + break; + prior = &(match->next); + } + if (match == NULL) + fail(ESRCH, file, line, "join lost"); + + // remove thread from list and update exited count, free thread + if (match->done) + threads_lock.value--; + 
*prior = match->next; + release_(&(threads_lock), file, line); + my_free(ally); +} + +// This implementation of join_all() only attempts to join threads that have +// announced that they have exited (see ignition()). When there are many +// threads, this is faster than waiting for some random thread to exit while a +// bunch of other threads have already exited. +int join_all_(char const *file, long line) { + // grab the threads list and initialize the joined count + int count = 0; + possess_(&(threads_lock), file, line); + + // do until threads list is empty + while (threads != NULL) { + // wait until at least one thread has reentered + wait_for_(&(threads_lock), NOT_TO_BE, 0, file, line); + + // find the first thread marked done (should be at or near the top) + thread **prior = &(threads); + thread *match; + while ((match = *prior) != NULL) { + if (match->done) + break; + prior = &(match->next); + } + if (match == NULL) + fail(ESRCH, file, line, "join_all lost"); + + // join the thread (will be almost immediate), remove from the threads + // list, update the reenter count, and free the thread + int ret = pthread_join(match->id, NULL); + if (ret) + fail(ret, file, line, "join"); + threads_lock.value--; + *prior = match->next; + my_free(match); + count++; + } + + // let go of the threads list and return the number of threads joined + release_(&(threads_lock), file, line); + return count; +} \ No newline at end of file diff --git a/yarn.h b/yarn.h new file mode 100644 index 0000000..ab5d31e --- /dev/null +++ b/yarn.h @@ -0,0 +1,139 @@ +/* yarn.h -- generic interface for thread operations + * Copyright (C) 2008, 2011, 2012, 2015, 2018, 2019, 2020 Mark Adler + * Version 1.7 12 Apr 2020 Mark Adler + */ + +/* + This software is provided 'as-is', without any express or implied + warranty. In no event will the author be held liable for any damages + arising from the use of this software. 
+ + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Mark Adler + madler@alumni.caltech.edu + */ + +/* Basic thread operations + + This interface isolates the local operating system implementation of threads + from the application in order to facilitate platform independent use of + threads. All of the implementation details are deliberately hidden. + + Assuming adequate system resources and proper use, none of these functions + can fail. As a result, any errors encountered will cause an exit() to be + executed, or the execution of your own optionally-provided abort function. + + These functions allow the simple launching and joining of threads, and the + locking of objects and synchronization of changes of objects. The latter is + implemented with a single lock_t type that contains an integer value. The + value can be ignored for simple exclusive access to an object, or the value + can be used to signal and wait for changes to an object. 
+ + -- Arguments -- + + thread *thread; identifier for launched thread, used by join + void probe(void *); pointer to function "probe", run when thread starts + void *payload; single argument passed to the probe function + lock_t *lock_t; a lock_t with a value -- used for exclusive access to + an object and to synchronize threads waiting for + changes to an object + long val; value to set lock_t, increment lock_t, or wait for + int n; number of threads joined + + -- Thread functions -- + + thread = launch(probe, payload) - launch a thread -- exit via probe() return + join(thread) - join a thread and by joining end it, waiting for the thread + to exit if it hasn't already -- will free the resources allocated by + launch() (don't try to join the same thread more than once) + n = join_all() - join all threads launched by launch() that are not joined + yet and free the resources allocated by the launches, usually to clean + up when the thread processing is done -- join_all() returns an int with + the count of the number of threads joined (join_all() should only be + called from the main thread, and should only be called after any calls + of join() have completed) + + -- Lock functions -- + + lock_t = new_lock(val) - create a new lock_t with initial value val (lock_t is + created in the released state) + possess(lock_t) - acquire exclusive possession of a lock_t, waiting if necessary + twist(lock_t, [TO | BY], val) - set lock_t to or increment lock_t by val, signal + all threads waiting on this lock_t and then release the lock_t -- must + possess the lock_t before calling (twist releases, so don't do a + release() after a twist() on the same lock_t) + wait_for(lock_t, [TO_BE | NOT_TO_BE | TO_BE_MORE_THAN | TO_BE_LESS_THAN], val) + - wait on lock_t value to be, not to be, be greater than, or be less than + val -- must possess the lock_t before calling, will possess the lock_t on + return but the lock_t is released while waiting to permit other threads + to use twist() to 
change the value and signal the change (so make sure
      that the object is in a usable state when waiting)
  release(lock_t) - release a possessed lock_t (do not try to release a lock_t that
      the current thread does not possess)
  val = peek_lock(lock_t) - return the value of the lock_t (assumes that lock_t is
      already possessed, no possess or release is done by peek_lock())
  free_lock(lock_t) - free the resources allocated by new_lock() (application
      must assure that the lock_t is released before calling free_lock())

  -- Memory allocation ---

  yarn_mem(better_malloc, better_free) - set the memory allocation and free
      routines for use by the yarn routines where the supplied routines have
      the same interface and operation as malloc() and free(), and may be
      provided in order to supply thread-safe memory allocation routines or
      for any other reason -- by default malloc() and free() will be used

  -- Error control --

  yarn_prefix - a char pointer to a string that will be the prefix for any
      error messages that these routines generate before exiting -- if not
      changed by the application, "yarn" will be used
  yarn_abort - an external function that will be executed when there is an
      internal yarn error, due to out of memory or misuse -- this function
      may exit to abort the application, or if it returns, the yarn error
      handler will exit (set to NULL by default for no action)
 */

extern char *yarn_prefix;
extern void (*yarn_abort)(int);

void yarn_mem(void *(*)(size_t), void (*)(void *));

/* Public interface.  Each trailing-underscore function takes the caller's
   __FILE__/__LINE__ for error reporting; use the upper-case macros, which
   supply them automatically. */
typedef struct thread_s thread;
thread *launch_(void (*)(void *), void *, char const *, long);
#define LAUNCH(a, b) launch_(a, b, __FILE__, __LINE__)
void join_(thread *, char const *, long);
#define JOIN(a) join_(a, __FILE__, __LINE__)
int join_all_(char const *, long);
#define JOIN_ALL() join_all_(__FILE__, __LINE__)

typedef struct lock_s lock_t;
lock_t *new_lock_(long, char const *, long);
#define NEW_LOCK(a) new_lock_(a, __FILE__, __LINE__)
void possess_(lock_t *, char const *, long);
#define POSSESS(a) possess_(a, __FILE__, __LINE__)
void release_(lock_t *, char const *, long);
// superseded by the upper-case RELEASE macro below; kept for reference only
// #define release(a) release_(a, __FILE__, __LINE__)
#define RELEASE(a) release_(a, __FILE__, __LINE__)
enum twist_op { TO, BY };
void twist_(lock_t *, enum twist_op, long, char const *, long);
#define TWIST(a, b, c) twist_(a, b, c, __FILE__, __LINE__)
enum wait_op {
    TO_BE, /* or */ NOT_TO_BE, /* that is the question */
    TO_BE_MORE_THAN, TO_BE_LESS_THAN };
void wait_for_(lock_t *, enum wait_op, long, char const *, long);
#define WAIT_FOR(a, b, c) wait_for_(a, b, c, __FILE__, __LINE__)
long peek_lock(lock_t *);
void free_lock_(lock_t *, char const *, long);
#define FREE_LOCK(a) free_lock_(a, __FILE__, __LINE__)