diff --git a/.vscode/settings.json b/.vscode/settings.json index 77e1001..fcdb4fc 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,6 +9,9 @@ "istream": "c", "limits": "c", "bit": "c", - "numeric": "c" + "numeric": "c", + "fmt_idx.h": "c", + "kseq.h": "c", + "ksw.h": "c" } } \ No newline at end of file diff --git a/Makefile b/Makefile index 263ee60..32656fd 100644 --- a/Makefile +++ b/Makefile @@ -1,18 +1,18 @@ CC= gcc #CC= clang --analyze # CFLAGS= -g -Wall -Wno-unused-function -O2 -CFLAGS= -g -Wall -Wno-unused-function -O2 +CFLAGS= -Wall -Wno-unused-function -mavx2 -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF AR= ar -DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) +DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) -DUSE_AVX2 #-DUSE_MT_READ #-DFILTER_FULL_MATCH LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \ QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ bwape.o kopen.o pemerge.o maxk.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o \ - fmt_idx.o + fmt_idx.o ksw_extend2_avx2.o ksw_extend2_avx2_u8.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread -ldl @@ -29,11 +29,11 @@ endif all:$(PROG) -bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) - #bwa:libbwa.a $(AOBJS) main.o -# $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) ../sbwa/jemalloc/lib/libjemalloc.a main.o -o $@ -L. -lbwa $(LIBS) +# $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) + +bwa:libbwa.a $(AOBJS) main.o + $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) ../sbwa/jemalloc/lib/libjemalloc.a main.o -o $@ -L. -lbwa $(LIBS) bwamem-lite:libbwa.a example.o $(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS) diff --git a/bntseq.c b/bntseq.c index c844992..c3c246a 100644 --- a/bntseq.c +++ b/bntseq.c @@ -423,6 +423,30 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end return seq; } +void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len, size_t *m_seq, uint8_t **seqp) +{ + if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap + if (end > l_pac<<1) end = l_pac<<1; + if (beg < 0) beg = 0; + if (beg >= l_pac || end <= l_pac) { + int64_t k, l = 0; + *len = end - beg; + if (*len > *m_seq) { + *m_seq = *len; + *seqp = realloc(*seqp, end - beg); + } + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + (*seqp)[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + (*seqp)[l++] = _get_pac(pac, k); + } + } else *len = 0; // if bridging the forward-reverse boundary, return nothing +} + uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid) { int64_t far_beg, far_end, len; @@ -449,3 +473,28 @@ uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, in assert(seq && *end - *beg == len); // assertion failure should never happen return seq; } + +void bns_fetch_seq_no_alloc(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid, size_t *m_seq, uint8_t **seqp) +{ + int64_t far_beg, far_end, len; + int is_rev; + + if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap + assert(*beg <= mid && mid < *end); + *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); + far_beg = bns->anns[*rid].offset; + far_end = far_beg + bns->anns[*rid].len; + if (is_rev) { // flip to the reverse strand + int64_t tmp = far_beg; + far_beg = (bns->l_pac<<1) - far_end; + far_end = (bns->l_pac<<1) - tmp; + } + *beg = *beg > far_beg? *beg : far_beg; + *end = *end < far_end? *end : far_end; + bns_get_seq_no_alloc(bns->l_pac, pac, *beg, *end, &len, m_seq, seqp); + if (*seqp == 0 || *end - *beg != len) { + fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n", + __func__, (long)*beg, (long)mid, (long)*end, (long)len, *seqp, *rid, (long)far_beg, (long)far_end); + } + assert(*seqp && *end - *beg == len); // assertion failure should never happen +} diff --git a/bntseq.h b/bntseq.h index 354adb6..7891d61 100644 --- a/bntseq.h +++ b/bntseq.h @@ -79,7 +79,7 @@ extern "C" { uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); - + void bns_fetch_seq_no_alloc(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid, size_t *m_seq, uint8_t **seqp); #ifdef __cplusplus } #endif diff --git a/bwa.c b/bwa.c index ba14350..3002670 100644 --- a/bwa.c +++ b/bwa.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "bntseq.h" #include "bwa.h" #include "ksw.h" @@ -58,31 +59,160 @@ static inline void trim_readno(kstring_t *s) s->l -= 2, s->s[s->l] = 0; } -static inline char *dupkstring(const kstring_t *str, int dupempty) +static inline void dupkstring(const kstring_t *str, int dupempty, char **dstp, int *sm) { - char *s = (str->l > 0 || dupempty)? malloc(str->l + 1) : NULL; - if (!s) return NULL; + if (!dupempty && str->l == 0) { + if (*dstp) free(*dstp); + *dstp = 0; *sm = 0; + } else if (*dstp == 0 || *sm < str->l) { + *sm = str->l; + *dstp = realloc(*dstp, str->l + 1); + } + + char *s = *dstp; + if (!s) return; memcpy(s, str->s, str->l); s[str->l] = '\0'; - return s; } -static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s, int copy_comment) { // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = dupkstring(&ks->name, 1); - s->comment = dupkstring(&ks->comment, 0); - s->seq = dupkstring(&ks->seq, 1); - s->qual = dupkstring(&ks->qual, 0); + dupkstring(&ks->name, 1, &s->name, &s->m_name); + if (copy_comment) dupkstring(&ks->comment, 0, &s->comment, &s->m_comment); + dupkstring(&ks->seq, 1, &s->seq, &s->m_seq); + dupkstring(&ks->qual, 0, &s->qual, &s->m_qual); s->l_seq = ks->seq.l; } -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +typedef struct { + kseq_t *ks; + bseq1_t *seq; + int start_pos; + int n_bound; + int copy_comment; + int ret_n; + int ret_size; + int ret_status; + int chunk_size; +} read_data_t; + +static void *thread_bseq_read(void *data) { + read_data_t *d = (read_data_t*) data; + kseq_t *ks = d->ks; + bseq1_t *seqs = d->seq; + int copy_comment = d->copy_comment; + int chunk_size = d->chunk_size; + int cur_n = 0, cur_pos = d->start_pos, size = 0; + int ret_status = 1; + + while (cur_n < d->n_bound && (ret_status = kseq_read(ks)) >= 0) { + trim_readno(&ks->name); + kseq2bseq1(ks, seqs + cur_pos, copy_comment); + seqs[cur_pos].id = cur_pos; + size += seqs[cur_pos].l_seq; + cur_pos += 2; cur_n += 1; + if (size >= chunk_size) break; + } + d->ret_n = cur_n; d->ret_size = size; d->ret_status = ret_status; + return 0; +} + +#define READ_ONE_SEQ(ks) \ + trim_readno(&ks->name); \ + kseq2bseq1(ks, &seqs[n], copy_comment); \ + seqs[n].id = n; \ + size += seqs[n++].l_seq; + +// multi thread reading input seqs +void bseq_read_pe_mt(int chunk_size, int *n_, void *ks1_, void *ks2_, int copy_comment, int64_t *size_, int *m_, bseq1_t **seqs_ptr) { + kseq_t *ks = (kseq_t *)ks1_, *ks2 = (kseq_t *)ks2_; + int size = 0, m = *m_, n = 0; + bseq1_t *seqs = *seqs_ptr; + read_data_t d[2]; + pthread_t tid[2]; + + if (m == 0) { // 还没开辟空间,要初始化 + seqs = calloc(20, sizeof(bseq1_t)); // 先读取20个reads,根据reads的长度和chunk size决定要读取多少条reads + int ks1_ret = 0, ks2_ret = 0; + int i = 10; + while (i-- > 0) { + ks1_ret = kseq_read(ks); + if (ks1_ret < 0) break; + ks2_ret = kseq_read(ks2); + if (ks2_ret < 0) { + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + READ_ONE_SEQ(ks); + READ_ONE_SEQ(ks2); + } + if (ks1_ret < 0 || ks2_ret < 0) { + if (size == 0 && kseq_read(ks2) >= 0) { // test if the 2nd file is finished + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; *seqs_ptr = seqs; + return; + } + // 重新开辟空间 + m = (chunk_size + size / 10 - 1) / (size / 10) * 2; + *m_ = m; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + memset(seqs + n, 0, sizeof(bseq1_t)); + *seqs_ptr = seqs; + } + + d[0].copy_comment = copy_comment; d[1].copy_comment = copy_comment; + d[0].ks = ks ; d[0].seq = &seqs[0]; d[0].n_bound = (m >> 1) - (n>>1); d[0].start_pos = n; + d[1].ks = ks2; d[1].seq = &seqs[0]; d[1].n_bound = (m >> 1) - (n>>1); d[1].start_pos = n+1; + d[0].chunk_size = d[1].chunk_size = (chunk_size + 1) >> 1; + + pthread_create(&tid[0], 0, thread_bseq_read, &d[0]); + pthread_create(&tid[1], 0, thread_bseq_read, &d[1]); + pthread_join(tid[0], 0); pthread_join(tid[1], 0); + + size += d[0].ret_size + d[1].ret_size; + n += d[0].ret_n + d[1].ret_n; + + if (size == 0 && kseq_read(ks2) >= 0) { // test if the 2nd file is finished + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } else if (size < chunk_size && d[0].ret_status > 0 && d[1].ret_status > 0) { + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + memset(seqs + n, 0, (m-n) * sizeof(bseq1_t)); + } + READ_ONE_SEQ(ks); + if (ks2) { + READ_ONE_SEQ(ks2); + } + if (size >= chunk_size && (n&1) == 0) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + } + + *n_ = n; *size_ = size; + if (m > *m_) *m_ = m; + *seqs_ptr = seqs; +} + +void bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_, int copy_comment, int64_t *size_, int *m_, bseq1_t **seqs_ptr) +{ +#ifdef USE_MT_READ + if (ks2_) return bseq_read_pe_mt(chunk_size, n_, ks1_, ks2_, copy_comment, size_, m_, seqs_ptr); +#endif kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; int size = 0, m, n; - bseq1_t *seqs; - m = n = 0; seqs = 0; + bseq1_t *seqs = *seqs_ptr; + n = 0; m = *m_; while (kseq_read(ks) >= 0) { if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); @@ -91,16 +221,11 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) if (n >= m) { m = m? m<<1 : 256; seqs = realloc(seqs, m * sizeof(bseq1_t)); + memset(seqs + n, 0, (m-n) * sizeof(bseq1_t)); } - trim_readno(&ks->name); - kseq2bseq1(ks, &seqs[n]); - seqs[n].id = n; - size += seqs[n++].l_seq; + READ_ONE_SEQ(ks); if (ks2) { - trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n]); - seqs[n].id = n; - size += seqs[n++].l_seq; + READ_ONE_SEQ(ks2); } if (size >= chunk_size && (n&1) == 0) break; } @@ -108,8 +233,9 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } - *n_ = n; - return seqs; + *n_ = n; *size_ = size; + if (m > *m_) *m_ = m; + *seqs_ptr = seqs; } void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]) @@ -339,9 +465,10 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which) return 0; } idx = calloc(1, sizeof(bwaidx_t)); - if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); + if (which & BWA_IDX_BWT) idx->fmt = bwa_idx_load_fmt(hint); - idx->bwt->kmer_hash = idx->fmt->kmer_hash; + //if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); + //idx->bwt->kmer_hash = idx->fmt->kmer_hash; if (which & BWA_IDX_BNS) { diff --git a/bwa.h b/bwa.h index aa155b6..5e125a8 100644 --- a/bwa.h +++ b/bwa.h @@ -29,6 +29,7 @@ #include #include "bntseq.h" +#include "kstring.h" #include "bwt.h" #include "fmt_idx.h" @@ -59,7 +60,9 @@ typedef struct { typedef struct { int l_seq, id; - char *name, *comment, *seq, *qual, *sam; + int m_name, m_comment, m_seq, m_qual; + char *name, *comment, *seq, *qual; + kstring_t sam; } bseq1_t; extern int bwa_verbose, bwa_dbg; @@ -69,7 +72,8 @@ extern char bwa_rg_id[256]; extern "C" { #endif - bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + //bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + void bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_, int copy_comment, int64_t *size_, int *m_, bseq1_t **seqs_ptr); void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]); void bwa_fill_scmat(int a, int b, int8_t mat[25]); diff --git a/bwamem.c b/bwamem.c index 3a264ff..1a407e7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -117,30 +117,7 @@ mem_opt_t *mem_opt_init() #define intv_lt(a, b) ((a).info < (b).info) KSORT_INIT(mem_intv, bwtintv_t, intv_lt) -typedef struct { - bwtintv_v mem, mem1, *tmpv[2]; -} smem_aux_t; - -static smem_aux_t *smem_aux_init() -{ - smem_aux_t *a; - a = calloc(1, sizeof(smem_aux_t)); - a->tmpv[0] = calloc(1, sizeof(bwtintv_v)); - a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); - return a; -} - -static void smem_aux_destroy(smem_aux_t *a) -{ - free(a->tmpv[0]->a); free(a->tmpv[0]); - free(a->tmpv[1]->a); free(a->tmpv[1]); - free(a->mem.a); free(a->mem1.a); - free(a); -} - -#define USE_FMT 1 - -static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *a) +static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *a) { int i, k, x = 0, old_n; int start_width = 1; @@ -149,131 +126,79 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn int start_N_num = 0, start_flag = 1; a->mem.n = 0; - // first pass: find all SMEMs - fprintf(fp1, "seq: %ld\n", dn++); - // fprintf(stderr, "seq: %ld\n", dn++); - // dn ++; - // goto third_seed; - +#ifdef SHOW_PERF + int64_t tmp_time = realtime_msec(); +#endif + // 1. first pass: find all SMEMs while (x < len) { if (seq[x] < 4) { start_flag = 0; -#ifdef SHOW_PERF - int64_t tmp_time = realtime_msec(); -#endif -#if USE_FMT x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, &a->mem1, a->tmpv[0]); -#else - x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); -#endif -#ifdef SHOW_PERF - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_seed_1, tmp_time); -#endif - s1n += a->mem1.n; for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; - //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); - //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); - //fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); int slen = (uint32_t)p->info - (p->info >> 32); // seed length - s1l += slen; - max_seed_len = fmax(max_seed_len, slen); + max_seed_len = MAX(max_seed_len, slen); if (slen >= opt->min_seed_len) { - //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); kv_push(bwtintv_t, a->mem, *p); } } } else { ++x; - if (start_flag) - ++start_N_num; + if (start_flag) ++start_N_num; } } - // second pass: find MEMs inside a long SMEM - //if (max_seed_len == len - start_N_num) - // goto collect_intv_end; - //goto third_seed; +#ifdef SHOW_PERF + tmp_time = realtime_msec() - tmp_time; + __sync_fetch_and_add(&time_seed_1, tmp_time); + tmp_time = realtime_msec(); +#endif +#ifdef FILTER_FULL_MATCH + if (max_seed_len == len - start_N_num) goto collect_intv_end; +#endif + + // 2. second pass: find MEMs inside a long SMEM old_n = a->mem.n; for (k = 0; k < old_n; ++k) { bwtintv_t *p = &a->mem.a[k]; int start = p->info>>32, end = (int32_t)p->info; if (end - start < split_len || p->x[2] > opt->split_width) continue; -#ifdef SHOW_PERF - int64_t tmp_time = realtime_msec(); -#endif -#if USE_FMT fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, &a->mem1, a->tmpv[0]); -#else - bwt_smem1(bwt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv); -#endif -#ifdef SHOW_PERF - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_seed_2, tmp_time); -#endif - s2n += a->mem1.n; - for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; - //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); - // fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); - // fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); int slen = (uint32_t)p->info - (p->info >> 32); - s2l += slen; if (slen >= opt->min_seed_len) { - g_num_smem2 += 1; - fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } } } - //if (max_seed_len == len - start_N_num) - // goto collect_intv_end; - //goto collect_intv_end; -third_seed: - // third pass: LAST-like +#ifdef SHOW_PERF + tmp_time = realtime_msec() - tmp_time; + __sync_fetch_and_add(&time_seed_2, tmp_time); + tmp_time = realtime_msec(); +#endif + // 3. third pass: LAST-like if (opt->max_mem_intv > 0) { x = 0; while (x < len) { if (seq[x] < 4) { - if (1) { - bwtintv_t m; -#ifdef SHOW_PERF - int64_t tmp_time = realtime_msec(); -#endif - -#if USE_FMT - x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); -#else - x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); -#endif -#ifdef SHOW_PERF - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_seed_3, tmp_time); -#endif - s3n += 1; - s3l += (uint32_t)m.info - (m.info >> 32); - // bwtintv_t *p = &m; - // fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); - - if (m.x[2] > 0) { - kv_push(bwtintv_t, a->mem, m); - //bwtintv_t *p = &m; - //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); - //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); - } - } else { // for now, we never come to this block which is slower - x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); - for (i = 0; i < a->mem1.n; ++i) - kv_push(bwtintv_t, a->mem, a->mem1.a[i]); + bwtintv_t m; + x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); + if (m.x[2] > 0) { + kv_push(bwtintv_t, a->mem, m); } } else ++x; } } +#ifdef SHOW_PERF + tmp_time = realtime_msec() - tmp_time; + __sync_fetch_and_add(&time_seed_3, tmp_time); +#endif -//collect_intv_end: - // sort +#ifdef FILTER_FULL_MATCH +collect_intv_end: +#endif + // sort ks_introsort(mem_intv, a->mem.n, a->mem.a); } @@ -281,22 +206,6 @@ third_seed: * Chaining * ************/ -typedef struct { - int64_t rbeg; - int32_t qbeg, len; - int score; -} mem_seed_t; // unaligned memory - -typedef struct { - int n, m, first, rid; - uint32_t w:29, kept:2, is_alt:1; - float frac_rep; - int64_t pos; - mem_seed_t *seeds; -} mem_chain_t; - -typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; - #include "kbtree.h" #define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) @@ -364,7 +273,7 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) } } -mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf) +mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf) { int i, b, e, l_rep; int64_t l_pac = bns->l_pac; @@ -376,8 +285,37 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); - aux = buf? (smem_aux_t*)buf : smem_aux_init(); - mem_collect_intv(opt, bwt, fmt, len, seq, aux); + aux = (smem_aux_t*)buf; + mem_collect_intv(opt, fmt, len, seq, aux); + +#define CHECK_ADD_CHAIN(tmp, lower, upper) \ + int rid, to_add = 0; \ + mem_chain_t tmp, *lower, *upper; \ + tmp.pos = s.rbeg; \ + s.qbeg = p->info >> 32; \ + s.score = s.len = slen; \ + rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); \ + if (rid < 0) \ + continue; \ + if (kb_size(tree)) \ + { \ + kb_intervalp(chn, tree, &tmp, &lower, &upper); \ + if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) \ + to_add = 1; \ + } \ + else \ + to_add = 1; \ + if (to_add) \ + { \ + tmp.n = 1; \ + tmp.m = 4; \ + tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); \ + tmp.seeds[0] = s; \ + tmp.rid = rid; \ + tmp.is_alt = !!bns->anns[rid].is_alt; \ + kb_putp(chn, tree, &tmp); \ + } + for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep bwtintv_t *p = &aux->mem.a[i]; int sb = (p->info>>32), se = (uint32_t)p->info; @@ -390,92 +328,28 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm bwtintv_t *p = &aux->mem.a[i]; int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; - // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive - //if (0) { + if (p->num_match > 0) { - //continue; - for (k = 0; k < p->num_match; ++k) { - mem_chain_t tmp, *lower, *upper; - mem_seed_t s; - int rid, to_add = 0; - s.rbeg = p->rm[k].rs; - if (p->rm[k].reverse) { - s.rbeg = (fmt->l_pac << 1) - 1 - s.rbeg; - } - tmp.pos = s.rbeg; - s.qbeg = p->info >> 32; - s.score = s.len = slen; - rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); - if (rid < 0) continue; - if (kb_size(tree)) - { - kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain - if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) - to_add = 1; - } - else - to_add = 1; - if (to_add) - { // add the seed as a new chain - tmp.n = 1; - tmp.m = 4; - tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); - tmp.seeds[0] = s; - tmp.rid = rid; - tmp.is_alt = !!bns->anns[rid].is_alt; - kb_putp(chn, tree, &tmp); - } - } - continue; - } - step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1; - for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { - mem_chain_t tmp, *lower, *upper; mem_seed_t s; - int rid, to_add = 0; + s.rbeg = p->rm[0].rs; + if (p->rm[0].reverse) s.rbeg = (fmt->l_pac << 1) - 1 - s.rbeg; + CHECK_ADD_CHAIN(tmp, lower, upper); + } else { + step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1; + for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { + mem_seed_t s; #ifdef SHOW_PERF - int64_t tmp_time = realtime_msec(); -#endif -#if USE_FMT - s.rbeg = tmp.pos = fmt_sa(fmt, p->x[0] + k); - // uint64_t tpos = bwt_sa(bwt, p->x[0] + k); - // if (s.rbeg != tpos) { - // fprintf(stderr, "diff: %ld, %ld %ld\n", p->x[0] + k, tmp.pos, tpos); - // } -#else - s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference + int64_t tmp_time = realtime_msec(); #endif + s.rbeg = fmt_sa(fmt, p->x[0] + k); #ifdef SHOW_PERF - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bwt_sa, tmp_time); - __sync_fetch_and_add(&num_sa, 1); + tmp_time = realtime_msec() - tmp_time; + __sync_fetch_and_add(&time_bwt_sa, tmp_time); #endif - s.qbeg = p->info >> 32; - s.score= s.len = slen; -#ifdef SHOW_PERF - tmp_time = realtime_msec(); -#endif - rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); -#ifdef SHOW_PERF - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bns, tmp_time); -#endif - if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary; TODO: split the seed; don't discard it!!! - if (kb_size(tree)) { - kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain - if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1; - } else to_add = 1; - if (to_add) { // add the seed as a new chain - tmp.n = 1; tmp.m = 4; - tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); - tmp.seeds[0] = s; - tmp.rid = rid; - tmp.is_alt = !!bns->anns[rid].is_alt; - kb_putp(chn, tree, &tmp); + CHECK_ADD_CHAIN(tmp, lower, upper); } } } - if (buf == 0) smem_aux_destroy(aux); kv_resize(mem_chain_t, chain, kb_size(tree)); @@ -485,8 +359,8 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len; if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); - kb_destroy(chn, tree); + return chain; } @@ -744,7 +618,7 @@ int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id #define MEM_MINSC_COEF 5.5f #define MEM_SEEDSW_COEF 0.05f -int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_seed_t *s) +int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_seed_t *s, buf_t *buf) { int qb, qe, rid; int64_t rb, re, mid, l_pac = bns->l_pac; @@ -766,12 +640,14 @@ int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, i if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return -1; // the seed seems good enough; no need to do SW rseq = bns_fetch_seq(bns, pac, &rb, mid, &re, &rid); + //bns_fetch_seq_no_alloc(bns, pac, &rb, mid, &re, &rid, &buf->m, &buf->addr); rseq = buf->addr; + x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); free(rseq); return x.score; } -void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, int n_chn, mem_chain_t *a) +void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, int n_chn, mem_chain_t *a, buf_t *buf) { double min_l = opt->min_chain_weight? MEM_HSP_COEF * opt->min_chain_weight : MEM_MINSC_COEF * log(l_query); int i, j, k, min_HSP_score = (int)(opt->a * min_l + .499); @@ -780,7 +656,7 @@ void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint mem_chain_t *c = &a[i]; for (j = k = 0; j < c->n; ++j) { mem_seed_t *s = &c->seeds[j]; - s->score = mem_seed_sw(opt, bns, pac, l_query, query, s); + s->score = mem_seed_sw(opt, bns, pac, l_query, query, s, buf); if (s->score < 0 || s->score >= min_HSP_score) { s->score = s->score < 0? s->len * opt->a : s->score; c->seeds[k++] = *s; @@ -805,13 +681,14 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) #define MAX_BAND_TRY 2 -void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av, void *buf) { int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; uint64_t *srt; + smem_aux_t *aux = (smem_aux_t*)buf; if (c->n == 0) return; // get the max possible span @@ -833,6 +710,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac } // retrieve the reference sequence rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid); + //bns_fetch_seq_no_alloc(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid, &aux->seq_buf->m, &aux->seq_buf->addr); rseq = aux->seq_buf->addr; + assert(c->rid == rid); srt = malloc(c->n * 8); @@ -889,25 +768,31 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name); if (s->qbeg) { // left extension - uint8_t *rs, *qs; int qle, tle, gtle, gscore; + tmp = s->rbeg - rmax[0]; +#ifndef USE_AVX2 + uint8_t *rs, *qs; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; - tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; +#endif for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[0] = opt->w << i; if (bwa_verbose >= 4) { int j; - printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n'); - printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n'); + printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rseq[tmp - 1 - j]]); putchar('\n'); + printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)query[s->qbeg - 1 - j]]); putchar('\n'); } #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); +#ifndef USE_AVX2 + a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0], aux->sw_buf); +#else + a->score = ksw_extend2_avx2(s->qbeg, query, tmp, rseq, 1, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0], aux->sw_buf); +#endif #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_ksw_extend2, tmp_time); @@ -923,7 +808,9 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac a->qb = 0, a->rb = s->rbeg - gtle; a->truesc = gscore; } +#ifndef USE_AVX2 free(qs); free(rs); +#endif } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension @@ -942,7 +829,11 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); +#ifndef USE_AVX2 + a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1], aux->sw_buf); +#else + a->score = ksw_extend2_avx2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 0, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1], aux->sw_buf); +#endif #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_ksw_extend2, tmp_time); @@ -972,7 +863,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac a->frac_rep = c->frac_rep; } - free(srt); free(rseq); + free(srt); + free(rseq); } /***************************** @@ -1197,7 +1089,7 @@ void mem_reorder_primary5(int T, mem_alnreg_v *a) void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); - kstring_t str; + // kstring_t str; kvec_t(mem_aln_t) aa; int k, l; char **XA = 0; @@ -1205,7 +1097,8 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, if (!(opt->flag & MEM_F_ALL)) XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); kv_init(aa); - str.l = str.m = 0; str.s = 0; + // str.l = str.m = 0; str.s = 0; + s->sam.l = 0; for (k = l = 0; k < a->n; ++k) { mem_alnreg_t *p = &a->a[k]; mem_aln_t *q; @@ -1228,21 +1121,21 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_aln_t t; t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); t.flag |= extra_flag; - mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m); + mem_aln2sam(opt, bns, &s->sam, s, 1, &t, 0, m); } else { for (k = 0; k < aa.n; ++k) - mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m); + mem_aln2sam(opt, bns, &s->sam, s, aa.n, aa.a, k, m); for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); free(aa.a); } - s->sam = str.s; + // s->sam = str.s; if (XA) { for (k = 0; k < a->n; ++k) free(XA[k]); free(XA); } } -mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf) +mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf) { int i; mem_chain_v chn; @@ -1251,16 +1144,16 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; - chn = mem_chain(opt, bwt, fmt, bns, l_seq, (uint8_t*)seq, buf); + chn = mem_chain(opt, fmt, bns, l_seq, (uint8_t*)seq, buf); chn.n = mem_chain_flt(opt, chn.n, chn.a); - mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chn.n, chn.a); + mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t *)seq, chn.n, chn.a, ((smem_aux_t *)buf)->seq_buf); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); kv_init(regs); for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s); + mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s, buf); free(chn.a[i].seeds); } free(chn.a); @@ -1352,30 +1245,17 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * return a; } -typedef struct { - const mem_opt_t *opt; - const bwt_t *bwt; - const FMTIndex *fmt; - const bntseq_t *bns; - const uint8_t *pac; - const mem_pestat_t *pes; - smem_aux_t **aux; - bseq1_t *seqs; - mem_alnreg_v *regs; - int64_t n_processed; -} worker_t; - static void worker1(void *data, int i, int tid) { - worker_t *w = (worker_t*)data; + mem_worker_t *w = (mem_worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); - w->regs[i] = mem_align1_core(w->opt, w->bwt, w->fmt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]); + w->regs[i] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]); } else { if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); - w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->fmt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]); + w->regs[i<<1|0] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]); if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); - w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->fmt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]); + w->regs[i<<1|1] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]); } } @@ -1383,7 +1263,7 @@ static void worker2(void *data, int i, int tid) { extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); - worker_t *w = (worker_t*)data; + mem_worker_t *w = (mem_worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); @@ -1397,33 +1277,73 @@ static void worker2(void *data, int i, int tid) } } -void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) +static smem_aux_t *smem_aux_init() +{ + smem_aux_t *a; + a = calloc(1, sizeof(smem_aux_t)); + a->tmpv[0] = calloc(1, sizeof(bwtintv_v)); + a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); + a->sw_buf = calloc(1, sizeof(buf_t)); + a->seq_buf = calloc(1, sizeof(buf_t)); + return a; +} + +static void smem_aux_destroy(smem_aux_t *a) +{ + free(a->tmpv[0]->a); free(a->tmpv[0]); + free(a->tmpv[1]->a); free(a->tmpv[1]); + free(a->mem.a); free(a->mem1.a); + free(a); +} + +// 初始化线程需要的数据 +mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac) +{ + int i; + mem_worker_t *w = calloc(1, sizeof(mem_worker_t)); + w->opt = opt; w->bns = bns; w->pac = pac; w->fmt = fmt; + w->aux = malloc(opt->n_threads * sizeof(smem_aux_t)); + for (i = 0; i < opt->n_threads; ++i) + w->aux[i] = smem_aux_init(); + return w; +} + +void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) { extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); - worker_t w; mem_pestat_t pes[4]; double ctime, rtime; - int i; ctime = cputime(); rtime = realtime(); - global_bns = bns; - w.regs = malloc(n * sizeof(mem_alnreg_v)); - w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; - w.seqs = seqs; w.n_processed = n_processed; - w.pes = &pes[0]; w.fmt = fmt; - w.aux = malloc(opt->n_threads * sizeof(smem_aux_t)); - for (i = 0; i < opt->n_threads; ++i) - w.aux[i] = smem_aux_init(); - kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions - for (i = 0; i < opt->n_threads; ++i) - smem_aux_destroy(w.aux[i]); - free(w.aux); + global_bns = w->bns; + + if (w->n < n) { + w->n = n; + w->regs = realloc(w->regs, n * sizeof(mem_alnreg_v)); + } + w->seqs = seqs; w->n_processed = n_processed; + w->pes = &pes[0]; + +#ifdef SHOW_PERF + int64_t tmp_time = realtime_msec(); +#endif + kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions +#ifdef SHOW_PERF + tmp_time = realtime_msec() - tmp_time; + __sync_fetch_and_add(&time_work1, tmp_time); + tmp_time = realtime_msec(); +#endif + if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 - else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data + else mem_pestat(opt, w->bns->l_pac, n, w->regs, pes); // otherwise, infer the insert size distribution from data } - kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment - free(w.regs); + kt_for(opt->n_threads, worker2, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment +#ifdef SHOW_PERF + tmp_time = realtime_msec() - tmp_time; + __sync_fetch_and_add(&time_work2, tmp_time); +#endif + //free(w.regs); if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); } diff --git a/bwamem.h b/bwamem.h index 6f92c5e..4a615c0 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,6 +31,7 @@ #include "bntseq.h" #include "bwa.h" #include "fmt_idx.h" +#include "utils.h" #define MEM_MAPQ_COEF 30.0 #define MEM_MAPQ_MAX 60 @@ -124,9 +125,46 @@ typedef struct { // This struct is only used for the convenience of API. int score, sub, alt_sc; } mem_aln_t; +typedef struct { + int64_t rbeg; + int32_t qbeg, len; + int score; +} mem_seed_t; // unaligned memory + +typedef struct { + int n, m, first, rid; + uint32_t w:29, kept:2, is_alt:1; + float frac_rep; + int64_t pos; + mem_seed_t *seeds; +} mem_chain_t; + +typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; + +typedef struct { + bwtintv_v mem, mem1, *tmpv[2]; + buf_t *sw_buf, *seq_buf; +} smem_aux_t; + +typedef struct { + const mem_opt_t *opt; + const bwt_t *bwt; + const FMTIndex *fmt; + const bntseq_t *bns; + const uint8_t *pac; + const mem_pestat_t *pes; + smem_aux_t **aux; + bseq1_t *seqs; + mem_alnreg_v *regs; + int64_t n_processed; + int64_t n; +} mem_worker_t; + + #ifdef __cplusplus extern "C" { #endif + mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac); smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); @@ -159,7 +197,7 @@ extern "C" { * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. */ - void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); + void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); /** * Find the aligned regions for one query sequence diff --git a/bwamem_pair.c b/bwamem_pair.c index ef79521..bba5c2b 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -377,12 +377,12 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co aa[i][n_aa[i]++] = g[i]; } } + s[0].sam.l = 0; for (i = 0; i < n_aa[0]; ++i) - mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits - s[0].sam = strdup(str.s); str.l = 0; + mem_aln2sam(opt, bns, &s[0].sam, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits + s[1].sam.l = 0; for (i = 0; i < n_aa[1]; ++i) - mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits - s[1].sam = str.s; + mem_aln2sam(opt, bns, &s[1].sam, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); // free for (i = 0; i < 2; ++i) { diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index d225187..5a996d4 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -130,7 +130,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0); + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0, 0); if (score > p->G) { // extensible p->G = score; p->k -= tle; @@ -158,7 +158,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1; + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; @@ -728,10 +728,11 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c { gzFile fp, fp2; kseq_t *ks, *ks2; - int l, is_pe = 0, i, n; + int l, is_pe = 0, i, n, m = 0; + int64_t seq_size = 0; uint8_t *pac; bsw2seq_t *_seq; - bseq1_t *bseq; + bseq1_t *bseq = 0; pac = calloc(bns->l_pac/4+1, 1); for (l = 0; l < bns->n_seqs; ++l) @@ -745,7 +746,8 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c ks2 = kseq_init(fp2); is_pe = 1; } else fp2 = 0, ks2 = 0, is_pe = 0; - while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2, 1, &seq_size, &m, &bseq); + while (n > 0) { int size = 0; if (n > _seq->max) { _seq->max = n; @@ -761,10 +763,11 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c size += p->l; } fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); - free(bseq); process_seqs(_seq, opt, bns, pac, target, is_pe); + bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2, 1, &seq_size, &m, &bseq); } // free + if (bseq) free(bseq); free(pac); free(_seq->seq); free(_seq); kseq_destroy(ks); diff --git a/fastmap.c b/fastmap.c index 6a3aa35..ea0ca56 100644 --- a/fastmap.c +++ b/fastmap.c @@ -56,7 +56,10 @@ int64_t time_ksw_extend2 = 0, time_bwt_occ4 = 0, time_bwt_sa = 0, time_bwt_sa_read = 0, - time_bns = 0; + time_bns = 0, + time_work1 = 0, + time_work2 = 0, + time_flt_chain = 0; int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0, num_sa = 0; int64_t s1n = 0, s2n = 0, s3n = 0, s1l = 0, s2l = 0, s3l = 0; @@ -71,6 +74,12 @@ void *kopen(const char *fn, int *_fd); int kclose(void *a); void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); +typedef struct { + int n_seqs; + int m_seqs; + bseq1_t *seqs; +} ktp_data_t; + typedef struct { kseq_t *ks, *ks2; mem_opt_t *opt; @@ -78,40 +87,30 @@ typedef struct { int64_t n_processed; int copy_comment, actual_chunk_size; bwaidx_t *idx; + mem_worker_t *w; + int data_idx; // pingpong buffer index + ktp_data_t *data; } ktp_aux_t; -typedef struct { - ktp_aux_t *aux; - int n_seqs; - bseq1_t *seqs; -} ktp_data_t; - static void *process(void *shared, int step, void *_data) { ktp_aux_t *aux = (ktp_aux_t*)shared; ktp_data_t *data = (ktp_data_t*)_data; int i; if (step == 0) { - ktp_data_t *ret; + ktp_data_t *ret = aux->data + aux->data_idx; + aux->data_idx = !aux->data_idx; int64_t size = 0; - ret = calloc(1, sizeof(ktp_data_t)); - ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2); - if (ret->seqs == 0) { - free(ret); + bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2, aux->copy_comment, &size, &ret->m_seqs, &ret->seqs); + if (ret->n_seqs == 0) { return 0; } - if (!aux->copy_comment) - for (i = 0; i < ret->n_seqs; ++i) { - free(ret->seqs[i].comment); - ret->seqs[i].comment = 0; - } - for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size); return ret; } else if (step == 1) { const mem_opt_t *opt = aux->opt; - const bwaidx_t *idx = aux->idx; + // const bwaidx_t *idx = aux->idx; if (opt->flag & MEM_F_SMARTPE) { bseq1_t *sep[2]; int n_sep[2]; @@ -121,28 +120,25 @@ static void *process(void *shared, int step, void *_data) fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]); if (n_sep[0]) { tmp_opt.flag &= ~MEM_F_PE; - mem_process_seqs(&tmp_opt, idx->bwt, idx->fmt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0); + mem_process_seqs(&tmp_opt, aux->w, aux->n_processed, n_sep[0], sep[0], 0); for (i = 0; i < n_sep[0]; ++i) data->seqs[sep[0][i].id].sam = sep[0][i].sam; } if (n_sep[1]) { tmp_opt.flag |= MEM_F_PE; - mem_process_seqs(&tmp_opt, idx->bwt, idx->fmt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0); + mem_process_seqs(&tmp_opt, aux->w, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0); for (i = 0; i < n_sep[1]; ++i) data->seqs[sep[1][i].id].sam = sep[1][i].sam; } free(sep[0]); free(sep[1]); } else - mem_process_seqs(opt, idx->bwt, idx->fmt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0); + mem_process_seqs(opt, aux->w, aux->n_processed, data->n_seqs, data->seqs, aux->pes0); aux->n_processed += data->n_seqs; return data; } else if (step == 2) { for (i = 0; i < data->n_seqs; ++i) { - if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout); - free(data->seqs[i].name); free(data->seqs[i].comment); - free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam); + if (data->seqs[i].sam.l) err_fputs(data->seqs[i].sam.s, stdout); } - free(data->seqs); free(data); return 0; } return 0; @@ -166,8 +162,9 @@ static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) int main_mem(int argc, char *argv[]) { +#ifdef SHOW_PERF fp1 = fopen("./test_out.txt", "w"); - +#endif mem_opt_t *opt, opt0; int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0; int fixed_chunk_size = -1; @@ -418,6 +415,8 @@ int main_mem(int argc, char *argv[]) } } + aux.w = init_mem_worker(opt, aux.idx->fmt, aux.idx->bns, aux.idx->pac); + aux.data = calloc(2, sizeof(ktp_data_t)); bwa_print_sam_hdr(aux.idx->bns, hdr_line); aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; @@ -434,20 +433,14 @@ int main_mem(int argc, char *argv[]) #ifdef SHOW_PERF fprintf(stderr, "\n"); + fprintf(stderr, "time_work1: %f s\n", time_work1 / 1000.0 / 1); + fprintf(stderr, "time_work2: %f s\n", time_work2 / 1000.0 / 1); + fprintf(stderr, "time_flt_chain: %f s\n", time_flt_chain / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_1: %f s\n", time_seed_1 / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_2: %f s\n", time_seed_2 / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_3: %f s\n", time_seed_3 / 1000.0 / opt->n_threads); fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads); fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads); - fprintf(stderr, "time_bns: %f s\n", time_bns / 1000.0 / opt->n_threads); - fprintf(stderr, "s1 num: %ld\n", s1n); - fprintf(stderr, "s2 num: %ld\n", s2n); - fprintf(stderr, "s3 num: %ld\n", s3n); - fprintf(stderr, "s1 len: %ld\n", s1l / s1n); - fprintf(stderr, "s2 len: %ld\n", s2l / s2n); - fprintf(stderr, "s3 len: %ld\n", s3l / s3n); - fprintf(stderr, "get sa num: %ld\n", num_sa); - fprintf(stderr, "seed 2 num: %ld\n", g_num_smem2); fprintf(stderr, "\n"); fclose(fp1); diff --git a/fmt_idx.c b/fmt_idx.c index 95578b4..c7b3f4e 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -565,19 +565,19 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv if (q[i] < 4 && q[i + 1] < 4) { fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); -// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); -// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); CHECK_INTV_CHANGE(ik, ok1, i + 1); CHECK_INTV_CHANGE(ik, ok2, i + 2); // 在这里进行判断是否只有一个候选了 - //if (min_intv == 1 && ok2.x[2] == min_intv) - //{ - // direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); - // kv_push(bwtintv_t, *mem, mt); - // ret = (uint32_t)mt.info; - // if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end; - // goto backward_search; - //} + if (min_intv == 1 && ok2.x[2] == min_intv) + { + direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); + kv_push(bwtintv_t, *mem, mt); + ret = (uint32_t)mt.info; + if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end; + goto backward_search; + } } else if (q[i] < 4) // q[i+1] >= 4 { fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); @@ -639,8 +639,8 @@ backward_search: if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展 { fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); -// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2); -// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2); CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); ok1.info = p->info; CHECK_INTV_ADD_MEM(ok2, i, ok1, mem); @@ -694,9 +694,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len-1); // 初始碱基位置 ik.info = x + kmer_len; - //fmt_set_intv(fmt, q[x], ik); - //ik.info = x + 1; - // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); @@ -713,8 +710,8 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in if (q[i] < 4 && q[i + 1] < 4) { fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); -// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); -// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len); ik = ok2; diff --git a/fmt_idx.h b/fmt_idx.h index bc55b74..3b3c628 100644 --- a/fmt_idx.h +++ b/fmt_idx.h @@ -32,9 +32,11 @@ Date : 2023/12/24 #elif FMT_MID_INTERVAL == 16 #define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 80)) #elif FMT_MID_INTERVAL == 32 -#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48)) +//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48)) +#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8) * 80) #elif FMT_MID_INTERVAL == 64 -#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32)) +//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32)) +#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8 << 6)) #elif FMT_MID_INTERVAL == 128 #define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24)) #else diff --git a/ksw.c b/ksw.c index 1e584e9..21fb7e8 100644 --- a/ksw.c +++ b/ksw.c @@ -413,15 +413,24 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf) { eh_t *eh; // score array int8_t *qp; // query profile int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int mem_size = qlen * m + (qlen + 1) * 8; assert(h0 > 0); // allocate memory - qp = malloc(qlen * m); - eh = calloc(qlen + 1, 8); + if (mem_size > buf->m) { + buf->m = mem_size; + buf->addr = realloc(buf->addr, mem_size); + } + //qp = malloc(qlen * m); + //eh = calloc(qlen + 1, 8); + qp = (int8_t*)buf->addr; + eh = (eh_t*)(buf->addr + qlen * m); + memset(eh, 0, (qlen + 1) * 8); + // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; @@ -505,7 +514,7 @@ int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, end = j + 2 < qlen? j + 2 : qlen; //beg = 0; end = qlen; // uncomment this line for debugging } - free(eh); free(qp); + // free(eh); free(qp); if (_qle) *_qle = max_j + 1; if (_tle) *_tle = max_i + 1; if (_gtle) *_gtle = max_ie + 1; @@ -514,9 +523,12 @@ int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, return max; } -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off, buf_t *buf) { - return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off); + if (buf == 0) buf = calloc(1, sizeof(buf_t)); + return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off, buf); + free(buf->addr); + free(buf); } /******************** diff --git a/ksw.h b/ksw.h index 5d45a67..c14caa8 100644 --- a/ksw.h +++ b/ksw.h @@ -2,6 +2,7 @@ #define __AC_KSW_H #include +#include "utils.h" #define KSW_XBYTE 0x10000 #define KSW_XSTOP 0x20000 @@ -104,8 +105,10 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); - int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off, buf_t *buf); + int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off, buf_t *buf); + int ksw_extend2_avx2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, + int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf); #ifdef __cplusplus } diff --git a/ksw_extend2_avx2.c b/ksw_extend2_avx2.c new file mode 100644 index 0000000..dc4a8f1 --- /dev/null +++ b/ksw_extend2_avx2.c @@ -0,0 +1,504 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +#undef MAX +#undef MIN +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define SIMD_WIDTH 16 + +typedef struct { size_t m; uint8_t *addr; } buf_t; + +extern int ksw_extend2_avx2_u8(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, + int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf); + +int ksw_extend2_origin(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, + int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); + + +static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { + {0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff} +}; + +#define permute_mask _MM_SHUFFLE(0, 1, 2, 3) + +// 初始化变量 +#define SIMD_INIT \ + int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ + __m256i zero_vec; \ + __m256i max_vec; \ + __m256i oe_del_vec; \ + __m256i oe_ins_vec; \ + __m256i e_del_vec; \ + __m256i e_ins_vec; \ + __m256i h_vec_mask[SIMD_WIDTH]; \ + zero_vec = _mm256_setzero_si256(); \ + oe_del_vec = _mm256_set1_epi16(-oe_del); \ + oe_ins_vec = _mm256_set1_epi16(-oe_ins); \ + e_del_vec = _mm256_set1_epi16(-e_del); \ + e_ins_vec = _mm256_set1_epi16(-e_ins); \ + __m256i match_sc_vec = _mm256_set1_epi16(a); \ + __m256i mis_sc_vec = _mm256_set1_epi16(-b); \ + __m256i amb_sc_vec = _mm256_set1_epi16(-1); \ + __m256i amb_vec = _mm256_set1_epi16(4); \ + for (i=0; i 0) { \ + for(j=beg, i=iend; j<=end; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { \ + __m256i h2_vec = _mm256_loadu_si256((__m256i*) (&hA2[j])); \ + __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \ + uint32_t mask = _mm256_movemask_epi8(vcmp); \ + if (mask > 0) { \ + int pos = SIMD_WIDTH - 1 - (( __builtin_clz(mask)) >> 1); \ + mj = j - 1 + pos; \ + mi = i - 1 - pos; \ + } \ + } \ + } + +// 每轮迭代后,交换数组 +#define SWAP_DATA_POINTER \ + int16_t * tmp=hA0; \ + hA0 = hA1; hA1 = hA2; hA2 = tmp; \ + tmp = eA1; eA1 = eA2; eA2 = tmp; \ + tmp = fA1; fA1 = fA2; fA2 = tmp; \ + tmp = mA1; mA1 = mA2; mA2 = tmp; + + +int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 + const uint8_t *query, // read碱基序列 + int tlen, // target length reference的长度 + const uint8_t *target, // reference序列 + int is_left, // 是不是向左扩展 + int m, // 碱基种类 (5) + const int8_t *mat, // 每个位置的query和target的匹配得分 m*m + int o_del, // deletion 错配开始的惩罚系数 + int e_del, // deletion extension的惩罚系数 + int o_ins, // insertion 错配开始的惩罚系数 + int e_ins, // insertion extension的惩罚系数SIMD_BTYES + int a, // 碱基match时的分数 + int b, // 碱基mismatch时的惩罚分数(正数) + int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 + int end_bonus, + int zdrop, + int h0, // 该seed的初始得分(完全匹配query的碱基数) + int *_qle, // 匹配得到全局最大得分的碱基在query的位置 + int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 + int *_gtle, // query全部匹配上的target的长度 + int *_gscore, // query的端到端匹配得分 + int *_max_off, // 取得最大得分时在query和reference上位置差的 最大值 + buf_t *buf) // 之前已经开辟过的缓存 +{ + //return ksw_extend2_origin(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off); +// fprintf(stderr, "qlen: %d, tlen: %d\n", qlen, tlen); + + if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf); + + int16_t *mA,*hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M + int16_t *seq, *ref; + uint8_t *mem; + int16_t *qtmem, *vmem; + int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; + int i, iStart, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int Dloop = tlen + qlen; // 循环跳出条件 + int span, beg1, end1; // 边界条件计算 + int col_size = qlen + 2 + SIMD_WIDTH; + int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍 + int mem_size = (seq_size + ref_size) * 2 + val_mem_size; + + SIMD_INIT; // 初始化simd用的数据 + + assert(h0 > 0); + + // allocate memory + //mem = malloc(mem_size); + + if (buf->m < mem_size) { + buf->m = mem_size; + buf->addr = realloc(buf->addr, mem_size); + } + mem = buf->addr; + + qtmem = (int16_t *)&mem[0]; + seq=&qtmem[0]; ref=&qtmem[seq_size]; + if (is_left) { + for (i=0; i>1); i+=SIMD_WIDTH) { + _mm256_storeu_si256((__m256i*)&vmem[i], zero_vec); + } + hA = &vmem[0]; + mA = &vmem[col_size * 3]; + eA = &vmem[col_size * 5]; + fA = &vmem[col_size * 7]; + + hA0 = &hA[0]; hA1 = &hA[col_size]; hA2 = &hA1[col_size]; + mA1 = &mA[0]; mA2 = &mA[col_size]; + eA1 = &eA[0]; eA2 = &eA[col_size]; + fA1 = &fA[0]; fA2 = &fA[col_size]; + + // adjust $w if it is too large + k = m * m; + // get the max score + for (i = 0, max = 0; i < k; ++i) max = max > mat[i]? max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); + max_ins = max_ins > 1? max_ins : 1; + w = w < max_ins? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1? max_del : 1; + w = w < max_del? w : max_del; // TODO: is this necessary? + if (tlen < qlen) w = MIN(tlen - 1, w); + + // DP loop + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;; + max_off = 0; + beg = 1; end = qlen; + // init h0 + hA0[0] = h0; // 左上角 + + if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况 + if (w >= qlen) { max_ie = 0; gscore = 0; } + + int m_last=0; + int iend; + + for (D = 1; LIKELY(D < Dloop); ++D) { + // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 + if (D > tlen) { + span = MIN(Dloop-D, w); + beg1 = MAX(D-tlen+1, ((D-w) / 2) + 1); + } else { + span = MIN(D-1, w); + beg1 = MAX(1, ((D-w) / 2) + 1); + } + end1 = MIN(qlen, beg1+span); + + if (beg < beg1) beg = beg1; + if (end > end1) end = end1; + if (beg > end) break; // 不用计算了,直接跳出,否则hA2没有被赋值,里边是上一轮hA0的值,会出bug + + iend = D - (beg - 1); // ref开始计算的位置,倒序 + span = end - beg; + iStart = iend - span - 1; // 0开始的ref索引位置 + + // 每一轮需要记录的数据 + int m = 0, mj = -1, mi = -1; + max_vec = zero_vec; + + // 要处理边界 + // 左边界 处理f (insert) + if (iStart == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); } + // 上边界 + if (beg == 1) { hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); } + else { hA1[beg-1] = 0; eA1[beg-1] = 0; } + + for (j=beg, i=iend; j<=end+1-SIMD_WIDTH; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { + // 取数据 + SIMD_LOAD; + // 比对seq,计算罚分 + SIMD_CMP_SEQ; + // 计算 + SIMD_COMPUTE; + // 存储结果 + SIMD_STORE; + } + // 剩下的计算单元 + if (j <= end) { + // 取数据 + SIMD_LOAD; + // 比对seq,计算罚分 + SIMD_CMP_SEQ; + // 计算 + SIMD_COMPUTE; + // 去除多余计算的部分 + SIMD_REMOVE_EXTRA; + // 存储结果 + SIMD_STORE; + } + + SIMD_FIND_MAX; + + // 注意最后跳出循环j的值 + j = end + 1; + + if (j == qlen + 1) { + max_ie = gscore > hA2[qlen] ? max_ie : iStart; + gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; + } + if (m == 0 && m_last==0) break; // 一定要注意,斜对角遍历和按列遍历的不同点 + if (m > max) { + max = m, max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi)? max_off : abs(mj - mi); + } + else if (zdrop > 0) { + if (mi - max_i > mj - max_j) { + if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break; + } else { + if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) break; + } + } + + // 调整计算的边界 + for (j = beg; LIKELY(j <= end); ++j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; } + beg = j; + for (j = end+1; LIKELY(j >= beg); --j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; else hA0[j-1]=0; } + end = j + 1 <= qlen? j + 1 : qlen; + + m_last = m; + // swap m, h, e, f + SWAP_DATA_POINTER; + } + +// free(mem); + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; + return max; +} + +typedef struct { + int32_t h, e; +} eh_t; + +int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长度 + const uint8_t *query, // read碱基序列 + int tlen, // target length reference的长度 + const uint8_t *target, // reference序列 + int is_left, // 是不是向左扩展 + int m, // 碱基种类 (5) + const int8_t *mat, // 每个位置的query和target的匹配得分 m*m + int o_del, // deletion 错配开始的惩罚系数 + int e_del, // deletion extension的惩罚系数 + int o_ins, // insertion 错配开始的惩罚系数 + int e_ins, // insertion extension的惩罚系数 + int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 + int end_bonus, + int zdrop, + int h0, // 该seed的初始得分(完全匹配query的碱基数) + int *_qle, // 匹配得到全局最大得分的碱基在query的位置 + int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 + int *_gtle, // query全部匹配上的target的长度 + int *_gscore, // query的端到端匹配得分 + int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 +{ + eh_t *eh; // score array + int8_t *qp; // query profile + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + uint8_t *qmem, *ref, *seq; + assert(h0 > 0); + // allocate memory + qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); + qmem = malloc(qlen + tlen); + seq=(uint8_t*)&qmem[0]; ref=(uint8_t*)&qmem[qlen]; + if (is_left) { + for (i=0; i oe_ins? h0 - oe_ins : 0; + for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j) + eh[j].h = eh[j-1].h - e_ins; + // adjust $w if it is too large + k = m * m; + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i]? max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); + max_ins = max_ins > 1? max_ins : 1; + w = w < max_ins? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1? max_del : 1; + w = w < max_del? w : max_del; // TODO: is this necessary? +// printf("%d\n", w); + // DP loop + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; + max_off = 0; + beg = 0, end = qlen; + + for (i = 0; LIKELY(i < tlen); ++i) { + int t, f = 0, h1, m = 0, mj = -1; + int8_t *q = &qp[ref[i] * qlen]; + // apply the band and the constraint (if provided) + if (beg < i - w) beg = i - w; + if (end > i + w + 1) end = i + w + 1; + //if (end > qlen) end = qlen; 没用 + // compute the first column + if (beg == 0) { + h1 = h0 - (o_del + e_del * (i + 1)); + if (h1 < 0) h1 = 0; + } else h1 = 0; + for (j = beg; LIKELY(j < end); ++j) { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape + eh_t *p = &eh[j]; + int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) + p->h = h1; // set H(i,j-1) for the next row + M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M" + h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 + h = h > f? h : f; + h1 = h; // save H(i,j) to h1 for the next column + mj = m > h? mj : j; // record the position where max score is achieved + m = m > h? m : h; // m is stored at eh[mj+1] + t = M - oe_del; + t = t > 0? t : 0; + e -= e_del; + e = e > t? e : t; // computed E(i+1,j) + p->e = e; // save E(i+1,j) for the next row + t = M - oe_ins; + t = t > 0? t : 0; + f -= e_ins; + f = f > t? f : t; // computed F(i,j+1) + } + eh[end].h = h1; eh[end].e = 0; + if (j == qlen) { + max_ie = gscore > h1? max_ie : i; + gscore = gscore > h1? gscore : h1; + } + if (m == 0) break; + if (m > max) { + max = m, max_i = i, max_j = mj; + max_off = max_off > abs(mj - i)? max_off : abs(mj - i); + } else if (zdrop > 0) { + if (i - max_i > mj - max_j) { + if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break; + } else { + if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break; + } + } + // update beg and end for the next round + for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); + beg = j; + for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j); + end = j + 2 < qlen? j + 2 : qlen; + //beg = 0; end = qlen; // uncomment this line for debugging + } + + free(eh); free(qp); free(qmem); + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; + return max; +} diff --git a/ksw_extend2_avx2_u8.c b/ksw_extend2_avx2_u8.c new file mode 100644 index 0000000..7c0d49a --- /dev/null +++ b/ksw_extend2_avx2_u8.c @@ -0,0 +1,379 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +#undef MAX +#undef MIN +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define SIMD_WIDTH 32 + +typedef struct { size_t m; uint8_t *addr; } buf_t; + +static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { + {0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} +}; + +//static const uint8_t reverse_mask[SIMD_WIDTH] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; +static const uint8_t reverse_mask[SIMD_WIDTH] = {7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}; +#define permute_mask _MM_SHUFFLE(0, 1, 2, 3) +//const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3); +// 初始化变量 +#define SIMD_INIT \ + int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ + __m256i zero_vec; \ + __m256i max_vec; \ + __m256i oe_del_vec; \ + __m256i oe_ins_vec; \ + __m256i e_del_vec; \ + __m256i e_ins_vec; \ + __m256i h_vec_mask[SIMD_WIDTH]; \ + __m256i reverse_mask_vec; \ + zero_vec = _mm256_setzero_si256(); \ + oe_del_vec = _mm256_set1_epi8(oe_del); \ + oe_ins_vec = _mm256_set1_epi8(oe_ins); \ + e_del_vec = _mm256_set1_epi8(e_del); \ + e_ins_vec = _mm256_set1_epi8(e_ins); \ + __m256i match_sc_vec = _mm256_set1_epi8(a); \ + __m256i mis_sc_vec = _mm256_set1_epi8(b); \ + __m256i amb_sc_vec = _mm256_set1_epi8(1); \ + __m256i amb_vec = _mm256_set1_epi8(4); \ + reverse_mask_vec = _mm256_loadu_si256((__m256i*) (reverse_mask)); \ + for (i=0; i 0) { \ + for(j=beg, i=iend; j<=end; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { \ + __m256i h2_vec = _mm256_loadu_si256((__m256i*) (&hA2[j])); \ + __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \ + uint32_t mask = _mm256_movemask_epi8(vcmp); \ + if (mask > 0) { \ + int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \ + mj = j - 1 + pos; \ + mi = i - 1 - pos; \ + } \ + } \ + } + +// 每轮迭代后,交换数组 +#define SWAP_DATA_POINTER \ + uint8_t * tmp=hA0; \ + hA0 = hA1; hA1 = hA2; hA2 = tmp; \ + tmp = eA1; eA1 = eA2; eA2 = tmp; \ + tmp = fA1; fA1 = fA2; fA2 = tmp; \ + tmp = mA1; mA1 = mA2; mA2 = tmp; + + +int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长度 + const uint8_t *query, // read碱基序列 + int tlen, // target length reference的长度 + const uint8_t *target, // reference序列 + int is_left, // 是不是向左扩展 + int m, // 碱基种类 (5) + const int8_t *mat, // 每个位置的query和target的匹配得分 m*m + int o_del, // deletion 错配开始的惩罚系数 + int e_del, // deletion extension的惩罚系数 + int o_ins, // insertion 错配开始的惩罚系数 + int e_ins, // insertion extension的惩罚系数 + int a, // 碱基match时的分数 + int b, // 碱基mismatch时的惩罚分数(正数) + int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 + int end_bonus, + int zdrop, + int h0, // 该seed的初始得分(完全匹配query的碱基数) + int *_qle, // 匹配得到全局最大得分的碱基在query的位置 + int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 + int *_gtle, // query全部匹配上的target的长度 + int *_gscore, // query的端到端匹配得分 + int *_max_off, // 取得最大得分时在query和reference上位置差的 最大值 + buf_t *buf) // 之前已经开辟过的缓存 +{ + uint8_t *mA,*hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M + uint8_t *seq, *ref; + uint8_t *mem, *qtmem, *vmem; + int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; + int i, iStart, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int Dloop = tlen + qlen; // 循环跳出条件 + int span, beg1, end1; // 边界条件计算 + int col_size = qlen + 2 + SIMD_WIDTH; + int val_mem_size = (col_size * 9 + 31) >> 5 << 5; // 32字节的整数倍 + int mem_size = seq_size + ref_size + val_mem_size; + + SIMD_INIT; // 初始化simd用的数据 + + assert(h0 > 0); + + // allocate memory + //mem = malloc(mem_size); + if (buf->m < mem_size) { + buf->m = mem_size; + buf->addr = realloc(buf->addr, mem_size); + } + mem = buf->addr; + + qtmem = &mem[0]; + seq=(uint8_t*)&qtmem[0]; ref=(uint8_t*)&qtmem[seq_size]; + if (is_left) { + for (i=0; i mat[i]? max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); + max_ins = max_ins > 1? max_ins : 1; + w = w < max_ins? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1? max_del : 1; + w = w < max_del? w : max_del; // TODO: is this necessary? + if (tlen < qlen) w = MIN(tlen - 1, w); + + // DP loop + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;; + max_off = 0; + beg = 1; end = qlen; + // init h0 + hA0[0] = h0; // 左上角 + + if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况 + if (w >= qlen) { max_ie = 0; gscore = 0; } + + int m_last=0; + int iend; + + for (D = 1; LIKELY(D < Dloop); ++D) { + // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 + if (D > tlen) { + span = MIN(Dloop-D, w); + beg1 = MAX(D-tlen+1, ((D-w) / 2) + 1); + } else { + span = MIN(D-1, w); + beg1 = MAX(1, ((D-w) / 2) + 1); + } + end1 = MIN(qlen, beg1+span); + + if (beg < beg1) beg = beg1; + if (end > end1) end = end1; + if (beg > end) break; // 不用计算了,直接跳出,否则hA2没有被赋值,里边是上一轮hA0的值,会出bug + + iend = D - (beg - 1); // ref开始计算的位置,倒序 + span = end - beg; + iStart = iend - span - 1; // 0开始的ref索引位置 + + // 每一轮需要记录的数据 + int m = 0, mj = -1, mi = -1; + max_vec = zero_vec; + + // 要处理边界 + // 左边界 处理f (insert) + if (iStart == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); } + // 上边界 + if (beg == 1) { hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); } + else { hA1[beg-1] = 0; eA1[beg-1] = 0; } + + for (j=beg, i=iend; j<=end+1-SIMD_WIDTH; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { + // 取数据 + SIMD_LOAD; + // 比对seq,计算罚分 + SIMD_CMP_SEQ; + // 计算 + SIMD_COMPUTE; + // 存储结果 + SIMD_STORE; + } + // 剩下的计算单元 + if (j <= end) { + // 取数据 + SIMD_LOAD; + // 比对seq,计算罚分 + SIMD_CMP_SEQ; + // 计算 + SIMD_COMPUTE; + // 去除多余计算的部分 + SIMD_REMOVE_EXTRA; + // 存储结果 + SIMD_STORE; + } + + SIMD_FIND_MAX; + + // 注意最后跳出循环j的值 + j = end + 1; + + if (j == qlen + 1) { + max_ie = gscore > hA2[qlen] ? max_ie : iStart; + gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; + } + if (m == 0 && m_last==0) break; // 一定要注意,斜对角遍历和按列遍历的不同点 + if (m > max) { + max = m, max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi)? max_off : abs(mj - mi); + } + else if (zdrop > 0) { + if (mi - max_i > mj - max_j) { + if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break; + } else { + if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) break; + } + } + + // 调整计算的边界 + for (j = beg; LIKELY(j <= end); ++j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; } + beg = j; + for (j = end+1; LIKELY(j >= beg); --j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; else hA0[j-1]=0; } + end = j + 1 <= qlen? j + 1 : qlen; + + m_last = m; + // swap m, h, e, f + SWAP_DATA_POINTER; + } + +// free(mem); + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; + return max; +} diff --git a/pemerge.c b/pemerge.c index 725885f..197e87f 100644 --- a/pemerge.c +++ b/pemerge.c @@ -216,9 +216,9 @@ static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cn int main_pemerge(int argc, char *argv[]) { - int c, flag = 0, i, n, min_ovlp = 10; - int64_t cnt[MAX_ERR+1]; - bseq1_t *bseq; + int c, flag = 0, i, n, m = 0, min_ovlp = 10; + int64_t cnt[MAX_ERR + 1], seq_size = 0; + bseq1_t *bseq = 0; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; pem_opt_t *opt; @@ -269,10 +269,12 @@ int main_pemerge(int argc, char *argv[]) } memset(cnt, 0, 8 * (MAX_ERR+1)); - while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2, 1, &seq_size, &m, &bseq); + while (n > 0) { process_seqs(opt, n, bseq, cnt); - free(bseq); + bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2, 1, &seq_size, &m, &bseq); } + if (bseq != 0) free(bseq); fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); for (i = 1; i <= MAX_ERR; ++i) diff --git a/run.sh b/run.sh index 961dc81..b4c21f7 100755 --- a/run.sh +++ b/run.sh @@ -6,8 +6,10 @@ thread=1 #reference=~/data/reference/human_g1k_v37_decoy.fasta #n_r1=~/fastq/sn_r1.fq #n_r2=~/fastq/sn_r2.fq -n_r1=~/fastq/ssn_r1.fq -n_r2=~/fastq/ssn_r2.fq +#n_r1=~/fastq/ssn_r1.fq +#n_r2=~/fastq/ssn_r2.fq +n_r1=~/fastq/ERR194147_1_120w.fastq +n_r2=~/fastq/ERR194147_2_120w.fastq #n_r1=~/fastq/tiny_n_r1.fq #n_r2=~/fastq/tiny_n_r2.fq #n_r1=~/fastq/diff_r1.fq @@ -15,9 +17,10 @@ n_r2=~/fastq/ssn_r2.fq #n_r1=~/fastq/d_r1.fq #n_r2=~/fastq/d_r2.fq reference=~/reference/human_g1k_v37_decoy.fasta -out=./ssn.sam +#out=./all.sam +#out=./ssn.sam #out=./out.sam -#out=/dev/null +out=/dev/null #time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ # /home/zzh/data/reference/human_g1k_v37_decoy.fasta \ # /home/zzh/data/fastq/nm1.fq \ diff --git a/utils.h b/utils.h index 5722787..52107cc 100644 --- a/utils.h +++ b/utils.h @@ -45,7 +45,10 @@ extern int64_t time_ksw_extend2, time_bwt_occ4, time_bwt_sa, time_bwt_sa_read, - time_bns; + time_bns, + time_work1, + time_work2, + time_flt_chain; extern int64_t dn, n16, n17, n18, n19, nall, num_sa; extern int64_t s1n, s2n, s3n, s1l, s2l, s3l; @@ -54,6 +57,11 @@ extern FILE *fp1; #endif +#undef MAX +#undef MIN +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) + #ifdef __GNUC__ // Tell GCC to validate printf format string and args #define ATTRIBUTE(list) __attribute__ (list) @@ -77,6 +85,8 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; +typedef struct { size_t m; uint8_t *addr; } buf_t; + #ifdef __cplusplus extern "C" { #endif