diff --git a/Makefile b/Makefile index 9679d71..fbf2faa 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,9 @@ CC= gcc # CFLAGS= -g -Wall -Wno-unused-function -O2 CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS -SHOW_PERF= -DSHOW_PERF +SHOW_PERF= #-DSHOW_PERF AR= ar -DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) +DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) -DUSE_AVX2 -DKSW_EQUAL #-DUSE_MT_READ #-DFILTER_FULL_MATCH LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \ QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ @@ -17,6 +17,7 @@ PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread -ldl SUBDIRS= . +JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a # -ljemalloc -L/home/zzh/work/jemalloc/lib/ ifeq ($(shell uname -s),Linux) LIBS += -lrt @@ -29,11 +30,11 @@ endif all:$(PROG) -bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) - #bwa:libbwa.a $(AOBJS) main.o -# $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) ../sbwa/jemalloc/lib/libjemalloc.a main.o -o $@ -L. -lbwa $(LIBS) +# $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) + +bwa:libbwa.a $(AOBJS) main.o + $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) $(JE_MALLOC) main.o -o $@ -L. -lbwa $(LIBS) bwamem-lite:libbwa.a example.o $(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS) diff --git a/bntseq.c b/bntseq.c index c844992..c3c246a 100644 --- a/bntseq.c +++ b/bntseq.c @@ -423,6 +423,30 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end return seq; } +void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len, size_t *m_seq, uint8_t **seqp) +{ + if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap + if (end > l_pac<<1) end = l_pac<<1; + if (beg < 0) beg = 0; + if (beg >= l_pac || end <= l_pac) { + int64_t k, l = 0; + *len = end - beg; + if (*len > *m_seq) { + *m_seq = *len; + *seqp = realloc(*seqp, end - beg); + } + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + (*seqp)[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + (*seqp)[l++] = _get_pac(pac, k); + } + } else *len = 0; // if bridging the forward-reverse boundary, return nothing +} + uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid) { int64_t far_beg, far_end, len; @@ -449,3 +473,28 @@ uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, in assert(seq && *end - *beg == len); // assertion failure should never happen return seq; } + +void bns_fetch_seq_no_alloc(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid, size_t *m_seq, uint8_t **seqp) +{ + int64_t far_beg, far_end, len; + int is_rev; + + if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap + assert(*beg <= mid && mid < *end); + *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); + far_beg = bns->anns[*rid].offset; + far_end = far_beg + bns->anns[*rid].len; + if (is_rev) { // flip to the reverse strand + int64_t tmp = far_beg; + far_beg = (bns->l_pac<<1) - far_end; + far_end = (bns->l_pac<<1) - tmp; + } + *beg = *beg > far_beg? *beg : far_beg; + *end = *end < far_end? *end : far_end; + bns_get_seq_no_alloc(bns->l_pac, pac, *beg, *end, &len, m_seq, seqp); + if (*seqp == 0 || *end - *beg != len) { + fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n", + __func__, (long)*beg, (long)mid, (long)*end, (long)len, *seqp, *rid, (long)far_beg, (long)far_end); + } + assert(*seqp && *end - *beg == len); // assertion failure should never happen +} diff --git a/bntseq.h b/bntseq.h index 354adb6..7891d61 100644 --- a/bntseq.h +++ b/bntseq.h @@ -79,7 +79,7 @@ extern "C" { uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); - + void bns_fetch_seq_no_alloc(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid, size_t *m_seq, uint8_t **seqp); #ifdef __cplusplus } #endif diff --git a/bwa.c b/bwa.c index 26c3e4c..a337995 100644 --- a/bwa.c +++ b/bwa.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "bntseq.h" #include "bwa.h" #include "ksw.h" @@ -58,31 +59,160 @@ static inline void trim_readno(kstring_t *s) s->l -= 2, s->s[s->l] = 0; } -static inline char *dupkstring(const kstring_t *str, int dupempty) +static inline void dupkstring(const kstring_t *str, int dupempty, char **dstp, int *sm) { - char *s = (str->l > 0 || dupempty)? malloc(str->l + 1) : NULL; - if (!s) return NULL; + if (!dupempty && str->l == 0) { + if (*dstp) free(*dstp); + *dstp = 0; *sm = 0; + } else if (*dstp == 0 || *sm < str->l) { + *sm = str->l; + *dstp = realloc(*dstp, str->l + 1); + } + + char *s = *dstp; + if (!s) return; memcpy(s, str->s, str->l); s[str->l] = '\0'; - return s; } -static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s, int copy_comment) { // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = dupkstring(&ks->name, 1); - s->comment = dupkstring(&ks->comment, 0); - s->seq = dupkstring(&ks->seq, 1); - s->qual = dupkstring(&ks->qual, 0); + dupkstring(&ks->name, 1, &s->name, &s->m_name); + if (copy_comment) dupkstring(&ks->comment, 0, &s->comment, &s->m_comment); + dupkstring(&ks->seq, 1, &s->seq, &s->m_seq); + dupkstring(&ks->qual, 0, &s->qual, &s->m_qual); s->l_seq = ks->seq.l; } -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +typedef struct { + kseq_t *ks; + bseq1_t *seq; + int start_pos; + int n_bound; + int copy_comment; + int ret_n; + int ret_size; + int ret_status; + int chunk_size; +} read_data_t; + +static void *thread_bseq_read(void *data) { + read_data_t *d = (read_data_t*) data; + kseq_t *ks = d->ks; + bseq1_t *seqs = d->seq; + int copy_comment = d->copy_comment; + int chunk_size = d->chunk_size; + int cur_n = 0, cur_pos = d->start_pos, size = 0; + int ret_status = 1; + + while (cur_n < d->n_bound && (ret_status = kseq_read(ks)) >= 0) { + trim_readno(&ks->name); + kseq2bseq1(ks, seqs + cur_pos, copy_comment); + seqs[cur_pos].id = cur_pos; + size += seqs[cur_pos].l_seq; + cur_pos += 2; cur_n += 1; + if (size >= chunk_size) break; + } + d->ret_n = cur_n; d->ret_size = size; d->ret_status = ret_status; + return 0; +} + +#define READ_ONE_SEQ(ks) \ + trim_readno(&ks->name); \ + kseq2bseq1(ks, &seqs[n], copy_comment); \ + seqs[n].id = n; \ + size += seqs[n++].l_seq; + +// multi thread reading input seqs +void bseq_read_pe_mt(int chunk_size, int *n_, void *ks1_, void *ks2_, int copy_comment, int64_t *size_, int *m_, bseq1_t **seqs_ptr) { + kseq_t *ks = (kseq_t *)ks1_, *ks2 = (kseq_t *)ks2_; + int size = 0, m = *m_, n = 0; + bseq1_t *seqs = *seqs_ptr; + read_data_t d[2]; + pthread_t tid[2]; + + if (m == 0) { // 还没开辟空间,要初始化 + seqs = calloc(20, sizeof(bseq1_t)); // 先读取20个reads,根据reads的长度和chunk size决定要读取多少条reads + int ks1_ret = 0, ks2_ret = 0; + int i = 10; + while (i-- > 0) { + ks1_ret = kseq_read(ks); + if (ks1_ret < 0) break; + ks2_ret = kseq_read(ks2); + if (ks2_ret < 0) { + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + READ_ONE_SEQ(ks); + READ_ONE_SEQ(ks2); + } + if (ks1_ret < 0 || ks2_ret < 0) { + if (size == 0 && kseq_read(ks2) >= 0) { // test if the 2nd file is finished + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; *seqs_ptr = seqs; + return; + } + // 重新开辟空间 + m = (chunk_size + size / 10 - 1) / (size / 10) * 2; + *m_ = m; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + memset(seqs + n, 0, sizeof(bseq1_t)); + *seqs_ptr = seqs; + } + + d[0].copy_comment = copy_comment; d[1].copy_comment = copy_comment; + d[0].ks = ks ; d[0].seq = &seqs[0]; d[0].n_bound = (m >> 1) - (n>>1); d[0].start_pos = n; + d[1].ks = ks2; d[1].seq = &seqs[0]; d[1].n_bound = (m >> 1) - (n>>1); d[1].start_pos = n+1; + d[0].chunk_size = d[1].chunk_size = (chunk_size + 1) >> 1; + + pthread_create(&tid[0], 0, thread_bseq_read, &d[0]); + pthread_create(&tid[1], 0, thread_bseq_read, &d[1]); + pthread_join(tid[0], 0); pthread_join(tid[1], 0); + + size += d[0].ret_size + d[1].ret_size; + n += d[0].ret_n + d[1].ret_n; + + if (size == 0 && kseq_read(ks2) >= 0) { // test if the 2nd file is finished + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } else if (size < chunk_size && d[0].ret_status > 0 && d[1].ret_status > 0) { + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + memset(seqs + n, 0, (m-n) * sizeof(bseq1_t)); + } + READ_ONE_SEQ(ks); + if (ks2) { + READ_ONE_SEQ(ks2); + } + if (size >= chunk_size && (n&1) == 0) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + } + + *n_ = n; *size_ = size; + if (m > *m_) *m_ = m; + *seqs_ptr = seqs; +} + +void bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_, int copy_comment, int64_t *size_, int *m_, bseq1_t **seqs_ptr) +{ +#ifdef USE_MT_READ + if (ks2_) return bseq_read_pe_mt(chunk_size, n_, ks1_, ks2_, copy_comment, size_, m_, seqs_ptr); +#endif kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; int size = 0, m, n; - bseq1_t *seqs; - m = n = 0; seqs = 0; + bseq1_t *seqs = *seqs_ptr; + n = 0; m = *m_; while (kseq_read(ks) >= 0) { if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); @@ -91,16 +221,11 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) if (n >= m) { m = m? m<<1 : 256; seqs = realloc(seqs, m * sizeof(bseq1_t)); + memset(seqs + n, 0, (m-n) * sizeof(bseq1_t)); } - trim_readno(&ks->name); - kseq2bseq1(ks, &seqs[n]); - seqs[n].id = n; - size += seqs[n++].l_seq; + READ_ONE_SEQ(ks); if (ks2) { - trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n]); - seqs[n].id = n; - size += seqs[n++].l_seq; + READ_ONE_SEQ(ks2); } if (size >= chunk_size && (n&1) == 0) break; } @@ -108,8 +233,9 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } - *n_ = n; - return seqs; + *n_ = n; *size_ = size; + if (m > *m_) *m_ = m; + *seqs_ptr = seqs; } void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]) @@ -303,7 +429,7 @@ FMTIndex *bwa_idx_load_fmt(const char *hint) sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL); strcpy(fmt_idx_fn, hint); strcpy(fmt_idx_fn + l_hint, suffix); - sprintf(suffix, ".14.xmer"); + sprintf(suffix, ".%d.kmer", HASH_KMER_LEN); strcpy(kmer_idx_fn, hint); strcpy(kmer_idx_fn + l_hint, suffix); @@ -339,6 +465,7 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which) return 0; } idx = calloc(1, sizeof(bwaidx_t)); + if (which & BWA_IDX_BWT) idx->fmt = bwa_idx_load_fmt(hint); //if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); //idx->bwt->kmer_hash = idx->fmt->kmer_hash; diff --git a/bwa.h b/bwa.h index aa155b6..5e125a8 100644 --- a/bwa.h +++ b/bwa.h @@ -29,6 +29,7 @@ #include #include "bntseq.h" +#include "kstring.h" #include "bwt.h" #include "fmt_idx.h" @@ -59,7 +60,9 @@ typedef struct { typedef struct { int l_seq, id; - char *name, *comment, *seq, *qual, *sam; + int m_name, m_comment, m_seq, m_qual; + char *name, *comment, *seq, *qual; + kstring_t sam; } bseq1_t; extern int bwa_verbose, bwa_dbg; @@ -69,7 +72,8 @@ extern char bwa_rg_id[256]; extern "C" { #endif - bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + //bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + void bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_, int copy_comment, int64_t *size_, int *m_, bseq1_t **seqs_ptr); void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]); void bwa_fill_scmat(int a, int b, int8_t mat[25]); diff --git a/bwamem.c b/bwamem.c index 8e5d465..41f38ea 100644 --- a/bwamem.c +++ b/bwamem.c @@ -117,170 +117,96 @@ mem_opt_t *mem_opt_init() #define intv_lt(a, b) ((a).info < (b).info) KSORT_INIT(mem_intv, bwtintv_t, intv_lt) -typedef struct { - int *full_match; - bwtintv_v *mem, *mem1, *tmpv[2]; -} smem_aux_t; - -static smem_aux_t *smem_aux_init(int batch_size) +static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *a) { - smem_aux_t *a; - a = calloc(1, sizeof(smem_aux_t)); - a->mem = calloc(batch_size, sizeof(bwtintv_v)); - a->mem1 = calloc(batch_size, sizeof(bwtintv_v)); - a->tmpv[0] = calloc(1, sizeof(bwtintv_v)); - a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); - a->full_match = calloc(batch_size, sizeof(int)); - return a; -} - -static void smem_aux_destroy(smem_aux_t *a, int batch_size) -{ - int i; - free(a->tmpv[0]->a); free(a->tmpv[0]); - free(a->tmpv[1]->a); free(a->tmpv[1]); - for (i = 0; i < batch_size; ++i) { - free(a->mem[i].a); free(a->mem1[i].a); - } - free(a->mem); free(a->mem1); - free(a->full_match); - free(a); -} - -#define USE_FMT 1 - -static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, bseq1_t *seq_arr, int nseq, smem_aux_t *a) -{ - int si, i, k, x, old_n, len, slen, start, end; + int i, k, x = 0, old_n; int start_width = 1; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); - int max_seed_len, start_N_num, start_flag; - uint8_t *seq; - bwtintv_v *mem1, *mem; - bwtintv_t *p; + int max_seed_len = 0; + int start_N_num = 0, start_flag = 1; + a->mem.n = 0; - // 1. first pass: find all SMEMs #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - for (si = 0; si < nseq; ++si) { - x = 0; max_seed_len = 0; start_N_num = 0; start_flag = 1; - len = seq_arr[si].l_seq; seq = (uint8_t*) seq_arr[si].seq; - mem1 = &a->mem1[si]; mem = &a->mem[si]; mem->n = 0; - while (x < len) { - if (seq[x] < 4) { - start_flag = 0; -#if USE_FMT - x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, mem1, a->tmpv[0]); -#else - x = bwt_smem1(bwt, len, seq, x, start_width, mem1, a->tmpv); -#endif - for (i = 0; i < mem1->n; ++i) { - p = &mem1->a[i]; - slen = (uint32_t)p->info - (p->info >> 32); // seed length - max_seed_len = MAX(max_seed_len, slen); - if (slen >= opt->min_seed_len) { - kv_push(bwtintv_t, *mem, *p); - } + // 1. first pass: find all SMEMs + while (x < len) { + if (seq[x] < 4) { + start_flag = 0; + x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, &a->mem1, a->tmpv[0]); + for (i = 0; i < a->mem1.n; ++i) { + bwtintv_t *p = &a->mem1.a[i]; + int slen = (uint32_t)p->info - (p->info >> 32); // seed length + max_seed_len = MAX(max_seed_len, slen); + if (slen >= opt->min_seed_len) { + kv_push(bwtintv_t, a->mem, *p); } - } else { - ++x; - if (start_flag) ++start_N_num; } + //break; // for test full match time + } else { + ++x; + if (start_flag) ++start_N_num; } - if (max_seed_len == len - start_N_num) a->full_match[si] = 1; } #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_seed_1, tmp_time); -#endif - -#ifdef SHOW_PERF tmp_time = realtime_msec(); #endif - // 2. second pass: find MEMs inside a long SMEM - for (si = 0; si < nseq; ++si) { - len = seq_arr[si].l_seq; seq = (uint8_t*) seq_arr[si].seq; - mem1 = &a->mem1[si]; mem = &a->mem[si]; - old_n = mem->n; -// if (a->full_match[si]) continue; - for (k = 0; k < old_n; ++k) { - p = &mem->a[k]; - start = p->info >> 32; end = (int32_t)p->info; - if (end - start < split_len || p->x[2] > opt->split_width) continue; -#if USE_FMT - fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, mem1, a->tmpv[0]); -#else - bwt_smem1(bwt, len, seq, (start + end) >> 1, p->x[2] + 1, mem1, a->tmpv); + +#ifdef FILTER_FULL_MATCH + if (max_seed_len == len - start_N_num) goto collect_intv_end; #endif - for (i = 0; i < mem1->n; ++i) { - p = &mem1->a[i]; - slen = (uint32_t)p->info - (p->info >> 32); - if (slen >= opt->min_seed_len) { - kv_push(bwtintv_t, *mem, *p); - } + + // 2. second pass: find MEMs inside a long SMEM + old_n = a->mem.n; + for (k = 0; k < old_n; ++k) { + bwtintv_t *p = &a->mem.a[k]; + int start = p->info>>32, end = (int32_t)p->info; + if (end - start < split_len || p->x[2] > opt->split_width) continue; + fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, &a->mem1, a->tmpv[0]); + for (i = 0; i < a->mem1.n; ++i) { + bwtintv_t *p = &a->mem1.a[i]; + int slen = (uint32_t)p->info - (p->info >> 32); + if (slen >= opt->min_seed_len) { + kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } } } #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_seed_2, tmp_time); -#endif - -#ifdef SHOW_PERF tmp_time = realtime_msec(); #endif // 3. third pass: LAST-like if (opt->max_mem_intv > 0) { - for (si = 0; si < nseq; ++si) { - // if (a->full_match[si]) continue; - len = seq_arr[si].l_seq; seq = (uint8_t*) seq_arr[si].seq; - x = 0; mem = &a->mem[si]; - while (x < len) { - if (seq[x] < 4) { - bwtintv_t m; -#if USE_FMT - x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); -#else - x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); -#endif - if (m.x[2] > 0) { - kv_push(bwtintv_t, *mem, m); - } - } else ++x; - } + x = 0; + while (x < len) { + if (seq[x] < 4) { + bwtintv_t m; + x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); + if (m.x[2] > 0) { + kv_push(bwtintv_t, a->mem, m); + } + } else ++x; } } #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_seed_3, tmp_time); #endif - // 4. sort - for (si = 0; si < nseq; ++si) { - ks_introsort(mem_intv, a->mem[si].n, a->mem[si].a); - } + +#ifdef FILTER_FULL_MATCH +collect_intv_end: +#endif + // sort + ks_introsort(mem_intv, a->mem.n, a->mem.a); } /************ * Chaining * ************/ -typedef struct { - int64_t rbeg; - int32_t qbeg, len; - int score; -} mem_seed_t; // unaligned memory - -typedef struct { - int n, m, first, rid; - uint32_t w:29, kept:2, is_alt:1; - float frac_rep; - int64_t pos; - mem_seed_t *seeds; -} mem_chain_t; - -typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; - #include "kbtree.h" #define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) @@ -289,6 +215,7 @@ KBTREE_INIT(chn, mem_chain_t, chain_cmp) // return 1 if the seed is merged into the chain static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid) { + //return 0; int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; @@ -297,8 +224,12 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand + + //return 0; //到这里都一样的 x = p->qbeg - last->qbeg; // always non-negtive y = p->rbeg - last->rbeg; + //fprintf(stderr, "x: %ld, y: %ld\n", x, y); + // 这里的代码是不稳定的,因为他只用当前的seed和chain里的last做比较,而chain里所有的seed都是无序的,他们的顺序跟处理seed的先后有关 if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; @@ -348,21 +279,28 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) } } -void mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, const bntseq_t *bns, bseq1_t *seq_arr, int nseq, mem_chain_v *chns, void *buf) +#define uint64_lt(a, b) ((a) > (b)) +KSORT_INIT(uint64_sort, uint64_t, uint64_lt) +mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf) { - int si, i, b, e, l_rep, len; - int64_t l_pac = bns->l_pac, k; - mem_chain_v *chain; + int i, b, e, l_rep; + int64_t l_pac = bns->l_pac; + mem_chain_v chain; kbtree_t(chn) *tree; smem_aux_t *aux; - bwtintv_v *mem; + + uint64_v v1, v2; + kv_init(v1); + kv_init(v2); + + kv_init(chain); + if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); - aux = (smem_aux_t*)buf; - // 1. find smem - mem_collect_intv(opt, bwt, fmt, seq_arr, nseq, aux); - // 2. chain + aux = (smem_aux_t*)buf; + mem_collect_intv(opt, fmt, len, seq, aux); + #define CHECK_ADD_CHAIN(tmp, lower, upper) \ int rid, to_add = 0; \ mem_chain_t tmp, *lower, *upper; \ @@ -391,64 +329,92 @@ void mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, cons kb_putp(chn, tree, &tmp); \ } - for (si = 0; si < nseq; ++si) { - tree->n_keys = 0; - tree->n_nodes = 1; - tree->root->n = 0; - tree->root->is_internal = 0; - //tree = kb_init(chn, KB_DEFAULT_SIZE); - - len = seq_arr[si].l_seq; - mem = &aux->mem[si]; - for (i = 0, b = e = l_rep = 0; i < mem->n; ++i) { // compute frac_rep - bwtintv_t *p = &mem->a[i]; - int sb = (p->info>>32), se = (uint32_t)p->info; - if (p->x[2] <= opt->max_occ) continue; - if (sb > e) l_rep += e - b, b = sb, e = se; - else e = e > se? e : se; - } - l_rep += e - b; - - for (i = 0; i < mem->n; ++i) { - bwtintv_t *p = &mem->a[i]; - int step, count, slen = (uint32_t)p->info - (p->info >> 32); // seed length - if (p->num_match > 0) { - mem_seed_t s; - s.rbeg = p->rm[0].rs; - if (p->rm[0].reverse) s.rbeg = (fmt->l_pac << 1) - 1 - s.rbeg; - CHECK_ADD_CHAIN(tmp, lower, upper); - } else { - step = p->x[2] > opt->max_occ ? p->x[2] / opt->max_occ : 1; - for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { - mem_seed_t s; -#ifdef SHOW_PERF - int64_t tmp_time = realtime_msec(); -#endif -#if USE_FMT - s.rbeg = fmt_sa(fmt, p->x[0] + k); -#else - s.rbeg = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference -#endif -#ifdef SHOW_PERF - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bwt_sa, tmp_time); -#endif - CHECK_ADD_CHAIN(tmp, lower, upper); - } - } - } - - chain = &chns[si]; - kv_resize(mem_chain_t, *chain, kb_size(tree)); -#define traverse_func(p_) (chain->a[chain->n++] = *(p_)) - __kb_traverse(mem_chain_t, tree, traverse_func); -#undef traverse_func - - for (i = 0; i < chain->n; ++i) chain->a[i].frac_rep = (float)l_rep / len; - if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); -// kb_destroy(chn, tree); + for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep + bwtintv_t *p = &aux->mem.a[i]; + int sb = (p->info>>32), se = (uint32_t)p->info; + if (p->x[2] <= opt->max_occ) continue; + if (sb > e) l_rep += e - b, b = sb, e = se; + else e = e > se? e : se; } + l_rep += e - b; + for (i = 0; i < aux->mem.n; ++i) { + bwtintv_t *p = &aux->mem.a[i]; + int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length + int64_t k; + + if (p->num_match > 0) { + mem_seed_t s; + s.rbeg = p->rm[0].rs; + if (p->rm[0].reverse) s.rbeg = (fmt->l_pac << 1) - 1 - s.rbeg; + CHECK_ADD_CHAIN(tmp, lower, upper); + } else { + step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1; + //step = 1; + for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { + mem_seed_t s; +#ifdef SHOW_PERF + int64_t tmp_time = realtime_msec(); +#endif + s.rbeg = fmt_sa(fmt, p->x[0] + k); + //kv_push(uint64_t, v1, s.rbeg); + //fprintf(stderr, "%ld\t", s.rbeg); + //if (p->x[0] == 0) + // s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + p->x[2] - 1 - k); + //else + // s.rbeg = fmt_sa(fmt, p->x[0] + k); + //kv_push(uint64_t, v2, s.rbeg); + //fprintf(stderr, "%ld\t", s.rbeg); + //s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + k); + //fprintf(stderr, "%ld\n", s.rbeg); + //if (s.rbeg < fmt->l_pac) { // 在正链 + // s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg; + //} else { // 在反向互补链 + // s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg; + //} + //s.rbeg = fmt_sa(fmt, p->x[0] + k); +#ifdef SHOW_PERF + tmp_time = realtime_msec() - tmp_time; + __sync_fetch_and_add(&time_bwt_sa, tmp_time); +#endif + CHECK_ADD_CHAIN(tmp, lower, upper); + } + //fprintf(stderr, "\n"); + } + } + //fprintf(stderr, "split\n"); + kv_resize(mem_chain_t, chain, kb_size(tree)); + + #define traverse_func(p_) (chain.a[chain.n++] = *(p_)) + __kb_traverse(mem_chain_t, tree, traverse_func); + #undef traverse_func + + for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len; + if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); kb_destroy(chn, tree); + +#if 0 + for (i = 0; i < chain.n; ++i) + { + fprintf(gfp1, "chain-begin: %d\n", i); + int j; + fprintf(gfp1, "chain-%ld %d\n", chain.a[i].pos, chain.a[i].rid); + for (j = 0; j < chain.a[i].n; ++j) { + fprintf(gfp1, "chain-%d %ld %d\n", chain.a[i].seeds[j].qbeg, chain.a[i].seeds[j].rbeg, chain.a[i].seeds[j].len); + } + fprintf(gfp1, "chain-end\n"); + } + fprintf(gfp1, "fun-end\n"); +#endif + // ks_introsort(uint64_sort, v1.n, v1.a); + // ks_introsort(uint64_sort, v2.n, v2.a); + // for (i = 0; i < v1.n; ++i) { + // //fprintf(stderr, "%ld\t%ld\n", v1.a[i], v2.a[i]); + // if (v1.a[i] != v2.a[i]) { + // fprintf(stderr, "diff: %ld\t%ld\n", v1.a[i], v2.a[i]); + // } + // } + + return chain; } /******************** @@ -705,7 +671,7 @@ int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id #define MEM_MINSC_COEF 5.5f #define MEM_SEEDSW_COEF 0.05f -int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_seed_t *s) +int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_seed_t *s, buf_t *buf) { int qb, qe, rid; int64_t rb, re, mid, l_pac = bns->l_pac; @@ -727,12 +693,14 @@ int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, i if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return -1; // the seed seems good enough; no need to do SW rseq = bns_fetch_seq(bns, pac, &rb, mid, &re, &rid); + //bns_fetch_seq_no_alloc(bns, pac, &rb, mid, &re, &rid, &buf->m, &buf->addr); rseq = buf->addr; + x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); free(rseq); return x.score; } -void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, int n_chn, mem_chain_t *a) +void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, int n_chn, mem_chain_t *a, buf_t *buf) { double min_l = opt->min_chain_weight? MEM_HSP_COEF * opt->min_chain_weight : MEM_MINSC_COEF * log(l_query); int i, j, k, min_HSP_score = (int)(opt->a * min_l + .499); @@ -741,7 +709,7 @@ void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint mem_chain_t *c = &a[i]; for (j = k = 0; j < c->n; ++j) { mem_seed_t *s = &c->seeds[j]; - s->score = mem_seed_sw(opt, bns, pac, l_query, query, s); + s->score = mem_seed_sw(opt, bns, pac, l_query, query, s, buf); if (s->score < 0 || s->score >= min_HSP_score) { s->score = s->score < 0? s->len * opt->a : s->score; c->seeds[k++] = *s; @@ -766,13 +734,14 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) #define MAX_BAND_TRY 2 -void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av, void *buf) { int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; uint64_t *srt; + smem_aux_t *aux = (smem_aux_t*)buf; if (c->n == 0) return; // get the max possible span @@ -794,6 +763,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac } // retrieve the reference sequence rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid); + //bns_fetch_seq_no_alloc(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid, &aux->seq_buf->m, &aux->seq_buf->addr); rseq = aux->seq_buf->addr; + assert(c->rid == rid); srt = malloc(c->n * 8); @@ -850,26 +821,31 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name); if (s->qbeg) { // left extension - uint8_t *rs, *qs; int qle, tle, gtle, gscore; + tmp = s->rbeg - rmax[0]; +#ifndef USE_AVX2 + uint8_t *rs, *qs; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; - tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; +#endif for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[0] = opt->w << i; if (bwa_verbose >= 4) { int j; - printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n'); - printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n'); + printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rseq[tmp - 1 - j]]); putchar('\n'); + printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)query[s->qbeg - 1 - j]]); putchar('\n'); } #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif -// a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); - a->score = ksw_extend2_avx2(s->qbeg, query, tmp, rseq, 1, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); +#ifndef USE_AVX2 + a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0], aux->sw_buf); +#else + a->score = ksw_extend2_avx2(s->qbeg, query, tmp, rseq, 1, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0], aux->sw_buf); +#endif #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_ksw_extend2, tmp_time); @@ -885,7 +861,9 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac a->qb = 0, a->rb = s->rbeg - gtle; a->truesc = gscore; } +#ifndef USE_AVX2 free(qs); free(rs); +#endif } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension @@ -904,8 +882,11 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - //a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); - a->score = ksw_extend2_avx2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 0, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); +#ifndef USE_AVX2 + a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1], aux->sw_buf); +#else + a->score = ksw_extend2_avx2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 0, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1], aux->sw_buf); +#endif #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_ksw_extend2, tmp_time); @@ -935,7 +916,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac a->frac_rep = c->frac_rep; } - free(srt); free(rseq); + free(srt); + free(rseq); } /***************************** @@ -1160,7 +1142,7 @@ void mem_reorder_primary5(int T, mem_alnreg_v *a) void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); - kstring_t str; + // kstring_t str; kvec_t(mem_aln_t) aa; int k, l; char **XA = 0; @@ -1168,7 +1150,8 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, if (!(opt->flag & MEM_F_ALL)) XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); kv_init(aa); - str.l = str.m = 0; str.s = 0; + // str.l = str.m = 0; str.s = 0; + s->sam.l = 0; for (k = l = 0; k < a->n; ++k) { mem_alnreg_t *p = &a->a[k]; mem_aln_t *q; @@ -1191,65 +1174,56 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_aln_t t; t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); t.flag |= extra_flag; - mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m); + mem_aln2sam(opt, bns, &s->sam, s, 1, &t, 0, m); } else { for (k = 0; k < aa.n; ++k) - mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m); + mem_aln2sam(opt, bns, &s->sam, s, aa.n, aa.a, k, m); for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); free(aa.a); } - s->sam = str.s; + // s->sam = str.s; if (XA) { for (k = 0; k < a->n; ++k) free(XA[k]); free(XA); } } -void mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *seq_arr, int nseq, mem_chain_v *chns, mem_alnreg_v *regs, void *buf) +mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf) { - int si, i, j; - // 1. 将seq都转成0,1,2,3 - for (si = 0; si < nseq; ++si) { // convert to 2-bit encoding if we have not done so - const int l_seq = seq_arr[si].l_seq; - char *seq = seq_arr[si].seq; - for (j = 0; j < l_seq; ++j) seq[j] = seq[j] < 4 ? seq[j] : nst_nt4_table[(int)seq[j]]; + int i; + mem_chain_v chn; + mem_alnreg_v regs; + + for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so + seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; + + chn = mem_chain(opt, fmt, bns, l_seq, (uint8_t*)seq, buf); + chn.n = mem_chain_flt(opt, chn.n, chn.a); + mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t *)seq, chn.n, chn.a, ((smem_aux_t *)buf)->seq_buf); + if (bwa_verbose >= 4) mem_print_chain(bns, &chn); + + kv_init(regs); + for (i = 0; i < chn.n; ++i) { + mem_chain_t *p = &chn.a[i]; + if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); + mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s, buf); + free(chn.a[i].seeds); } - - // 2. find smem and chain - mem_chain(opt, bwt, fmt, bns, seq_arr, nseq, chns, buf); - - // 3. filter chain - for (si = 0; si < nseq; ++si) { - chns[si].n = mem_chain_flt(opt, chns[si].n, chns[si].a); - mem_flt_chained_seeds(opt, bns, pac, seq_arr[si].l_seq, (uint8_t *)seq_arr[si].seq, chns[si].n, chns[si].a); - if (bwa_verbose >= 4) mem_print_chain(bns, &chns[si]); - } - - // 4. extend - for (si = 0; si < nseq; ++si) { - mem_chain_v *chn = &chns[si]; - for (i = 0; i < chn->n; ++i) { - mem_chain_t *p = &chn->a[i]; - if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - mem_chain2aln(opt, bns, pac, seq_arr[si].l_seq, (uint8_t *)seq_arr[si].seq, p, ®s[si]); - free(chn->a[i].seeds); - } - free(chn->a); - regs[si].n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t *)seq_arr[si].seq, regs[si].n, regs[si].a); - - if (bwa_verbose >= 4) { - err_printf("* %ld chains remain after removing duplicated chains\n", regs[si].n); - for (i = 0; i < regs[si].n; ++i) { - mem_alnreg_t *p = ®s[si].a[i]; - printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); - } - } - for (i = 0; i < regs[si].n; ++i) { - mem_alnreg_t *p = ®s[si].a[i]; - if (p->rid >= 0 && bns->anns[p->rid].is_alt) - p->is_alt = 1; + free(chn.a); + regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a); + if (bwa_verbose >= 4) { + err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); + for (i = 0; i < regs.n; ++i) { + mem_alnreg_t *p = ®s.a[i]; + printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); } } + for (i = 0; i < regs.n; ++i) { + mem_alnreg_t *p = ®s.a[i]; + if (p->rid >= 0 && bns->anns[p->rid].is_alt) + p->is_alt = 1; + } + return regs; } mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) @@ -1324,44 +1298,25 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * return a; } -typedef struct { - int num_reads; - const mem_opt_t *opt; - const bwt_t *bwt; - const FMTIndex *fmt; - const bntseq_t *bns; - const uint8_t *pac; - const mem_pestat_t *pes; - smem_aux_t **aux; - bseq1_t *seqs; - mem_chain_v *chns; - mem_alnreg_v *regs; - int64_t n_processed; -} worker_t; - static void worker1(void *data, int i, int tid) { - worker_t *w = (worker_t*)data; - int start = i * w->opt->batch_size; - int end = MIN(start + w->opt->batch_size, w->num_reads); - mem_align1_core(w->opt, w->bwt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->chns + start, w->regs + start, w->aux[tid]); - //if (!(w->opt->flag&MEM_F_PE)) { - // if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); - // w->regs[i] = mem_align1_core(w->opt, w->bwt, w->fmt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]); - // - //} else { - // if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); - // w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->fmt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]); - // if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); - // w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->fmt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]); - //} + mem_worker_t *w = (mem_worker_t*)data; + if (!(w->opt->flag&MEM_F_PE)) { + if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); + w->regs[i] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]); + } else { + if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); + w->regs[i<<1|0] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]); + if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); + w->regs[i<<1|1] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]); + } } static void worker2(void *data, int i, int tid) { extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); - worker_t *w = (worker_t*)data; + mem_worker_t *w = (mem_worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); @@ -1375,51 +1330,73 @@ static void worker2(void *data, int i, int tid) } } -void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) +static smem_aux_t *smem_aux_init() { + smem_aux_t *a; + a = calloc(1, sizeof(smem_aux_t)); + a->tmpv[0] = calloc(1, sizeof(bwtintv_v)); + a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); + a->sw_buf = calloc(1, sizeof(buf_t)); + a->seq_buf = calloc(1, sizeof(buf_t)); + return a; +} + +static void smem_aux_destroy(smem_aux_t *a) +{ + free(a->tmpv[0]->a); free(a->tmpv[0]); + free(a->tmpv[1]->a); free(a->tmpv[1]); + free(a->mem.a); free(a->mem1.a); + free(a); +} + +// 初始化线程需要的数据 +mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac) +{ + int i; + mem_worker_t *w = calloc(1, sizeof(mem_worker_t)); + w->opt = opt; w->bns = bns; w->pac = pac; w->fmt = fmt; + w->aux = malloc(opt->n_threads * sizeof(smem_aux_t)); + for (i = 0; i < opt->n_threads; ++i) + w->aux[i] = smem_aux_init(); + return w; +} + +void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) +{ + extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); + mem_pestat_t pes[4]; + double ctime, rtime; + + ctime = cputime(); rtime = realtime(); + global_bns = w->bns; + + if (w->n < n) { + w->n = n; + w->regs = realloc(w->regs, n * sizeof(mem_alnreg_v)); + } + w->seqs = seqs; w->n_processed = n_processed; + w->pes = &pes[0]; + #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); - worker_t w; - mem_pestat_t pes[4]; - double ctime, rtime; - int i; - int n_batch = (n + opt->batch_size - 1) / opt->batch_size; - w.num_reads = n; - - ctime = cputime(); rtime = realtime(); - global_bns = bns; - w.regs = calloc(n, sizeof(mem_alnreg_v)); - w.chns = calloc(n, sizeof(mem_chain_v)); - w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; - w.seqs = seqs; w.n_processed = n_processed; - w.pes = &pes[0]; w.fmt = fmt; - w.aux = malloc(opt->n_threads * sizeof(smem_aux_t)); - for (i = 0; i < opt->n_threads; ++i) - w.aux[i] = smem_aux_init(opt->batch_size); - - //kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions - - kt_for(opt->n_threads, worker1, &w, n_batch); // find mapping positions - - for (i = 0; i < opt->n_threads; ++i) - smem_aux_destroy(w.aux[i], opt->batch_size); - free(w.aux); - free(w.chns); + kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions +#ifdef SHOW_PERF + tmp_time = realtime_msec() - tmp_time; + __sync_fetch_and_add(&time_work1, tmp_time); + tmp_time = realtime_msec(); +#endif if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 - else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data + else mem_pestat(opt, w->bns->l_pac, n, w->regs, pes); // otherwise, infer the insert size distribution from data } - - kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment - free(w.regs); - - if (bwa_verbose >= 3) - fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); + kt_for(opt->n_threads, worker2, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_core_process, tmp_time); + __sync_fetch_and_add(&time_work2, tmp_time); #endif + //free(w.regs); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); } diff --git a/bwamem.h b/bwamem.h index d6e2730..4a615c0 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,6 +31,7 @@ #include "bntseq.h" #include "bwa.h" #include "fmt_idx.h" +#include "utils.h" #define MEM_MAPQ_COEF 30.0 #define MEM_MAPQ_MAX 60 @@ -71,7 +72,6 @@ typedef struct { int max_occ; // skip a seed if its occurence is larger than this value int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed int n_threads; // number of threads - int batch_size; // batch size of seqs to process at one time int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain @@ -125,9 +125,46 @@ typedef struct { // This struct is only used for the convenience of API. int score, sub, alt_sc; } mem_aln_t; +typedef struct { + int64_t rbeg; + int32_t qbeg, len; + int score; +} mem_seed_t; // unaligned memory + +typedef struct { + int n, m, first, rid; + uint32_t w:29, kept:2, is_alt:1; + float frac_rep; + int64_t pos; + mem_seed_t *seeds; +} mem_chain_t; + +typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; + +typedef struct { + bwtintv_v mem, mem1, *tmpv[2]; + buf_t *sw_buf, *seq_buf; +} smem_aux_t; + +typedef struct { + const mem_opt_t *opt; + const bwt_t *bwt; + const FMTIndex *fmt; + const bntseq_t *bns; + const uint8_t *pac; + const mem_pestat_t *pes; + smem_aux_t **aux; + bseq1_t *seqs; + mem_alnreg_v *regs; + int64_t n_processed; + int64_t n; +} mem_worker_t; + + #ifdef __cplusplus extern "C" { #endif + mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac); smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); @@ -160,7 +197,7 @@ extern "C" { * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. */ - void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); + void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); /** * Find the aligned regions for one query sequence diff --git a/bwamem_pair.c b/bwamem_pair.c index ef79521..bba5c2b 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -377,12 +377,12 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co aa[i][n_aa[i]++] = g[i]; } } + s[0].sam.l = 0; for (i = 0; i < n_aa[0]; ++i) - mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits - s[0].sam = strdup(str.s); str.l = 0; + mem_aln2sam(opt, bns, &s[0].sam, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits + s[1].sam.l = 0; for (i = 0; i < n_aa[1]; ++i) - mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits - s[1].sam = str.s; + mem_aln2sam(opt, bns, &s[1].sam, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); // free for (i = 0; i < 2; ++i) { diff --git a/bwt.c b/bwt.c index e3b59d1..7020456 100644 --- a/bwt.c +++ b/bwt.c @@ -116,6 +116,7 @@ inline void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos) // 获取kmer对应的fmt匹配信息, pos should be [0, 13] inline void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos) { +#if HASH_KMER_LEN == 14 if (pos == 13) kmer_getval_at(kmer_hash->ke14[qbit].intv_arr, ok, 0); else if (pos == 12) @@ -126,6 +127,23 @@ inline void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit kmer_getval_at(kmer_hash->ke11[qbit >> 6].intv_arr, ok, 0); else kmer_getval_at(kmer_hash->ke10[qbit >> 8].intv_arr, ok, pos); +#elif HASH_KMER_LEN == 13 + if (pos == 12) + kmer_getval_at(kmer_hash->ke13[qbit].intv_arr, ok, 0); + else if (pos == 11) + kmer_getval_at(kmer_hash->ke12[qbit >> 2].intv_arr, ok, 0); + else if (pos == 10) + kmer_getval_at(kmer_hash->ke11[qbit >> 4].intv_arr, ok, 0); + else + kmer_getval_at(kmer_hash->ke10[qbit >> 6].intv_arr, ok, pos); +#else + if (pos == 11) + kmer_getval_at(kmer_hash->ke12[qbit].intv_arr, ok, 0); + else if (pos == 10) + kmer_getval_at(kmer_hash->ke11[qbit >> 2].intv_arr, ok, 0); + else + kmer_getval_at(kmer_hash->ke10[qbit >> 4].intv_arr, ok, pos); +#endif } // bwt->bwt and bwt->occ must be precalculated diff --git a/bwt.h b/bwt.h index 39b3e46..cb76989 100644 --- a/bwt.h +++ b/bwt.h @@ -45,6 +45,8 @@ typedef unsigned char ubyte_t; typedef uint64_t bwtint_t; +//#define HASH_KMER_LEN 12 +//#define HASH_KMER_LEN 13 #define HASH_KMER_LEN 14 // 用来保存kmer对应的fmt的位置信息 diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index d225187..5a996d4 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -130,7 +130,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0); + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0, 0); if (score > p->G) { // extensible p->G = score; p->k -= tle; @@ -158,7 +158,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1; + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; @@ -728,10 +728,11 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c { gzFile fp, fp2; kseq_t *ks, *ks2; - int l, is_pe = 0, i, n; + int l, is_pe = 0, i, n, m = 0; + int64_t seq_size = 0; uint8_t *pac; bsw2seq_t *_seq; - bseq1_t *bseq; + bseq1_t *bseq = 0; pac = calloc(bns->l_pac/4+1, 1); for (l = 0; l < bns->n_seqs; ++l) @@ -745,7 +746,8 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c ks2 = kseq_init(fp2); is_pe = 1; } else fp2 = 0, ks2 = 0, is_pe = 0; - while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2, 1, &seq_size, &m, &bseq); + while (n > 0) { int size = 0; if (n > _seq->max) { _seq->max = n; @@ -761,10 +763,11 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c size += p->l; } fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); - free(bseq); process_seqs(_seq, opt, bns, pac, target, is_pe); + bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2, 1, &seq_size, &m, &bseq); } // free + if (bseq) free(bseq); free(pac); free(_seq->seq); free(_seq); kseq_destroy(ks); diff --git a/debug.sh b/debug.sh index b906d4e..6ba3248 100755 --- a/debug.sh +++ b/debug.sh @@ -12,6 +12,8 @@ thread=1 #n_r2=~/fastq/tiny_n_r2.fq n_r1=~/fastq/diff_r1.fq n_r2=~/fastq/diff_r2.fq +#n_r1=~/fastq/diff_all_r1.fq +#n_r2=~/fastq/diff_all_r2.fq #n_r1=~/fastq/d_r1.fq #n_r2=~/fastq/d_r2.fq reference=~/reference/human_g1k_v37_decoy.fasta diff --git a/fastmap.c b/fastmap.c index 7eb89a3..3d953d1 100644 --- a/fastmap.c +++ b/fastmap.c @@ -57,12 +57,14 @@ int64_t time_ksw_extend2 = 0, time_bwt_sa = 0, time_bwt_sa_read = 0, time_bns = 0, - time_core_process = 0; + time_work1 = 0, + time_work2 = 0, + time_flt_chain = 0; int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0, num_sa = 0; int64_t s1n = 0, s2n = 0, s3n = 0, s1l = 0, s2l = 0, s3l = 0; int64_t g_num_smem2 = 0; -FILE *fp1; +FILE *gfp1, *gfp2, *gfp3; #endif @@ -72,6 +74,12 @@ void *kopen(const char *fn, int *_fd); int kclose(void *a); void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); +typedef struct { + int n_seqs; + int m_seqs; + bseq1_t *seqs; +} ktp_data_t; + typedef struct { kseq_t *ks, *ks2; mem_opt_t *opt; @@ -79,40 +87,30 @@ typedef struct { int64_t n_processed; int copy_comment, actual_chunk_size; bwaidx_t *idx; + mem_worker_t *w; + int data_idx; // pingpong buffer index + ktp_data_t *data; } ktp_aux_t; -typedef struct { - ktp_aux_t *aux; - int n_seqs; - bseq1_t *seqs; -} ktp_data_t; - static void *process(void *shared, int step, void *_data) { ktp_aux_t *aux = (ktp_aux_t*)shared; ktp_data_t *data = (ktp_data_t*)_data; int i; if (step == 0) { - ktp_data_t *ret; + ktp_data_t *ret = aux->data + aux->data_idx; + aux->data_idx = !aux->data_idx; int64_t size = 0; - ret = calloc(1, sizeof(ktp_data_t)); - ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2); - if (ret->seqs == 0) { - free(ret); + bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2, aux->copy_comment, &size, &ret->m_seqs, &ret->seqs); + if (ret->n_seqs == 0) { return 0; } - if (!aux->copy_comment) - for (i = 0; i < ret->n_seqs; ++i) { - free(ret->seqs[i].comment); - ret->seqs[i].comment = 0; - } - for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size); return ret; } else if (step == 1) { const mem_opt_t *opt = aux->opt; - const bwaidx_t *idx = aux->idx; + // const bwaidx_t *idx = aux->idx; if (opt->flag & MEM_F_SMARTPE) { bseq1_t *sep[2]; int n_sep[2]; @@ -122,28 +120,25 @@ static void *process(void *shared, int step, void *_data) fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]); if (n_sep[0]) { tmp_opt.flag &= ~MEM_F_PE; - mem_process_seqs(&tmp_opt, idx->bwt, idx->fmt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0); + mem_process_seqs(&tmp_opt, aux->w, aux->n_processed, n_sep[0], sep[0], 0); for (i = 0; i < n_sep[0]; ++i) data->seqs[sep[0][i].id].sam = sep[0][i].sam; } if (n_sep[1]) { tmp_opt.flag |= MEM_F_PE; - mem_process_seqs(&tmp_opt, idx->bwt, idx->fmt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0); + mem_process_seqs(&tmp_opt, aux->w, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0); for (i = 0; i < n_sep[1]; ++i) data->seqs[sep[1][i].id].sam = sep[1][i].sam; } free(sep[0]); free(sep[1]); } else - mem_process_seqs(opt, idx->bwt, idx->fmt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0); + mem_process_seqs(opt, aux->w, aux->n_processed, data->n_seqs, data->seqs, aux->pes0); aux->n_processed += data->n_seqs; return data; } else if (step == 2) { for (i = 0; i < data->n_seqs; ++i) { - if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout); - free(data->seqs[i].name); free(data->seqs[i].comment); - free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam); + if (data->seqs[i].sam.l) err_fputs(data->seqs[i].sam.s, stdout); } - free(data->seqs); free(data); return 0; } return 0; @@ -167,8 +162,11 @@ static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) int main_mem(int argc, char *argv[]) { - fp1 = fopen("./test_out.txt", "w"); - +#ifdef SHOW_PERF + gfp1 = fopen("./fp1.txt", "w"); + gfp2 = fopen("./fp2.txt", "w"); + gfp3 = fopen("./fp3.txt", "w"); +#endif mem_opt_t *opt, opt0; int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0; int fixed_chunk_size = -1; @@ -185,7 +183,7 @@ int main_mem(int argc, char *argv[]) aux.opt = opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:b:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) { + while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == '1') no_mt_io = 1; else if (c == 'x') mode = optarg; @@ -195,7 +193,6 @@ int main_mem(int argc, char *argv[]) else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1; else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1; else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; - else if (c == 'b') opt->batch_size = atoi(optarg) >> 1 << 1, opt->batch_size = opt->batch_size > 1? opt->batch_size : 512; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE; @@ -294,13 +291,11 @@ int main_mem(int argc, char *argv[]) } if (opt->n_threads < 1) opt->n_threads = 1; - if (opt->batch_size < 1) opt->batch_size = 512; if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); fprintf(stderr, "Algorithm options:\n\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -b INT batch size of reads to process at one time [%d]\n", opt->batch_size); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); @@ -421,7 +416,11 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } } - + + //fprintf(stderr, "%ld %ld %ld %ld %ld\n", aux.idx->fmt->L2[0], aux.idx->fmt->L2[1], aux.idx->fmt->L2[2], aux.idx->fmt->L2[3], aux.idx->fmt->L2[4]); + //exit(0); + aux.w = init_mem_worker(opt, aux.idx->fmt, aux.idx->bns, aux.idx->pac); + aux.data = calloc(2, sizeof(ktp_data_t)); bwa_print_sam_hdr(aux.idx->bns, hdr_line); aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; @@ -438,24 +437,19 @@ int main_mem(int argc, char *argv[]) #ifdef SHOW_PERF fprintf(stderr, "\n"); + fprintf(stderr, "time_work1: %f s\n", time_work1 / 1000.0 / 1); + fprintf(stderr, "time_work2: %f s\n", time_work2 / 1000.0 / 1); + fprintf(stderr, "time_flt_chain: %f s\n", time_flt_chain / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_1: %f s\n", time_seed_1 / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_2: %f s\n", time_seed_2 / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_3: %f s\n", time_seed_3 / 1000.0 / opt->n_threads); fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads); fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads); - fprintf(stderr, "time_core_process: %f s\n", time_core_process / 1000.0 / opt->n_threads); - fprintf(stderr, "time_bns: %f s\n", time_bns / 1000.0 / opt->n_threads); - fprintf(stderr, "s1 num: %ld\n", s1n); - fprintf(stderr, "s2 num: %ld\n", s2n); - fprintf(stderr, "s3 num: %ld\n", s3n); -// fprintf(stderr, "s1 len: %ld\n", s1l / s1n); -// fprintf(stderr, "s2 len: %ld\n", s2l / s2n); -// fprintf(stderr, "s3 len: %ld\n", s3l / s3n); - fprintf(stderr, "get sa num: %ld\n", num_sa); - fprintf(stderr, "seed 2 num: %ld\n", g_num_smem2); fprintf(stderr, "\n"); - fclose(fp1); + fclose(gfp1); + fclose(gfp2); + fclose(gfp3); #endif return 0; diff --git a/fmt_idx.c b/fmt_idx.c index 36f5245..b00e385 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -144,12 +144,16 @@ KmerHash fmt_restore_kmer_idx(const char *fn) len = 1 << (12 << 1); kh->ke12 = (KmerEntry *)malloc(len * sizeof(KmerEntry)); fread_fix(fp, len * sizeof(KmerEntry), kh->ke12); +#if HASH_KMER_LEN > 12 len = 1 << (13 << 1); kh->ke13 = (KmerEntry *)malloc(len * sizeof(KmerEntry)); fread_fix(fp, len * sizeof(KmerEntry), kh->ke13); +#endif +#if HASH_KMER_LEN > 13 len = 1 << (14 << 1); kh->ke14 = (KmerEntry *)malloc(len * sizeof(KmerEntry)); fread_fix(fp, len * sizeof(KmerEntry), kh->ke14); +#endif err_fclose(fp); return khash; } @@ -414,6 +418,109 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[3] += x >> 24 & 0xff; } +// 扩展两个个碱基,计算bwt base为b的pre-bwt str中各个碱基的occ +inline void fmt_direct_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]) +{ + uint32_t x = 0; + uint32_t *p, *q, tmp; + bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK); // cp = check point + int ti = b1 << 2 | b2; + if (k == (bwtint_t)(-1)) + { + p = fmt->bwt + 4 + b1 * 4; + cnt[3] = p[b2]; + return; + } + k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1) + p = fmt_occ_intv(fmt, k); + q = p + 4 + b1 * 4; + cnt[3] = q[b2]; // b2的occ + p += 20; + + // 使用mid interval信息 + int mk = k % FMT_OCC_INTERVAL; + int n_mintv = mk >> FMT_MID_INTV_SHIFT; + if (n_mintv > 0) // 至少超过了第一个mid interval + { + p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)) - 4; // 对应的mid interval check point的首地址,即A C G T的局部累积量 + q = p + b1; + x = *q; + cnt[3] += x >> (b2 << 3) & 0xff; // b2的occ + x = 0; + p += 4; + } + uint32_t *end = p + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3)); + for (; p < end; ++p) + { + x += __fmt_occ_e2_aux2(fmt, ti, *p); + } + tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); + x += __fmt_occ_e2_aux2(fmt, ti, tmp); + if (b1 == 0) + { + x -= (~k & 7) << 8; + if (b2 == 0) + x -= (~k & 7) << 24; + } + // 如果跨过了second_primary,那么可能需要减掉一次累积值 + if (b1 == fmt->first_base && cp_line < fmt->sec_primary && str_line >= fmt->sec_primary && b2 == fmt->last_base) + cnt[3] -= 1; + cnt[3] += x >> 24 & 0xff; +} + +inline void fmt_direct2_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]) +{ + uint32_t x = 0; + uint32_t *p, *q, tmp; + bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK); // cp = check point + int ti = b1 << 2 | b2; + cnt[1] = 0; + if (k == (bwtint_t)(-1)) + { + p = fmt->bwt + 4 + b1 * 4; + cnt[3] = p[b2]; + return; + } + k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1) + p = fmt_occ_intv(fmt, k); + cnt[1] = p[b1]; // b1的occ + q = p + 4 + b1 * 4; + cnt[3] = q[b2]; // b2的occ + p += 20; + + // 使用mid interval信息 + int mk = k % FMT_OCC_INTERVAL; + int n_mintv = mk >> FMT_MID_INTV_SHIFT; + if (n_mintv > 0) // 至少超过了第一个mid interval + { + p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)) - 4; // 对应的mid interval check point的首地址,即A C G T的局部累积量 + q = p + b1; + x = *q; + cnt[1] += __fmt_mid_sum(x); // b1的occ + cnt[3] += x >> (b2 << 3) & 0xff; // b2的occ + x = 0; + p += 4; + } + uint32_t *end = p + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3)); + for (; p < end; ++p) + { + x += __fmt_occ_e2_aux2(fmt, ti, *p); + } + tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); + x += __fmt_occ_e2_aux2(fmt, ti, tmp); + if (b1 == 0) + { + x -= (~k & 7) << 8; + if (b2 == 0) + x -= (~k & 7) << 24; + } + // 如果跨过了second_primary,那么可能需要减掉一次累积值 + if (b1 == fmt->first_base && cp_line < fmt->sec_primary && str_line >= fmt->sec_primary && b2 == fmt->last_base) + cnt[3] -= 1; + cnt[1] += x >> 8 & 0xff; + cnt[3] += x >> 24 & 0xff; +} + // 扩展两个碱基 inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2) { @@ -434,6 +541,33 @@ inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwti *ok2 = intv; } +// 扩展两个碱基 +inline void fmt_direct_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2) +{ + bwtint_t tk[4], tl[4]; + // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 + fmt_direct_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk); + fmt_direct_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl); + ok2->x[!is_back] = fmt->L2[b2] + 1 + tk[3]; + ok2->x[2] = tl[3] - tk[3]; + ok2->x[is_back] = 0; +} + +// 扩展两个碱基 +inline void fmt_direct2_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2) +{ + bwtint_t tk[4], tl[4]; + // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 + fmt_direct2_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk); + fmt_direct2_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl); + ok1->x[!is_back] = fmt->L2[b1] + 1 + tk[1]; + ok1->x[2] = tl[1] - tk[1]; + ok1->x[is_back] = 0; + ok2->x[!is_back] = fmt->L2[b2] + 1 + tk[3]; + ok2->x[2] = tl[3] - tk[3]; + ok2->x[is_back] = 0; +} + // 扩展一个碱基 inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1) { @@ -448,6 +582,17 @@ inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int i // 第一次正向扩展 ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; } +inline void fmt_direct_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1) +{ + bwtint_t tk[4], tl[4]; + int b2 = 3; // 如果只扩展一次,那么第二个碱基设置成T,可以减小一些计算量,如计算大于b2的累积数量 + // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 + fmt_direct2_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk); + fmt_direct2_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl); + // 这里是反向扩展 + ok->x[!is_back] = fmt->L2[b1] + 1 + tk[1]; + ok->x[2] = tl[1] - tk[1]; +} // 序列和参考基因直接对比 static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt) @@ -457,7 +602,8 @@ static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int le while (k != qcond && r != rcond) \ { \ const int base = PAC_BASE(fmt->pac, r); \ - if (q[k] != base) break; \ + if (q[k] != base) \ + break; \ k += qstep; \ r += rstep; \ } @@ -465,7 +611,8 @@ static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int le while (k != qcond && r != rcond) \ { \ const int base = 3 - PAC_BASE(fmt->pac, r); \ - if (q[k] != base) break; \ + if (q[k] != base) \ + break; \ k += qstep; \ r += rstep; \ } @@ -510,7 +657,8 @@ static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int le static inline void fmt_reverse_intvs(bwtintv_v *p) { - if (p->n > 1) { + if (p->n > 1) + { int j; for (j = 0; j < p->n >> 1; ++j) { @@ -531,10 +679,12 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv uint32_t qbit = 0; mem->n = 0; int only_forward = x == 0 || q[x - 1] > 3; - if (q[x] > 3) return x + 1; + if (q[x] > 3) + return x + 1; - if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 - curr = tmpvec; // use the temporary vector if provided + if (min_intv < 1) + min_intv = 1; // the interval size should be at least 1 + curr = tmpvec; // use the temporary vector if provided qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置 @@ -542,13 +692,26 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv // check change of the interval size and whether the interval size is too small to be extended further #define CHECK_INTV_CHANGE(iv, ov, end_pos) \ - if (ov.x[2] != iv.x[2]) { kv_push(bwtintv_t, *curr, iv); if (ov.x[2] < min_intv) break; } iv = ov; iv.info = end_pos -#define PUSH_VAL_AND_SKIP(iv) \ - do { kv_push(bwtintv_t, *curr, iv); goto backward_search; } while(0) + if (ov.x[2] != iv.x[2]) \ + { \ + kv_push(bwtintv_t, *curr, iv); \ + if (ov.x[2] < min_intv) \ + break; \ + } \ + iv = ov; \ + iv.info = end_pos +#define PUSH_VAL_AND_SKIP(iv) \ + do \ + { \ + kv_push(bwtintv_t, *curr, iv); \ + goto backward_search; \ + } while (0) // 处理kmer对应的匹配信息 - if (only_forward) j = kmer_len - 1; - else j = 1; + if (only_forward) + j = kmer_len - 1; + else + j = 1; for (curr->n = 0; j < kmer_len; ++j) { bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j); @@ -558,8 +721,8 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv PUSH_VAL_AND_SKIP(ik); // 扩展kmer之后的碱基 -// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); -// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); + // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); + // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); for (i = (int)ik.info; i + 1 < len; i += 2) { // forward search if (q[i] < 4 && q[i + 1] < 4) @@ -569,18 +732,20 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); CHECK_INTV_CHANGE(ik, ok1, i + 1); CHECK_INTV_CHANGE(ik, ok2, i + 2); - -#if 1 // 间隔为1的时候直接与reference比对 - if (min_intv == 1 && ok2.x[2] == min_intv) // 在这里进行判断是否只有一个候选了 + // 在这里进行判断是否只有一个候选了 +#if 1 + if (min_intv == 1 && ok2.x[2] == min_intv) { direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); kv_push(bwtintv_t, *mem, mt); ret = (uint32_t)mt.info; - if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end; + if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + goto fmt_smem_end; goto backward_search; } #endif - } else if (q[i] < 4) // q[i+1] >= 4 + } + else if (q[i] < 4) // q[i+1] >= 4 { fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); CHECK_INTV_CHANGE(ik, ok1, i + 1); @@ -605,56 +770,72 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end backward_search: - fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first - if (mt.num_match == 0) ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置 + fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first + if (mt.num_match == 0) + ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置 - // 按照种子进行遍历,反向扩展 -#define CHECK_ADD_MEM(pos, intv, mem) \ - if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) { \ - (intv).info |= (uint64_t)(pos) << 32; kv_push(bwtintv_t, *mem, (intv)); \ + // 按照种子进行遍历,反向扩展 +#define CHECK_ADD_MEM(pos, intv, mem) \ + if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) \ + { \ + (intv).info |= (uint64_t)(pos) << 32; \ + kv_push(bwtintv_t, *mem, (intv)); \ } #define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \ - if (ok.x[2] < min_intv) { CHECK_ADD_MEM(pos, intv, mem); break; } + if (ok.x[2] < min_intv) \ + { \ + CHECK_ADD_MEM(pos, intv, mem); \ + break; \ + } int last_kmer_start = 0; for (j = 0; j < curr->n; ++j) { bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 -// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); -// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); -#if 1 - if (!only_forward && p->info - x < HASH_KMER_LEN) { + // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); + // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); + if (!only_forward && p->info - x < HASH_KMER_LEN) + { if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4) qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer - else qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer + else + qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer last_kmer_start = p->info - 1; i = 1; - do { bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv); - if (i > 2) continue; - p->x[0] = ik.x[1]; p->x[1] = ik.x[0]; p->x[2] = ik.x[2]; + do + { + bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++); + } while (ik.x[2] < min_intv); + if (i > 2) + continue; + p->x[0] = ik.x[1]; + p->x[1] = ik.x[0]; + p->x[2] = ik.x[2]; i = p->info - (kmer_len - i + 3); - } else { + } + else + { i = x - 1; } -#else - i = x - 1; -#endif - for (; i > 0; i -= 2) + for (; i > 0; i -= 2) { if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展 { - fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); + // fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); + fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2); CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); ok1.info = p->info; CHECK_INTV_ADD_MEM(ok2, i, ok1, mem); - ok2.info = p->info; *p = ok2; + ok2.info = p->info; + *p = ok2; } else if (q[i] < 4) // 只能扩展一个 { - fmt_extend1(fmt, p, &ok1, 1, q[i]); + // fmt_extend1(fmt, p, &ok1, 1, q[i]); + fmt_direct_extend1(fmt, p, &ok1, 1, q[i]); CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); ok1.info = p->info; CHECK_ADD_MEM(i, ok1, mem); @@ -668,16 +849,22 @@ backward_search: } for (; i == 0; --i) { // 扩展到了第一个碱基 - if (q[i] < 4) { - fmt_extend1(fmt, p, &ok1, 1, q[i]); + if (q[i] < 4) + { + // fmt_extend1(fmt, p, &ok1, 1, q[i]); + fmt_direct_extend1(fmt, p, &ok1, 1, q[i]); CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); - ok1.info = p->info; *p = ok1; - } else { + ok1.info = p->info; + *p = ok1; + } + else + { CHECK_ADD_MEM(i + 1, *p, mem); goto fmt_smem_end; } } - if (i == -1) { + if (i == -1) + { CHECK_ADD_MEM(i + 1, *p, mem); goto fmt_smem_end; } @@ -690,21 +877,21 @@ fmt_smem_end: int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem) { - int i, kmer_len; - bwtintv_t ik = {0}, ok1={0}, ok2={0}; + int i, kmer_len, first_extend_len; + bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0}; uint64_t qbit; memset(mem, 0, sizeof(bwtintv_t)); - if (q[x] > 3) return x + 1; + if (q[x] > 3) + return x + 1; + if (len - x <= min_len) + return len; qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); - bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len-1); // 初始碱基位置 + bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - 1); // 初始碱基位置 ik.info = x + kmer_len; - //fmt_set_intv(fmt, q[x], ik); - //ik.info = x + 1; - -// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); -// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); + // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); + // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); #define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \ if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \ @@ -713,12 +900,35 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in (ov).info = (uint64_t)start_pos << 32 | (end_pos + 1); \ return end_pos + 1; \ } +#if 1 + first_extend_len = x + min_len + 1; + first_extend_len = MIN(len, first_extend_len); + for (i = (int)ik.info; i + 1 < first_extend_len; i += 2) + { + if (q[i] < 4 && q[i + 1] < 4) + { + // fmt_direct_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); + fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); + ik = ok2; + } + else if (q[i] < 4) + return i + 2; + else + return i + 1; + } + COND_SET_RETURN(ik, *mem, x, i - 1, max_intv, min_len); + for (; i + 1 < len; i += 2) +#else for (i = (int)ik.info; i + 1 < len; i += 2) +#endif { // forward search if (q[i] < 4 && q[i + 1] < 4) { fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); + // fmt_direct2_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); @@ -736,7 +946,8 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in return i + 1; } } - if (i == len - 1) { + if (i == len - 1) + { fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); } @@ -750,7 +961,7 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_ uint8_t base2; // 第一步,找到check point位置 p = fmt_occ_intv(fmt, k); // check point起始位置 - p += 20; // bwt碱基起始位置 + p += 20; // bwt碱基起始位置 // 第二步,找到mid check point位置 int mk = k & FMT_OCC_INTV_MASK; int n_mintv = mk >> FMT_MID_INTV_SHIFT; @@ -783,7 +994,9 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k) { ++sa; fmt_previous_line(fmt, k, &k1, &k2); - if (!(k1 & mask)) { + __builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2); + if (!(k1 & mask)) + { k = k1; break; } diff --git a/fmt_idx.h b/fmt_idx.h index bc55b74..6774bc9 100644 --- a/fmt_idx.h +++ b/fmt_idx.h @@ -21,8 +21,6 @@ Date : 2023/12/24 #define FMT_MID_INTERVAL (1 << FMT_MID_INTV_SHIFT) #define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1) -// #undef FMT_MID_INTERVAL - // 获取碱基c(待查找序列的首个碱基)和对应的互补碱基对应的行,以及间隔 #define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0) // k行(bwt str行(不包含$))对应的check point occ数据起始地址(小于k且是OCC_INTERVAL的整数倍) @@ -32,9 +30,11 @@ Date : 2023/12/24 #elif FMT_MID_INTERVAL == 16 #define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 80)) #elif FMT_MID_INTERVAL == 32 -#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48)) +//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48)) +#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8) * 80) #elif FMT_MID_INTERVAL == 64 -#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32)) +//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32)) +#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8 << 6)) #elif FMT_MID_INTERVAL == 128 #define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24)) #else @@ -89,10 +89,15 @@ void fmt_restore_sa(const char *fn, FMTIndex *fmt); // 根据interval-bwt创建fmt-index FMTIndex *create_fmt_from_bwt(bwt_t *bwt); // 扩展两个个碱基,计算bwt base为b的pre-bwt str中各个碱基的occ +void fmt_direct_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]); +void fmt_direct2_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]); void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]); // 扩展两个碱基 +void fmt_direct2_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2); +void fmt_direct_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2); void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2); // 扩展一个碱基 +void fmt_direct_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1); void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1); // 生成所有KMER_LEN长度的序列,字符串表示 void gen_all_seq(char **seq_arr, int kmer_len); diff --git a/ksw.c b/ksw.c index 1e584e9..21fb7e8 100644 --- a/ksw.c +++ b/ksw.c @@ -413,15 +413,24 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf) { eh_t *eh; // score array int8_t *qp; // query profile int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int mem_size = qlen * m + (qlen + 1) * 8; assert(h0 > 0); // allocate memory - qp = malloc(qlen * m); - eh = calloc(qlen + 1, 8); + if (mem_size > buf->m) { + buf->m = mem_size; + buf->addr = realloc(buf->addr, mem_size); + } + //qp = malloc(qlen * m); + //eh = calloc(qlen + 1, 8); + qp = (int8_t*)buf->addr; + eh = (eh_t*)(buf->addr + qlen * m); + memset(eh, 0, (qlen + 1) * 8); + // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; @@ -505,7 +514,7 @@ int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, end = j + 2 < qlen? j + 2 : qlen; //beg = 0; end = qlen; // uncomment this line for debugging } - free(eh); free(qp); + // free(eh); free(qp); if (_qle) *_qle = max_j + 1; if (_tle) *_tle = max_i + 1; if (_gtle) *_gtle = max_ie + 1; @@ -514,9 +523,12 @@ int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, return max; } -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off, buf_t *buf) { - return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off); + if (buf == 0) buf = calloc(1, sizeof(buf_t)); + return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off, buf); + free(buf->addr); + free(buf); } /******************** diff --git a/ksw.h b/ksw.h index f5b28cf..c14caa8 100644 --- a/ksw.h +++ b/ksw.h @@ -2,6 +2,7 @@ #define __AC_KSW_H #include +#include "utils.h" #define KSW_XBYTE 0x10000 #define KSW_XSTOP 0x20000 @@ -104,10 +105,10 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); - int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off, buf_t *buf); + int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off, buf_t *buf); int ksw_extend2_avx2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, - int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); + int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf); #ifdef __cplusplus } diff --git a/ksw_extend2_avx2.c b/ksw_extend2_avx2.c index 7a0026b..6684129 100644 --- a/ksw_extend2_avx2.c +++ b/ksw_extend2_avx2.c @@ -1,11 +1,16 @@ #include #include #include -#include +#include #include #include #include +extern FILE *gfp1, *gfp2, *gfp3; +//#define DEBUG_SW_EXTEND 1 + +#define NO_VAL -1 + #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x),1) #define UNLIKELY(x) __builtin_expect((x),0) @@ -20,12 +25,13 @@ #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define SIMD_WIDTH 16 +typedef struct { size_t m; uint8_t *addr; } buf_t; + extern int ksw_extend2_avx2_u8(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, - int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); - -int ksw_extend2_origin(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, - int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); + int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf); +int ksw_extend2_origin(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, + int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { {0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, @@ -152,6 +158,7 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { int pos = SIMD_WIDTH - 1 - (( __builtin_clz(mask)) >> 1); \ mj = j - 1 + pos; \ mi = i - 1 - pos; \ + /*if (m >= max) fprintf(stderr, "%d %d %d %d %d %d %d\n", iend, beg, mi, mj, mask, pos, m);*/ \ } \ } \ } @@ -186,19 +193,18 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 int *_gtle, // query全部匹配上的target的长度 int *_gscore, // query的端到端匹配得分 - int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 + int *_max_off, // 取得最大得分时在query和reference上位置差的 最大值 + buf_t *buf) // 之前已经开辟过的缓存 { //return ksw_extend2_origin(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off); -// fprintf(stderr, "qlen: %d, tlen: %d\n", qlen, tlen); - if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off); - + if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf); int16_t *mA,*hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M int16_t *seq, *ref; uint8_t *mem; int16_t *qtmem, *vmem; int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; - int i, iStart, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; int Dloop = tlen + qlen; // 循环跳出条件 int span, beg1, end1; // 边界条件计算 int col_size = qlen + 2 + SIMD_WIDTH; @@ -210,8 +216,15 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 assert(h0 > 0); // allocate memory - mem = malloc(mem_size); - qtmem = (int16_t*)&mem[0]; + //mem = malloc(mem_size); + + if (buf->m < mem_size) { + buf->m = mem_size; + buf->addr = realloc(buf->addr, mem_size); + } + mem = buf->addr; + + qtmem = (int16_t *)&mem[0]; seq=&qtmem[0]; ref=&qtmem[seq_size]; if (is_left) { for (i=0; i 0) + marr[mi] = MAX(marr[mi], hA2[midx - mi]); + } + midx += 1; + if (ibeg > icheck) + { + int stopCalc = 0; + for (mi = 0; mi < b - 1; ++mi) + { + stopCalc |= !marr[mi]; + } + if (stopCalc) + break; + else + checkspecial = 0; + } + } +#else + if (hA1[0] < 4 && checkspecial) { // b == 4 + if (hA1[0] == 3) { + icheck = iend + 1; + } else if (midx == 2) { + m2 = MAX(m2, hA2[midx - 1]); + } else { + m2 = MAX(m2, hA2[midx - 1]); + m1 = MAX(m1, hA2[midx - 2]); + } + m3 = MAX(m3, hA2[midx]); + midx += 1; + if (ibeg > icheck) + { + if (!m1 || !m2 || !m3) + break; + else + checkspecial = 0; + } + + //if (print_flag) { + //fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA2[midx + 1], hA2[midx + 2], hA2[midx + 3], midx); + //if (midx > 2) fprintf(stderr, "%d, %d, %d\n", hA2[midx-1], hA2[midx-2], hA2[midx-3]); + //fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, hA1: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA1[0], m1, m2, m3, midx); + //} + } +#endif +#endif + +#ifdef DEBUG_SW_EXTEND + for (djj = beg; djj <= end; ++djj) + { + dii = D - djj + 1; + ins[dii][djj] = fA2[djj]; + del[dii][djj] = eA2[djj]; + score[dii][djj] = hA2[djj]; + } + //if (print_flag) + //{ + //fprintf(stderr, "score: %d %d %d\n", hA2[beg], hA2[beg+1], hA2[beg+2]); + //} +#endif + // 注意最后跳出循环j的值 j = end + 1; if (j == qlen + 1) { - max_ie = gscore > hA2[qlen] ? max_ie : iStart; + max_ie = gscore > hA2[qlen] ? max_ie : ibeg; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; } if (m == 0 && m_last==0) break; // 一定要注意,斜对角遍历和按列遍历的不同点 + //if (m == 0 && m_last < 2) break; if (m > max) { max = m, max_i = mi, max_j = mj; - max_off = max_off > abs(mj - mi)? max_off : abs(mj - mi); + max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); + } else if (m == max && max_i >= mi && mj > max_j) { + max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); } else if (zdrop > 0) { if (mi - max_i > mj - max_j) { @@ -347,7 +472,49 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 SWAP_DATA_POINTER; } - free(mem); +#ifdef DEBUG_SW_EXTEND + fprintf(gfp1, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gfp2, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gfp3, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); +#endif +#ifdef DEBUG_SW_EXTEND + fprintf(gfp1, "%-4d", -1); + fprintf(gfp2, "%-4d", -1); + fprintf(gfp3, "%-4d", -1); + fprintf(gfp1, "%-4d", -1); + fprintf(gfp2, "%-4d", -1); + fprintf(gfp3, "%-4d", -1); + for (djj = 0; djj < qlen; ++djj) { + fprintf(gfp1, "%-4c", "ACGTN"[query[djj]]); + fprintf(gfp2, "%-4c", "ACGTN"[query[djj]]); + fprintf(gfp3, "%-4c", "ACGTN"[query[djj]]); + } + fprintf(gfp1, "\n"); + fprintf(gfp2, "\n"); + fprintf(gfp3, "\n"); + for (dii = 0; dii <= tlen; ++dii) + { + if (dii > 0) { + fprintf(gfp1, "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gfp2, "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gfp3, "%-4c", "ACGTN"[target[dii - 1]]); + } else { + fprintf(gfp1, "%-4d", -1); + fprintf(gfp2, "%-4d", -1); + fprintf(gfp3, "%-4d", -1); + } + for (djj = 0; djj <= qlen; ++djj) + { + fprintf(gfp1, "%-4d", score[dii][djj]); + fprintf(gfp2, "%-4d", ins[dii][djj]); + fprintf(gfp3, "%-4d", del[dii][djj]); + } + fprintf(gfp1, "\n"); + fprintf(gfp2, "\n"); + fprintf(gfp3, "\n"); + } +#endif + // free(mem); if (_qle) *_qle = max_j + 1; if (_tle) *_tle = max_i + 1; if (_gtle) *_gtle = max_ie + 1; @@ -417,12 +584,39 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); max_del = max_del > 1? max_del : 1; w = w < max_del? w : max_del; // TODO: is this necessary? -// printf("%d\n", w); + //fprintf(stderr, "%d\n", w); // DP loop max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; max_off = 0; beg = 0, end = qlen; + //int print_flag = 0; //(qlen == 116 && tlen == 241); + //fprintf(stderr, "%d %d %d\n", print_flag, qlen, tlen); +#ifdef DEBUG_SW_EXTEND + int dii, djj; + int16_t ins[tlen + 1][qlen + 2]; + int16_t del[tlen + 1][qlen + 2]; + int16_t score[tlen + 1][qlen + 2]; + for (dii = 0; dii <= tlen; ++dii) + { + for (djj = 0; djj <= qlen; ++djj) + { + ins[dii][djj] = del[dii][djj] = score[dii][djj] = NO_VAL; + } + } + for (dii = 1; dii <= tlen; ++dii) + { + del[dii][0] = MAX(0, h0 - o_del - e_del * dii); + score[dii][0] = del[dii][0]; + } + for (djj = 1; djj <= qlen; ++djj) + { + ins[0][djj] = MAX(0, h0 - o_ins - e_ins * djj); + score[0][djj] = ins[0][djj]; + } + ins[0][0] = del[0][0] = score[0][0] = h0; +#endif + for (i = 0; LIKELY(i < tlen); ++i) { int t, f = 0, h1, m = 0, mj = -1; int8_t *q = &qp[ref[i] * qlen]; @@ -435,7 +629,11 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 h1 = h0 - (o_del + e_del * (i + 1)); if (h1 < 0) h1 = 0; } else h1 = 0; + //m = h1; for (j = beg; LIKELY(j < end); ++j) { +#ifdef DEBUG_SW_EXTEND + ins[i+1][j+1] = f; +#endif // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) // Similar to SSE2-SW, cells are computed in the following order: // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} @@ -444,16 +642,22 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 eh_t *p = &eh[j]; int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) p->h = h1; // set H(i,j-1) for the next row - M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M" + M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M",保证分值不小于0,sw和nw的区别 h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 h = h > f? h : f; h1 = h; // save H(i,j) to h1 for the next column +#ifdef DEBUG_SW_EXTEND + score[i+1][j+1] = h; +#endif mj = m > h? mj : j; // record the position where max score is achieved m = m > h? m : h; // m is stored at eh[mj+1] t = M - oe_del; t = t > 0? t : 0; e -= e_del; e = e > t? e : t; // computed E(i+1,j) +#ifdef DEBUG_SW_EXTEND + del[i+1][j+1] = e; +#endif p->e = e; // save E(i+1,j) for the next row t = M - oe_ins; t = t > 0? t : 0; @@ -469,6 +673,7 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 if (m > max) { max = m, max_i = i, max_j = mj; max_off = max_off > abs(mj - i)? max_off : abs(mj - i); + //fprintf(stderr, "%d %d %d %d\n", i, mj, max_off, m); } else if (zdrop > 0) { if (i - max_i > mj - max_j) { if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break; @@ -477,12 +682,61 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 } } // update beg and end for the next round - for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); + for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); // 这里为什么不考虑f(insert score) beg = j; for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j); end = j + 2 < qlen? j + 2 : qlen; //beg = 0; end = qlen; // uncomment this line for debugging + //if (print_flag) { + // fprintf(stderr, "beg: %d; end: %d\n", beg, end); + //} } +#ifdef DEBUG_SW_EXTEND + fprintf(gfp1, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gfp2, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gfp3, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); +#endif +#ifdef DEBUG_SW_EXTEND + fprintf(gfp1, "%-4d", -1); + fprintf(gfp2, "%-4d", -1); + fprintf(gfp3, "%-4d", -1); + fprintf(gfp1, "%-4d", -1); + fprintf(gfp2, "%-4d", -1); + fprintf(gfp3, "%-4d", -1); + for (djj = 0; djj < qlen; ++djj) + { + fprintf(gfp1, "%-4c", "ACGTN"[query[djj]]); + fprintf(gfp2, "%-4c", "ACGTN"[query[djj]]); + fprintf(gfp3, "%-4c", "ACGTN"[query[djj]]); + } + fprintf(gfp1, "\n"); + fprintf(gfp2, "\n"); + fprintf(gfp3, "\n"); + for (dii = 0; dii <= tlen; ++dii) + { + if (dii > 0) + { + fprintf(gfp1, "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gfp2, "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gfp3, "%-4c", "ACGTN"[target[dii - 1]]); + } + else + { + fprintf(gfp1, "%-4d", -1); + fprintf(gfp2, "%-4d", -1); + fprintf(gfp3, "%-4d", -1); + } + for (djj = 0; djj <= qlen; ++djj) + { + fprintf(gfp1, "%-4d", score[dii][djj]); + fprintf(gfp2, "%-4d", ins[dii][djj]); + fprintf(gfp3, "%-4d", del[dii][djj]); + } + fprintf(gfp1, "\n"); + fprintf(gfp2, "\n"); + fprintf(gfp3, "\n"); + } +#endif free(eh); free(qp); free(qmem); if (_qle) *_qle = max_j + 1; diff --git a/ksw_extend2_avx2_u8.c b/ksw_extend2_avx2_u8.c index 9311c19..5aada2a 100644 --- a/ksw_extend2_avx2_u8.c +++ b/ksw_extend2_avx2_u8.c @@ -20,6 +20,8 @@ #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define SIMD_WIDTH 32 +typedef struct { size_t m; uint8_t *addr; } buf_t; + static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { {0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, @@ -204,13 +206,14 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长 int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 int *_gtle, // query全部匹配上的target的长度 int *_gscore, // query的端到端匹配得分 - int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 + int *_max_off, // 取得最大得分时在query和reference上位置差的 最大值 + buf_t *buf) // 之前已经开辟过的缓存 { uint8_t *mA,*hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M uint8_t *seq, *ref; uint8_t *mem, *qtmem, *vmem; int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; - int i, iStart, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; int Dloop = tlen + qlen; // 循环跳出条件 int span, beg1, end1; // 边界条件计算 int col_size = qlen + 2 + SIMD_WIDTH; @@ -222,7 +225,13 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长 assert(h0 > 0); // allocate memory - mem = malloc(mem_size); + //mem = malloc(mem_size); + if (buf->m < mem_size) { + buf->m = mem_size; + buf->addr = realloc(buf->addr, mem_size); + } + mem = buf->addr; + qtmem = &mem[0]; seq=(uint8_t*)&qtmem[0]; ref=(uint8_t*)&qtmem[seq_size]; if (is_left) { @@ -272,7 +281,13 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长 int m_last=0; int iend; - +#ifdef KSW_EQUAL + int midx = 1, icheck = 0, checkspecial = 1; + int m3 = 0, m2 = 0, m1 = 0; + //int marr[10] = {0}; + // int marr[b]; + // memset(marr, 0, 4 * b); +#endif for (D = 1; LIKELY(D < Dloop); ++D) { // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 if (D > tlen) { @@ -290,7 +305,7 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长 iend = D - (beg - 1); // ref开始计算的位置,倒序 span = end - beg; - iStart = iend - span - 1; // 0开始的ref索引位置 + ibeg = iend - span - 1; // 0开始的ref索引位置 // 每一轮需要记录的数据 int m = 0, mj = -1, mi = -1; @@ -298,10 +313,13 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长 // 要处理边界 // 左边界 处理f (insert) - if (iStart == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); } + if (ibeg == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); } // 上边界 if (beg == 1) { hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); } - else { hA1[beg-1] = 0; eA1[beg-1] = 0; } + else if (D & 1) { + hA1[beg - 1] = 0; + hA2[beg - 1] = 0; + } for (j=beg, i=iend; j<=end+1-SIMD_WIDTH; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { // 取数据 @@ -329,11 +347,65 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长 SIMD_FIND_MAX; +#ifdef KSW_EQUAL +#if 0 + if (hA1[0] < b && checkspecial) { + int mi; + if (hA1[0] == b - 1) { + icheck = iend + 1; + } + for (mi = 0; mi < b - 1; ++mi) { + if (midx - mi > 0) + marr[mi] = MAX(marr[mi], hA2[midx - mi]); + } + midx += 1; + if (ibeg > icheck) + { + int stopCalc = 0; + for (mi = 0; mi < b - 1; ++mi) + { + stopCalc |= !marr[mi]; + } + if (stopCalc) + break; + else + checkspecial = 0; + } + } +#else + if (hA1[0] < 4 && checkspecial) + { // b == 4 + if (hA1[0] == 3) + { + icheck = iend + 1; + } + else if (midx == 2) + { + m2 = MAX(m2, hA2[midx - 1]); + } + else + { + m2 = MAX(m2, hA2[midx - 1]); + m1 = MAX(m1, hA2[midx - 2]); + } + m3 = MAX(m3, hA2[midx]); + midx += 1; + if (ibeg > icheck) + { + if (!m1 || !m2 || !m3) + break; + else + checkspecial = 0; + } + } +#endif +#endif + // 注意最后跳出循环j的值 j = end + 1; if (j == qlen + 1) { - max_ie = gscore > hA2[qlen] ? max_ie : iStart; + max_ie = gscore > hA2[qlen] ? max_ie : ibeg; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; } if (m == 0 && m_last==0) break; // 一定要注意,斜对角遍历和按列遍历的不同点 @@ -341,6 +413,10 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长 max = m, max_i = mi, max_j = mj; max_off = max_off > abs(mj - mi)? max_off : abs(mj - mi); } + else if (m == max && max_i >= mi && mj > max_j) { + max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); + } else if (zdrop > 0) { if (mi - max_i > mj - max_j) { if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break; @@ -360,7 +436,7 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长 SWAP_DATA_POINTER; } - free(mem); +// free(mem); if (_qle) *_qle = max_j + 1; if (_tle) *_tle = max_i + 1; if (_gtle) *_gtle = max_ie + 1; diff --git a/out.sam b/out.sam deleted file mode 100644 index 7a61bcc..0000000 --- a/out.sam +++ /dev/null @@ -1,108 +0,0 @@ -@SQ SN:1 LN:249250621 -@SQ SN:2 LN:243199373 -@SQ SN:3 LN:198022430 -@SQ SN:4 LN:191154276 -@SQ SN:5 LN:180915260 -@SQ SN:6 LN:171115067 -@SQ SN:7 LN:159138663 -@SQ SN:8 LN:146364022 -@SQ SN:9 LN:141213431 -@SQ SN:10 LN:135534747 -@SQ SN:11 LN:135006516 -@SQ SN:12 LN:133851895 -@SQ SN:13 LN:115169878 -@SQ SN:14 LN:107349540 -@SQ SN:15 LN:102531392 -@SQ SN:16 LN:90354753 -@SQ SN:17 LN:81195210 -@SQ SN:18 LN:78077248 -@SQ SN:19 LN:59128983 -@SQ SN:20 LN:63025520 -@SQ SN:21 LN:48129895 -@SQ SN:22 LN:51304566 -@SQ SN:X LN:155270560 -@SQ SN:Y LN:59373566 -@SQ SN:MT LN:16569 -@SQ SN:GL000207.1 LN:4262 -@SQ SN:GL000226.1 LN:15008 -@SQ SN:GL000229.1 LN:19913 -@SQ SN:GL000231.1 LN:27386 -@SQ SN:GL000210.1 LN:27682 -@SQ SN:GL000239.1 LN:33824 -@SQ SN:GL000235.1 LN:34474 -@SQ SN:GL000201.1 LN:36148 -@SQ SN:GL000247.1 LN:36422 -@SQ SN:GL000245.1 LN:36651 -@SQ SN:GL000197.1 LN:37175 -@SQ SN:GL000203.1 LN:37498 -@SQ SN:GL000246.1 LN:38154 -@SQ SN:GL000249.1 LN:38502 -@SQ SN:GL000196.1 LN:38914 -@SQ SN:GL000248.1 LN:39786 -@SQ SN:GL000244.1 LN:39929 -@SQ SN:GL000238.1 LN:39939 -@SQ SN:GL000202.1 LN:40103 -@SQ SN:GL000234.1 LN:40531 -@SQ SN:GL000232.1 LN:40652 -@SQ SN:GL000206.1 LN:41001 -@SQ SN:GL000240.1 LN:41933 -@SQ SN:GL000236.1 LN:41934 -@SQ SN:GL000241.1 LN:42152 -@SQ SN:GL000243.1 LN:43341 -@SQ SN:GL000242.1 LN:43523 -@SQ SN:GL000230.1 LN:43691 -@SQ SN:GL000237.1 LN:45867 -@SQ SN:GL000233.1 LN:45941 -@SQ SN:GL000204.1 LN:81310 -@SQ SN:GL000198.1 LN:90085 -@SQ SN:GL000208.1 LN:92689 -@SQ SN:GL000191.1 LN:106433 -@SQ SN:GL000227.1 LN:128374 -@SQ SN:GL000228.1 LN:129120 -@SQ SN:GL000214.1 LN:137718 -@SQ SN:GL000221.1 LN:155397 -@SQ SN:GL000209.1 LN:159169 -@SQ SN:GL000218.1 LN:161147 -@SQ SN:GL000220.1 LN:161802 -@SQ SN:GL000213.1 LN:164239 -@SQ SN:GL000211.1 LN:166566 -@SQ SN:GL000199.1 LN:169874 -@SQ SN:GL000217.1 LN:172149 -@SQ SN:GL000216.1 LN:172294 -@SQ SN:GL000215.1 LN:172545 -@SQ SN:GL000205.1 LN:174588 -@SQ SN:GL000219.1 LN:179198 -@SQ SN:GL000224.1 LN:179693 -@SQ SN:GL000223.1 LN:180455 -@SQ SN:GL000195.1 LN:182896 -@SQ SN:GL000212.1 LN:186858 -@SQ SN:GL000222.1 LN:186861 -@SQ SN:GL000200.1 LN:187035 -@SQ SN:GL000193.1 LN:189789 -@SQ SN:GL000194.1 LN:191469 -@SQ SN:GL000225.1 LN:211173 -@SQ SN:GL000192.1 LN:547496 -@SQ SN:NC_007605 LN:171823 -@SQ SN:hs37d5 LN:35477943 -@HD VN:1.5 SO:unsorted GO:query -@RG ID:normal SM:normal PL:illumina LB:normal PG:bwa -@PG ID:bwa PN:bwa VN:0.7.17-r1198-dirty CL:./bwa mem -t 1 -M -R @RG\tID:normal\tSM:normal\tPL:illumina\tLB:normal\tPG:bwa /home/zzh/reference/human_g1k_v37_decoy.fasta /home/zzh/fastq/diff_r1.fq /home/zzh/fastq/diff_r2.fq -o ./out.sam -A00289:748:HLCH3DSX3:4:1102:24749:34929 97 16 75642166 60 150M 12 66451372 0 AAAGCCATTGGCAGTAACATCCAAAGGCTGCTCAGGAACTGCACTCCAGCTGATGGCTATGAAAAGATAAGATTCTAGGGTTAAAACTGCTCAATTTCTTGTGAAAGTCAGAAGTATTTTAATAAAAATATTTCTCTGACTCTACAGACC FFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFF: NM:i:0 MD:Z:150 MC:Z:17S91M42S AS:i:150 XS:i:0 RG:Z:normal -A00289:748:HLCH3DSX3:4:1102:24749:34929 145 12 66451372 0 17S91M42S 16 75642166 0 TTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATAAATAATTCTTTGTGAAACCCCT F,F,FFF,:FFF,,F,F,,F,F:,FFF:F:FFF:FFFFFFF:F,F,F,FFFFFF::,,FFFFFFF,:F,:,F,::,,F:FF:,FF:F::F::FFFFFFFFFF,F:F,FFFF,FFFFF,FF,F,,F,F,,,,,,:,,,,,,,,,,,,,,,F NM:i:0 MD:Z:91 MC:Z:150M AS:i:91 XS:i:90 RG:Z:normal SA:Z:12,49106363,+,117S33M,0,0; XA:Z:12,-66451373,35S90M25S,0;12,-66451373,28S90M32S,0;6,-160521757,23S83M44S,0;15,-77910871,23S79M48S,0; -A00289:748:HLCH3DSX3:4:1102:24749:34929 385 12 49106363 0 117H33M 16 75642166 0 AAAAAAAAAAAAAAATAAAAAAAAAAAAAAAAA FFF:F:FFF,:F,F,,F,F,,FFF:,FFF,F,F NM:i:0 MD:Z:33 MC:Z:150M AS:i:33 XS:i:32 RG:Z:normal SA:Z:12,66451372,-,17S91M42S,0,0; XA:Z:4,-19079503,32M118S,0;15,-55218248,32M118S,0;X,-149551159,32M118S,0; -A00289:748:HLCH3DSX3:4:1104:6234:8656 113 12 66451373 8 12S91M47S 22 18873826 0 TTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTTGTTTTTTTTTTTTTTGGGGGGGGGGGGGGGTGAGGTGGTCTGTG ,,:FFF:,:,::,,,,,,FF,FFF,,FF,F,FFF:,,:FFFFFF:FFF,:,:FF:F:::::FFF::,::F:FFFFFF:,FFFF,,,:FFFFFFFF:,,::,,,,,:,,FFF:,,,FF:F,:,FFFF:F::,,FF:,:FFFFFF:FFFFFF NM:i:1 MD:Z:13T77 MC:Z:150M AS:i:86 XS:i:81 RG:Z:normal SA:Z:3,121668661,-,89S32M29S,0,0; XA:Z:12,-66451372,25S91M34S,2;6,-160521757,19S84M47S,1;15,-77910874,26S77M47S,0;7,-107410599,25M1I3M15D74M47S,16; -A00289:748:HLCH3DSX3:4:1104:6234:8656 369 3 121668661 0 89H32M29H 22 18873826 0 TTTTTTTTTTTTTGTTTGTTTTTTTTTTTTTT FFFFFF:,,::,,,,,:,,FFF:,,,FF:F,: NM:i:0 MD:Z:32 MC:Z:150M AS:i:32 XS:i:32 RG:Z:normal SA:Z:12,66451373,-,12S91M47S,8,1; -A00289:748:HLCH3DSX3:4:1104:6234:8656 177 22 18873826 22 150M 12 66451373 0 CTATACCCCACATCCTCATTCCCACCAGATGTCTTGGATCATGGAGGCTCTCCAGACAAAAGCCAGCAGTTAAGCTCCAGATTTCCTATAGAATCCTTTTCTAACAACCAGTGAGTGATTCCAGAATACGTACCATTGAATGTGCTCCCT FFF:FFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:12S91M47S AS:i:150 XS:i:140 RG:Z:normal -A00289:748:HLCH3DSX3:4:1104:1741:18129 97 11 113220686 60 150M 12 66451373 0 TATTTCACCCGGATGCTGACTCCTTTTTTGGTTTCCTTGCAAGGTTACCCTGTTGGGATGTTGTTAGTTGGAGGACAGCTACCTCTGAGGTTATTATGGTTTTTTGCAGATTATTGGAAGCGCTGGTGTCATTTCTTGATTTCTCGGATA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFF,FFFFFFFFFFFFFFFFFF:FFFFFFF:FFFF NM:i:0 MD:Z:150 MC:Z:90M60S AS:i:150 XS:i:19 RG:Z:normal -A00289:748:HLCH3DSX3:4:1104:1741:18129 145 12 66451373 7 90M60S 11 113220686 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTCCCAC FF:FFFF,F,,FFFFFFFF:FFFF:,F,,:F:F::::,F::::FFF::F::,,:FFF,:::FFFFFFFFFFF:FFFFFFFF,FFFF:FFF,:,:,,::FFFFFFFFFFF,FF:FFF:F:F:F:,F,,::F,,F,,FFFFF:F:,,,,,,, NM:i:1 MD:Z:81T8 MC:Z:150M AS:i:85 XS:i:81 RG:Z:normal SA:Z:8,112168232,-,79S59M12S,0,1; XA:Z:6,-160521757,81M69S,0;4,-82515382,13S69M68S,0; -A00289:748:HLCH3DSX3:4:1104:1741:18129 401 8 112168232 0 79H59M12H 11 113220686 0 TTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTT FF,FFFF:FFF,:,:,,::FFFFFFFFFFF,FF:FFF:F:F:F:,F,,::F,,F,,FFF NM:i:1 MD:Z:46T12 MC:Z:150M AS:i:54 XS:i:53 RG:Z:normal SA:Z:12,66451373,-,90M60S,7,1; -A00289:748:HLCH3DSX3:4:1105:27281:29700 113 X 117117442 60 150M 12 66451372 0 CCCTGAAGTTGATTTTTTGAATATATCCTTGAAATCGGCTTGTTTTAATGACAATTGCCTCCGTCTCCTGAAAGTGGAGTGCAAAAAGAAACCTGGTTAATTGTAGTTTTATTATATTTAGTTCAAATTTTTCAAAGCATACCCTGACAG FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:12S89M49S AS:i:150 XS:i:0 RG:Z:normal -A00289:748:HLCH3DSX3:4:1105:27281:29700 177 12 66451372 0 12S89M49S X 117117442 0 TTTTTTAAAAATATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTATTTATTTTTTTATTTTTTTATTATTTTTTTTTTTTTTTTTTTTTTTTTGGTGCCGCCCCC ,,,,:,,F:F,,,,:FF,F:FF,::F:F:F,,F::FFF,FFFFFFFFF,FFFFFF::FFF:F:FFFFFFF,FFFFFFF::FFFF:,:FF:,,F,:,FFFFF,:,,:F,,,,:,FFFFF:F,,FFF:FFFFF:,F,F,,,:,,,,,:FFFF NM:i:3 MD:Z:73T3T3T7 MC:Z:150M AS:i:74 XS:i:72 RG:Z:normal SA:Z:15,85942734,+,20S48M82S,0,1; -A00289:748:HLCH3DSX3:4:1105:27281:29700 417 15 85942734 0 20H48M82H X 117117442 0 AAAAAAAAAAAAAAAAATAATAAAAAAATAAAAAAATAAATAAATAAA FFFF:FFF,,F:FFFFF,:,,,,F:,,:,FFFFF,:,F,,:FF:,:FF NM:i:1 MD:Z:17A30 MC:Z:150M AS:i:43 XS:i:36 RG:Z:normal SA:Z:12,66451372,-,12S89M49S,0,3; XA:Z:11,-118049564,69S35M7D34M12S,11;2,+204499190,12S31M1D21M86S,3; -A00289:748:HLCH3DSX3:4:1106:2880:19288 97 4 169630630 0 29M1I4M1I68M9D22M25S 12 66451373 0 CAGCCTGGGTGACAGAGTGAGACTCCAGCTAAGGCAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAATGAAGGAAGGAAGGAGGGAGGGAGGGAGGGAGGGAGAGATCGGAAGAGCACACGTCTGAAC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFF:FFF:FFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFF NM:i:14 MD:Z:18C8T59G13^AAGGAAGGA22 MC:Z:36S91M23S AS:i:73 XS:i:72 RG:Z:normal -A00289:748:HLCH3DSX3:4:1106:2880:19288 145 12 66451373 9 36S91M23S 4 169630630 0 TATATACTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTGGGTGGGTGGGTGGGTTGGCAG :,,,,,,,:FF:FFFF:,:FF:,F,F,FFF,,,FFF:FFFFFFFFF,:,,F:F,FFFF,:FFFFFFFFFFFFFFFFFFFF:FF:FFFF:FFFFFFF:FFF:FFFFFFF:FF,FFF:,:::,:F::,,:,,:,:,,,,,:,::,,,:F,FF NM:i:1 MD:Z:10T80 MC:Z:29M1I4M1I68M9D22M25S AS:i:86 XS:i:80 RG:Z:normal SA:Z:15,55218228,-,9S52M89S,0,0; XA:Z:15,-77910871,47S80M23S,0;6,-160521761,47S80M23S,0;6,-160521757,47S79M24S,0;7,-107410599,25S28M15D74M23S,16; -A00289:748:HLCH3DSX3:4:1106:2880:19288 401 15 55218228 0 9H52M89H 4 169630630 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTT FF:FFFF:,:FF:,F,F,FFF,,,FFF:FFFFFFFFF,:,,F:F,FFFF,:F NM:i:0 MD:Z:52 MC:Z:29M1I4M1I68M9D22M25H AS:i:52 XS:i:50 RG:Z:normal SA:Z:12,66451373,-,36S91M23S,9,1; -A00289:748:HLCH3DSX3:4:1107:2365:20462 97 22 18737670 0 150M 12 66451372 0 GAGGACACCATACAAGCCGTCGCTAACAAGGACACTGTACACAACATCGCTAATGACGGCACCGTACAAGACATCACCAATGAGGGCGCTTTATACGACATTGCTAATGATACCGACAAGGCACGCTAACGTGGACGCTGTACACGACAT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:21S90M39S AS:i:150 XS:i:150 RG:Z:normal -A00289:748:HLCH3DSX3:4:1107:2365:20462 145 12 66451372 13 21S90M39S 22 18737670 0 TTTTATATATTTATATTTATTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTTGTTTTTTGTGGTGGGGGGGGGGTTTGGCATAGCAT F:F,,F,F,F:FFF,,,:,F:,:,,:,,,,:,F:,,FFFFFFFFFFFFF,F:FFFFFFFFF:FFFF,,FFFFF:FFF::FFFFF,F:F,,F:F,FFFFFFFF,:F:F::::,,:F,F,,F,:,,,,,::,,,,:,,:,,,,,,F,F::FF NM:i:0 MD:Z:90 MC:Z:150M AS:i:90 XS:i:84 RG:Z:normal XA:Z:6,-160521757,28S84M38S,0;15,-77910871,32S80M38S,0;7,-107410642,38S74M38S,0; -A00289:748:HLCH3DSX3:4:1107:26377:28385 113 GL000220.1 122227 60 150M 6 160521757 0 TGCTCTGTTGCTCACGCTGGTCTCAAACTCCTGGCCTTGACGCTTCTCCCGTCACATCCGCCGTCTGGTTGTTGAAATGAGCATCTCTCGTAAAATGGAAAAGATGAAAGAAATAAACACGAAGACGGAAAGCACGGTGTGAACGTTTCT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:85M65S AS:i:150 XS:i:113 RG:Z:normal -A00289:748:HLCH3DSX3:4:1107:26377:28385 177 6 160521757 0 85M65S GL000220.1 122227 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGTGTTTTTTTTTTGTTGGGGGTTGGGTGTTGGGGGGCGGGGGGGGGGGCGCCCCCCCCCAAGGAGA F:FF,F,,F,F,FFFFFF,:FF,:FF:F,:FF,FFFF:FFF:FF,FFFFFFFFFFFFFFFFFFFFFFFF:FF,F,F:F,:FF,,,FFFF,F,:,,,,,:::,,:,,,,::,,,,,,,,,F,,FF,:F:,,F,,,,,FF:FF:F,FFFF:: NM:i:0 MD:Z:85 MC:Z:150M AS:i:85 XS:i:84 RG:Z:normal XA:Z:12,-66451380,84M66S,0;12,-66451373,83M67S,0;15,-77910871,4S80M66S,0; diff --git a/pemerge.c b/pemerge.c index 725885f..197e87f 100644 --- a/pemerge.c +++ b/pemerge.c @@ -216,9 +216,9 @@ static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cn int main_pemerge(int argc, char *argv[]) { - int c, flag = 0, i, n, min_ovlp = 10; - int64_t cnt[MAX_ERR+1]; - bseq1_t *bseq; + int c, flag = 0, i, n, m = 0, min_ovlp = 10; + int64_t cnt[MAX_ERR + 1], seq_size = 0; + bseq1_t *bseq = 0; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; pem_opt_t *opt; @@ -269,10 +269,12 @@ int main_pemerge(int argc, char *argv[]) } memset(cnt, 0, 8 * (MAX_ERR+1)); - while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2, 1, &seq_size, &m, &bseq); + while (n > 0) { process_seqs(opt, n, bseq, cnt); - free(bseq); + bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2, 1, &seq_size, &m, &bseq); } + if (bseq != 0) free(bseq); fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); for (i = 1; i <= MAX_ERR; ++i) diff --git a/run.sh b/run.sh index c8da9d1..8adaa60 100755 --- a/run.sh +++ b/run.sh @@ -1,6 +1,14 @@ -thread=1 -n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq -n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq +thread=32 +n_r1=~/fastq/ZY2105177532213010_L4_1.fq +n_r2=~/fastq/ZY2105177532213010_L4_2.fq +#n_r1=~/fastq/na12878/nas_1.fq +#n_r2=~/fastq/na12878/nas_2.fq +#n_r1=~/fastq/na12878/na_1.fq +#n_r2=~/fastq/na12878/na_2.fq +#n_r1=~/fastq/n_r1.fq +#n_r2=~/fastq/n_r2.fq +#n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq +#n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq #n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq #n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq #reference=~/data/reference/human_g1k_v37_decoy.fasta @@ -8,6 +16,8 @@ n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq #n_r2=~/fastq/sn_r2.fq #n_r1=~/fastq/ssn_r1.fq #n_r2=~/fastq/ssn_r2.fq +#n_r1=~/fastq/ERR194147_1_120w.fastq +#n_r2=~/fastq/ERR194147_2_120w.fastq #n_r1=~/fastq/tiny_n_r1.fq #n_r2=~/fastq/tiny_n_r2.fq #n_r1=~/fastq/diff_r1.fq @@ -15,9 +25,12 @@ n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq #n_r1=~/fastq/d_r1.fq #n_r2=~/fastq/d_r2.fq reference=~/reference/human_g1k_v37_decoy.fasta -#out=./ssn.sam +#out=./all.sam +#out=./sn.sam +#out=./ssn-x1.sam #out=./out.sam out=/dev/null +#out=./na12878.sam #time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ # /home/zzh/data/reference/human_g1k_v37_decoy.fasta \ # /home/zzh/data/fastq/nm1.fq \ @@ -29,7 +42,7 @@ out=/dev/null # /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \ # -o /dev/null -time ./bwa mem -b 64 -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ +time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ $reference \ $n_r1 \ $n_r2 \ diff --git a/utils.h b/utils.h index b54ffe0..764cc45 100644 --- a/utils.h +++ b/utils.h @@ -46,12 +46,14 @@ extern int64_t time_ksw_extend2, time_bwt_sa, time_bwt_sa_read, time_bns, - time_core_process; + time_work1, + time_work2, + time_flt_chain; extern int64_t dn, n16, n17, n18, n19, nall, num_sa; extern int64_t s1n, s2n, s3n, s1l, s2l, s3l; extern int64_t g_num_smem2; -extern FILE *fp1; +extern FILE *gfp1, *gfp2, *gfp3; #endif @@ -83,6 +85,8 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; +typedef struct { size_t m; uint8_t *addr; } buf_t; + #ifdef __cplusplus extern "C" { #endif