Switched to batch-mode processing; shows a measurable improvement on NA12878

This commit is contained in:
zzh 2024-03-09 11:39:40 +08:00
parent 7d085962a2
commit 856a0e0c01
16 changed files with 792 additions and 50 deletions

1
.gitignore vendored
View File

@ -1,6 +1,7 @@
*.[oa]
*.txt
*.sam
*.log
bwa
test
test64

View File

@ -3,7 +3,7 @@ CC= gcc
# CFLAGS= -g -Wall -Wno-unused-function -O2
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= #-DSHOW_PERF
SHOW_PERF= -DSHOW_PERF
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) -DUSE_AVX2 -DKSW_EQUAL #-DUSE_MT_READ #-DFILTER_FULL_MATCH
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \

12
bwa.sh 100755
View File

@ -0,0 +1,12 @@
#!/bin/bash
# Benchmark wrapper: time this bwa build on the NA12878 paired-end dataset.
thread=32
n_r1=~/fastq/na12878/na12878_r1.fq
n_r2=~/fastq/na12878/na12878_r2.fq
reference=~/reference/human_g1k_v37_decoy.fasta
out=/dev/null  # discard SAM output; only the runtime matters here
# Single-quote the read-group string so the literal \t reaches bwa unmangled,
# and quote path variables so they survive whitespace.
time /home/zzh/work/bwa/bwa mem -t "$thread" -M -R '@RG\tID:normal\tSM:normal\tPL:illumina\tLB:normal\tPG:bwa' \
    "$reference" \
    "$n_r1" \
    "$n_r2" \
    -o "$out"

418
bwamem.c
View File

@ -149,8 +149,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len,
}
}
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_seed_1, tmp_time);
a->time_seed_1 += realtime_msec() - tmp_time;
tmp_time = realtime_msec();
#endif
@ -174,8 +173,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len,
}
}
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_seed_2, tmp_time);
a->time_seed_2 = realtime_msec() - tmp_time;
tmp_time = realtime_msec();
#endif
// 3. third pass: LAST-like
@ -192,8 +190,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len,
}
}
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_seed_3, tmp_time);
a->time_seed_3 += realtime_msec() - tmp_time;
#endif
#ifdef FILTER_FULL_MATCH
@ -847,8 +844,7 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
a->score = ksw_extend2_avx2(s->qbeg, query, tmp, rseq, 1, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0], aux->sw_buf);
#endif
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_ksw_extend2, tmp_time);
aux->time_bsw += realtime_msec() - tmp_time;
#endif
if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
@ -888,8 +884,7 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
a->score = ksw_extend2_avx2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 0, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1], aux->sw_buf);
#endif
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_ksw_extend2, tmp_time);
aux->time_bsw += realtime_msec() - tmp_time;
#endif
if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
@ -1338,6 +1333,7 @@ static smem_aux_t *smem_aux_init()
a->tmpv[1] = calloc(1, sizeof(bwtintv_v));
a->sw_buf = calloc(1, sizeof(buf_t));
a->seq_buf = calloc(1, sizeof(buf_t));
a->time_seed_1 = a->time_seed_2 = a->time_seed_3 = a->time_bsw = a->time_sa = 0;
return a;
}
@ -1352,50 +1348,414 @@ static void smem_aux_destroy(smem_aux_t *a)
// Initialize the per-thread working data
mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac)
{
int i;
int i = opt->n_threads, j;
mem_worker_t *w = calloc(1, sizeof(mem_worker_t));
w->opt = opt; w->bns = bns; w->pac = pac; w->fmt = fmt;
w->aux = malloc(opt->n_threads * sizeof(smem_aux_t));
for (i = 0; i < opt->n_threads; ++i)
w->calc_isize = 0; w->n = 0; w->regs = 0;
w->aux = malloc(i * sizeof(smem_aux_t));
w->smem_arr = malloc(i * sizeof(smem_v *));
w->chain_arr = malloc(i * sizeof(mem_chain_v *));
w->isize_arr = malloc(i * sizeof(uint64_v *));
for (i = 0; i < opt->n_threads; ++i) {
w->aux[i] = smem_aux_init();
w->smem_arr[i] = malloc(opt->batch_size * sizeof(smem_v));
w->chain_arr[i] = malloc(opt->batch_size * sizeof(mem_chain_v));
w->isize_arr[i] = calloc(4, sizeof(uint64_v));
for (j = 0; j < opt->batch_size; ++j) {
kv_init(w->smem_arr[i][j].mem);
kv_init(w->smem_arr[i][j].pos_arr);
kv_init(w->chain_arr[i][j]);
}
}
return w;
}
// Seed collection for one read (batch-mode variant of mem_collect_intv):
// gathers super-maximal exact matches (SMEMs) into smem->mem in three passes,
// then sorts them. `a` provides per-thread scratch buffers and perf counters.
static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a)
{
	int i, k, x = 0, old_n;
	int start_width = 1;
	// a long SMEM is re-seeded when it is at least split_factor times the minimum seed length
	int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
	int max_seed_len = 0;
	// start_N_num counts leading ambiguous bases (code >= 4) so that a read whose
	// non-N tail matches end-to-end can be detected by the FILTER_FULL_MATCH check
	int start_N_num = 0, start_flag = 1;
	smem->mem.n = 0; // reuse the caller-owned vector; capacity persists across reads
#ifdef SHOW_PERF
	int64_t tmp_time = realtime_msec();
#endif
	// 1. first pass: find all SMEMs
	while (x < len) {
		if (seq[x] < 4) { // a regular base (2-bit encoded A/C/G/T)
			start_flag = 0;
			x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, &a->mem1, a->tmpv[0]);
			for (i = 0; i < a->mem1.n; ++i) {
				bwtintv_t *p = &a->mem1.a[i];
				int slen = (uint32_t)p->info - (p->info >> 32); // seed length
				max_seed_len = MAX(max_seed_len, slen);
				if (slen >= opt->min_seed_len) {
					kv_push(bwtintv_t, smem->mem, *p);
				}
			}
			//break; // for test full match time
		} else { // ambiguous base: skip it
			++x;
			if (start_flag) ++start_N_num;
		}
	}
#ifdef SHOW_PERF
	a->time_seed_1 += realtime_msec() - tmp_time;
	tmp_time = realtime_msec();
#endif
#ifdef FILTER_FULL_MATCH
	// the longest seed spans the whole (non-N-prefixed) read: skip passes 2 and 3
	if (max_seed_len == len - start_N_num) goto collect_intv_end;
#endif
	// 2. second pass: find MEMs inside a long SMEM
	old_n = smem->mem.n;
	for (k = 0; k < old_n; ++k) {
		bwtintv_t *p = &smem->mem.a[k];
		int start = p->info>>32, end = (int32_t)p->info;
		if (end - start < split_len || p->x[2] > opt->split_width) continue;
		// re-seed from the middle of the long SMEM with a wider starting interval
		fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, &a->mem1, a->tmpv[0]);
		for (i = 0; i < a->mem1.n; ++i) {
			bwtintv_t *p = &a->mem1.a[i]; // note: shadows the outer p (not used again this iteration)
			int slen = (uint32_t)p->info - (p->info >> 32);
			if (slen >= opt->min_seed_len) {
				// kv_push may realloc smem->mem.a; the outer p is re-derived next iteration
				kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
			}
		}
	}
#ifdef SHOW_PERF
	a->time_seed_2 += realtime_msec() - tmp_time;
	tmp_time = realtime_msec();
#endif
	// 3. third pass: LAST-like
	if (opt->max_mem_intv > 0) {
		x = 0;
		while (x < len) {
			if (seq[x] < 4) {
				bwtintv_t m;
				x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
				if (m.x[2] > 0) {
					kv_push(bwtintv_t, smem->mem, m);
				}
			} else ++x;
		}
	}
#ifdef SHOW_PERF
	a->time_seed_3 += realtime_msec() - tmp_time;
#endif
#ifdef FILTER_FULL_MATCH
collect_intv_end:
#endif
	// sort seeds by interval (downstream chaining relies on this order)
	ks_introsort(mem_intv, smem->mem.n, smem->mem.a);
}
// Seed one read: collect SMEMs via mem_collect_intv_batch(), then resolve each
// seed into candidate reference positions appended to smemv->pos_arr.
// Seeds that already carry a resolved match (num_match > 0) use the stored
// position (flipped to forward-reverse coordinates when reversed); otherwise
// up to opt->max_occ occurrences are sampled from the BWT interval via
// suffix-array lookups. generate_chain() later consumes pos_arr in the same order.
void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv)
{
	int i;
	if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
	mem_collect_intv_batch(opt, fmt, len, seq, smemv, aux);
	smemv->pos_arr.n = 0; // reuse the vector; one entry per sampled seed position
	for (i=0; i<smemv->mem.n; ++i) {
		bwtintv_t *p = &smemv->mem.a[i];
		int step, count; // sampling stride and number of occurrences sampled
		int64_t k;
		if (p->num_match > 0) { // position already resolved for this seed
			uint64_t pos = p->rm[0].rs;
			if (p->rm[0].reverse)
				pos = (fmt->l_pac << 1) - 1 - pos; // map onto the forward-reverse coordinate
			//pos |= 1LL << 63;
			kv_push(uint64_t, smemv->pos_arr, pos);
		} else {
			// sample at most max_occ occurrences, evenly spread over the interval
			step = p->x[2] > opt->max_occ ? p->x[2] / opt->max_occ : 1;
			for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count)
			{
#ifdef SHOW_PERF
				int64_t tmp_time = realtime_msec();
#endif
				uint64_t pos = fmt_sa_offset(fmt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
#ifdef SHOW_PERF
				aux->time_sa += realtime_msec() - tmp_time;
#endif
				kv_push(uint64_t, smemv->pos_arr, pos);
			}
		}
	}
}
// Cluster the seeds of one read into chains (output in `chain`). Reference
// positions come from smemv->pos_arr, filled by find_smem() in the same seed
// order; packed entries are finalized here with bwt_get_sa(). Chains are
// accumulated in a B-tree keyed by reference position, then flushed into the
// output vector. frac_rep records the fraction of the read covered by
// over-represented (repetitive) seeds.
void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, int len, const uint8_t *seq, smem_v *smemv, mem_chain_v *chain)
{
	int i, j, b, e, l_rep;
	int64_t l_pac = bns->l_pac;
	kbtree_t(chn) * tree;
	chain->n = 0;
	if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
	tree = kb_init(chn, KB_DEFAULT_SIZE);
	// Try to merge seed `s` (s.rbeg already set) into an existing chain in the
	// B-tree, or insert it as a new single-seed chain. Unhygienic macro: reads
	// `s`, `p`, `slen`, `tree`, `bns`, `opt`, `l_pac` from the enclosing scope
	// and `continue`s the enclosing loop when the seed maps to no valid rid.
#define CHECK_ADD_CHAIN(tmp, lower, upper) \
	int rid, to_add = 0; \
	mem_chain_t tmp, *lower, *upper; \
	tmp.pos = s.rbeg; \
	s.qbeg = p->info >> 32; \
	s.score = s.len = slen; \
	rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); \
	if (rid < 0) \
		continue; \
	if (kb_size(tree)) \
	{ \
		kb_intervalp(chn, tree, &tmp, &lower, &upper); \
		if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) \
			to_add = 1; \
	} \
	else \
		to_add = 1; \
	if (to_add) \
	{ \
		tmp.n = 1; \
		tmp.m = 4; \
		tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); \
		tmp.seeds[0] = s; \
		tmp.rid = rid; \
		tmp.is_alt = !!bns->anns[rid].is_alt; \
		kb_putp(chn, tree, &tmp); \
	}
	for (i = 0, b = e = l_rep = 0; i < smemv->mem.n; ++i) { // compute frac_rep
		bwtintv_t *p = &smemv->mem.a[i];
		int sb = (p->info >> 32), se = (uint32_t)p->info;
		if (p->x[2] <= opt->max_occ) continue; // only over-represented seeds count as repetitive
		if (sb > e) l_rep += e - b, b = sb, e = se;
		else e = e > se ? e : se;
	}
	l_rep += e - b;
	// j walks pos_arr in lock-step with the seed sampling order used by find_smem()
	for (i = 0, j = 0; i < smemv->mem.n; ++i) {
		bwtintv_t *p = &smemv->mem.a[i];
		int step, count, slen = (uint32_t)p->info - (p->info >> 32); // seed length
		int64_t k;
		if (p->num_match > 0) {
			mem_seed_t s;
			s.rbeg = smemv->pos_arr.a[j++];
			CHECK_ADD_CHAIN(tmp, lower, upper);
		} else {
			step = p->x[2] > opt->max_occ ? p->x[2] / opt->max_occ : 1;
			for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count)
			{
				mem_seed_t s;
				uint64_t sa = smemv->pos_arr.a[j++];
				// NOTE(review): the entry appears to pack a position offset in the top
				// 16 bits and an SA index in the low 48 — confirm against fmt_sa_offset
				uint64_t sa_idx = sa << 16 >> 16;
				s.rbeg = (sa >> 48) + bwt_get_sa((uint8_t *)fmt->sa, sa_idx);
				CHECK_ADD_CHAIN(tmp, lower, upper);
			}
		}
	}
	if (chain->m < kb_size(tree)) {
		kv_resize(mem_chain_t, *chain, kb_size(tree));
	}
	// flush the B-tree into the output vector in key order
#define traverse_func(p_) (chain->a[chain->n++] = *(p_))
	__kb_traverse(mem_chain_t, tree, traverse_func);
#undef traverse_func
	for (i = 0; i < chain->n; ++i) chain->a[i].frac_rep = (float)l_rep / len;
	if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
	kb_destroy(chn, tree);
}
/* Infer the relative orientation of two mate hits at positions b1 and b2 in
 * the forward-reverse reference of length 2*l_pac, and write the distance
 * between them (on b1's strand) into *dist.
 * Returns an orientation code in [0,3]: bit layout combines strand agreement
 * with whether b2 projects before or after b1. */
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
{
	int same_strand = (b1 >= l_pac) == (b2 >= l_pac);
	/* project b2 onto the strand of b1 */
	int64_t proj = same_strand ? b2 : (l_pac << 1) - 1 - b2;
	if (proj > b1) {
		*dist = proj - b1;
		return same_strand ? 0 : 1;
	}
	*dist = b1 - proj;
	return (same_strand ? 0 : 1) ^ 3;
}
/* Return the score of the first alignment in `r` that overlaps the top hit
 * significantly on the query (a "sub-optimal" competitor). If no such hit
 * exists, return the minimum meaningful score: min_seed_len * match score. */
static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
{
	int idx;
	for (idx = 1; idx < r->n; ++idx) {
		/* query-interval overlap between hit idx and the top hit */
		int ov_beg = r->a[0].qb > r->a[idx].qb ? r->a[0].qb : r->a[idx].qb;
		int ov_end = r->a[0].qe < r->a[idx].qe ? r->a[0].qe : r->a[idx].qe;
		if (ov_end <= ov_beg) continue; /* disjoint on the query */
		int len0 = r->a[0].qe - r->a[0].qb;
		int len1 = r->a[idx].qe - r->a[idx].qb;
		int shorter = len0 < len1 ? len0 : len1;
		if (ov_end - ov_beg >= shorter * opt->mask_level)
			return r->a[idx].score; /* significant overlap: this is the sub score */
	}
	return opt->min_seed_len * opt->a;
}
// Main BWA-MEM batch pipeline
/* Core batch pipeline for `nseq` reads: seeding -> chaining -> extension ->
 * (optionally) insert-size sampling. Alignment regions land in reg_arr[i]
 * for read i. aux/smem_arr/chain_arr are per-thread scratch reused across
 * batches; when calc_isize is set, candidate insert sizes are accumulated
 * into isize[orientation] for mem_pestat(). */
void mem_core_process(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac,
	bseq1_t *seq_arr, int nseq, smem_aux_t *aux, smem_v *smem_arr, mem_chain_v *chain_arr, mem_alnreg_v *reg_arr,
	int calc_isize, int64_t l_pac, uint64_v *isize)
{
	int i, j, l_seq;
	mem_chain_v *chnp;
	mem_alnreg_v *regp;
	char *seq;
	// 1. seeding
	for (i = 0; i < nseq; ++i) {
		seq = seq_arr[i].seq;
		l_seq = seq_arr[i].l_seq;
		for (j = 0; j < l_seq; ++j) {
			// encode ASCII bases via nst_nt4_table; values already < 4 pass through
			seq[j] = seq[j] < 4 ? seq[j] : nst_nt4_table[(int)seq[j]];
		}
		find_smem(opt, fmt, l_seq, (uint8_t *)seq, aux, &smem_arr[i]);
	}
	// 2. chain
	for (i=0; i<nseq; ++i) {
		seq = seq_arr[i].seq;
		l_seq = seq_arr[i].l_seq;
		chnp = chain_arr + i;
		generate_chain(opt, fmt, bns, l_seq, (uint8_t*)seq, &smem_arr[i], chnp);
		chnp->n = mem_chain_flt(opt, chnp->n, chnp->a); // drop shadowed/weak chains
		mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chnp->n, chnp->a, aux->seq_buf);
		if (bwa_verbose >= 4) mem_print_chain(bns, chnp);
	}
	// 3. align
	for (i=0; i<nseq; ++i) {
		seq = seq_arr[i].seq;
		l_seq = seq_arr[i].l_seq;
		chnp = chain_arr + i;
		regp = reg_arr + i;
		kv_init(*regp);
#ifdef SHOW_PERF
		int64_t tmp_time = realtime_msec();
#endif
		for(j=0; j<chnp->n; ++j) {
			mem_chain_t *p = &chnp->a[j];
			if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", j);
			mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, regp, aux);
			free(chnp->a[j].seeds);
		}
#ifdef SHOW_PERF
		// NOTE(review): mem_chain2aln also accumulates extension time into the
		// per-thread aux->time_bsw — confirm both counters are intended (double count)
		tmp_time = realtime_msec() - tmp_time;
		__sync_fetch_and_add(&time_ksw_extend2, tmp_time);
#endif
		free(chnp->a); chnp->m = 0; chnp->a = 0; // chains consumed; reset vector for reuse
		regp->n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t *)seq, regp->n, regp->a);
		if (bwa_verbose >= 4) {
			err_printf("* %ld chains remain after removing duplicated chains\n", regp->n);
			for (j = 0; j < regp->n; ++j) {
				mem_alnreg_t *p = &regp->a[j];
				printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
			}
		}
		for (j = 0; j < regp->n; ++j) {
			mem_alnreg_t *p = &regp->a[j];
			// flag hits on ALT contigs so pairing/output treat them specially
			if (p->rid >= 0 && bns->anns[p->rid].is_alt)
				p->is_alt = 1;
		}
	}
#if 1
	// 4. calc insert size
#define MIN_RATIO 0.8
	if (calc_isize) {
		// consider consecutive reads (2i, 2i+1) as a pair; record an insert size
		// only when both ends have a confidently unique best hit on the same chr
		for (i = 0; i < nseq>>1; ++i) {
			int dir;
			int64_t is;
			mem_alnreg_v *r[2];
			r[0] = (mem_alnreg_v*)&reg_arr[i<<1|0];
			r[1] = (mem_alnreg_v*)&reg_arr[i<<1|1];
			if (r[0]->n == 0 || r[1]->n == 0) continue;
			if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
			if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
			if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr
			dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
			if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
		}
	}
#endif
}
// Find SMEMs, generate chains, then align
static void worker_smem_align(void *data, int i, int tid)
{
mem_worker_t *w = (mem_worker_t *)data;
int start = i * w->opt->batch_size;
int end = MIN(start + w->opt->batch_size, w->n_reads);
mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid]);
}
/* kt_for callback for the output stage (batch variant): finalize batch i.
 * Single-end: mark primary hits and emit SAM per read. Paired-end: optionally
 * run batched mate rescue first, then emit each pair (reads 2k/2k+1; the -b
 * option forces batch_size even). Frees every read's regs array.
 * NOTE(review): mem_process_seqs currently calls worker2 and leaves the call
 * to this function commented out — confirm before relying on it. */
static void worker_sam(void *data, int i, int tid)
{
	extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
	extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
	extern void mem_matesw_batch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t *s, mem_alnreg_v *a, int data_size);
	mem_worker_t *w = (mem_worker_t*)data;
	int start = i*w->opt->batch_size;
	int end = MIN(start + w->opt->batch_size, w->n_reads);
	if (!(w->opt->flag&MEM_F_PE)) { // single-end
		for (i=start; i<end; ++i) { // note: reuses the batch-index parameter as the read index
			if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
			mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
			if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]);
			mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
			free(w->regs[i].a);
		}
	} else { // paired-end
		if (!(w->opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
			mem_matesw_batch(w->opt, w->bns, w->pac, w->pes, &w->seqs[start], &w->regs[start], end-start);
		}
		for (i=start; i<end; i+=2) {
			if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i].name);
			mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed+i)>>1, &w->seqs[i], &w->regs[i]);
			free(w->regs[i].a); free(w->regs[i+1].a);
		}
	}
}
void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0)
{
extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);
mem_pestat_t pes[4];
int batch_size = opt->batch_size; // 对于pair-end数据必须是2的整数倍因为要保证pair-end数据在同一个线程里
int n_batch = (n + batch_size - 1) / batch_size;
double ctime, rtime;
ctime = cputime(); rtime = realtime();
global_bns = w->bns;
w->opt = opt;
if (w->n < n) {
w->n = n;
w->regs = realloc(w->regs, n * sizeof(mem_alnreg_v));
}
w->seqs = seqs; w->n_processed = n_processed;
w->pes = &pes[0];
#ifdef SHOW_PERF
w->n_reads = n; w->pes = &pes[0];
#if 1
if ((opt->flag & MEM_F_PE) && !pes0) { // infer insert sizes if not provided
int i, j;
w->calc_isize = 1;
for (i = 0; i < opt->n_threads; ++i)
for (j = 0; j < 4; ++j)
w->isize_arr[i][j].n = 0;
}
#endif
int64_t tmp_time = realtime_msec();
#endif
kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_work1, tmp_time);
//kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions
time_work1 += realtime_msec() - tmp_time;
tmp_time = realtime_msec();
#endif
if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0
else mem_pestat(opt, w->bns->l_pac, n, w->regs, pes); // otherwise, infer the insert size distribution from data
else mem_pestat(opt, w->bns->l_pac, n, w->isize_arr, pes); // otherwise, infer the insert size distribution from data
//else mem_pestat_old(opt, w->bns->l_pac, n, w->regs, pes);
}
kt_for(opt->n_threads, worker2, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_work2, tmp_time);
#endif
//kt_for(opt->n_threads, worker_sam, w, n_batch);
time_work2 += realtime_msec() - tmp_time;
//free(w.regs);
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime);

View File

@ -72,6 +72,7 @@ typedef struct {
int max_occ; // skip a seed if its occurence is larger than this value
int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
int n_threads; // number of threads
int batch_size; // batch size of seqs to process at one time
int chunk_size; // process chunk_size-bp sequences in a batch
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
@ -144,9 +145,20 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
typedef struct {
bwtintv_v mem, mem1, *tmpv[2];
buf_t *sw_buf, *seq_buf;
int64_t time_seed_1;
int64_t time_seed_2;
int64_t time_seed_3;
int64_t time_sa;
int64_t time_bsw;
} smem_aux_t;
typedef struct {
bwtintv_v mem;
uint64_v pos_arr;
} smem_v;
typedef struct {
int calc_isize;
const mem_opt_t *opt;
const bwt_t *bwt;
const FMTIndex *fmt;
@ -155,12 +167,15 @@ typedef struct {
const mem_pestat_t *pes;
smem_aux_t **aux;
bseq1_t *seqs;
smem_v **smem_arr;
mem_chain_v **chain_arr;
mem_alnreg_v *regs;
uint64_v **isize_arr;
int64_t n_processed;
int64_t n;
int64_t n_reads;
} mem_worker_t;
#ifdef __cplusplus
extern "C" {
#endif
@ -243,7 +258,8 @@ extern "C" {
* @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
* @param pes inferred insert size distribution (output)
*/
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
void mem_pestat_old(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, uint64_v **isize_arr, mem_pestat_t pes[4]);
#ifdef __cplusplus
}

View File

@ -46,6 +46,29 @@
#define MAPPING_BOUND 3.0
#define MAX_STDDEV 4.0
typedef struct {
int is_rev;
int qlen;
int tlen;
int xtra;
int64_t rb;
uint8_t *seq;
uint8_t *ref;
const mem_alnreg_t *a;
mem_alnreg_v *ma;
} align_arg_t;
typedef kvec_t(align_arg_t) align_arg_v;
typedef struct _matesw_node {
int l_seq;
int b_offset;
uint8_t *seq;
mem_alnreg_v *b;
mem_alnreg_v *a;
struct _matesw_node *next;
} matesw_node;
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
{
int64_t p2;
@ -69,7 +92,66 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
return j < r->n? r->a[j].score : opt->min_seed_len * opt->a;
}
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
/* Infer the insert-size distribution per pair orientation (FF, FR, RF, RR)
 * from insert sizes collected per thread during the alignment pass.
 *
 * isize_arr[t][d] holds the insert sizes gathered by thread t for
 * orientation d. The per-thread lists are merged; for each orientation we
 * compute the 25/50/75th percentiles, the outlier-trimmed mean and std.dev,
 * and the low/high bounds for proper-pair calling. Orientations with fewer
 * than MIN_DIR_CNT pairs, or proportionally fewer than MIN_DIR_RATIO of the
 * best orientation, are marked failed.
 *
 * l_pac and n are unused in this variant; they are kept so the signature
 * stays compatible with the old regs-based implementation (mem_pestat_old). */
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, uint64_v **isize_arr, mem_pestat_t pes[4])
{
	int i, j, d, max;
	uint64_v isize[4];
	memset(pes, 0, 4 * sizeof(mem_pestat_t));
	// BUGFIX: was `sizeof(kvec_t(int)) * 4` — wrong element type; only correct by
	// ABI accident. Zero the actual array.
	memset(isize, 0, sizeof(isize));
	// merge the per-thread candidate lists into one vector per orientation
	for (i=0; i<opt->n_threads; ++i) {
		for (d=0; d<4; ++d) {
			for (j=0; j<isize_arr[i][d].n; ++j) {
				kv_push(uint64_t, isize[d], isize_arr[i][d].a[j]);
			}
		}
	}
	if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);
	for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
		mem_pestat_t *r = &pes[d];
		uint64_v *q = &isize[d];
		int p25, p50, p75, x;
		if (q->n < MIN_DIR_CNT) {
			fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
			r->failed = 1;
			free(q->a);
			continue;
		} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
		ks_introsort_64(q->n, q->a);
		p25 = q->a[(int)(.25 * q->n + .499)];
		p50 = q->a[(int)(.50 * q->n + .499)];
		p75 = q->a[(int)(.75 * q->n + .499)];
		// first pass: outlier bounds for estimating mean/std.dev
		r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
		if (r->low < 1) r->low = 1;
		r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
		fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
		fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
		for (i = x = 0, r->avg = 0; i < q->n; ++i)
			if (q->a[i] >= r->low && q->a[i] <= r->high)
				r->avg += q->a[i], ++x;
		r->avg /= x;
		for (i = 0, r->std = 0; i < q->n; ++i)
			if (q->a[i] >= r->low && q->a[i] <= r->high)
				r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
		r->std = sqrt(r->std / x);
		fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
		// second pass: wider bounds used for proper-pair calling
		r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
		r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
		if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499);
		if (r->high < r->avg + MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
		if (r->low < 1) r->low = 1;
		fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
		free(q->a);
	}
	// fail any orientation that is rare relative to the dominant one
	for (d = 0, max = 0; d < 4; ++d)
		max = max > isize[d].n? max : isize[d].n;
	for (d = 0; d < 4; ++d)
		if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
			pes[d].failed = 1;
			fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
		}
}
void mem_pestat_old(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
{
int i, d, max;
uint64_v isize[4];
@ -205,6 +287,123 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
return n;
}
/* Record mate-SW result `aln` into arg->ma as a mem_alnreg_t, keeping the
 * vector sorted by descending score, then de-duplicate. Coordinates are
 * flipped back to the forward-reverse reference when the mate sequence was
 * reverse-complemented for alignment. */
static inline void add_aln(const mem_opt_t *opt, const bntseq_t *bns, align_arg_t *arg, kswr_t *aln)
{
	extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
	mem_alnreg_t b;
	int is_rev = arg->is_rev;
	const mem_alnreg_t *a = arg->a;
	int64_t l_pac = bns->l_pac;
	int64_t rb = arg->rb;
	mem_alnreg_v *ma = arg->ma;
	int i, tmp;
	memset(&b, 0, sizeof(mem_alnreg_t));
	b.rid = a->rid;
	b.is_alt = a->is_alt;
	// map query/reference intervals back to forward-strand coordinates
	b.qb = is_rev? arg->qlen - (aln->qe + 1) : aln->qb;
	b.qe = is_rev? arg->qlen - aln->qb : aln->qe + 1;
	b.rb = is_rev? (l_pac<<1) - (rb + aln->te + 1) : rb + aln->tb;
	b.re = is_rev? (l_pac<<1) - (rb + aln->tb) : rb + aln->te + 1;
	b.score = aln->score;
	b.csub = aln->score2; // second-best score reported by ksw
	b.secondary = -1;
	b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1; // rough seed-coverage proxy
	kv_push(mem_alnreg_t, *ma, b); // make room for a new element
	// move b s.t. ma is sorted
	for (i = 0; i < ma->n - 1; ++i) // find the insertion point
		if (ma->a[i].score < b.score) break;
	tmp = i;
	for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
	ma->a[i] = b;
	// NOTE(review): bns/pac/query passed as NULL — confirm mem_sort_dedup_patch
	// tolerates this and performs positional-only dedup in that case
	ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
}
/* Execute one deferred mate-SW job: align arg->seq against arg->ref and, if
 * the result is good enough, record it via add_aln(). Consumes the job's
 * buffers: arg->ref is always freed; arg->seq is freed only when it is the
 * malloc'd reverse-complement copy (is_rev). */
void ksw_align2_wrap(const mem_opt_t *opt, const bntseq_t *bns, align_arg_t *arg)
{
	kswr_t aln = ksw_align2(arg->qlen, arg->seq, arg->tlen, arg->ref, 5, opt->mat,
	                        opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, arg->xtra, 0);
	/* aln.qb < 0 signals an internal ksw failure; drop the hit in that case */
	if (aln.qb >= 0 && aln.score >= opt->min_seed_len)
		add_aln(opt, bns, arg, &aln);
	if (arg->is_rev) free(arg->seq);
	free(arg->ref);
}
/* Run two deferred mate-SW jobs back to back.
 * The jobs are independent: each carries its own seq/ref buffers and the
 * alignment step reads only those buffers (never the shared region vector),
 * so processing them sequentially via ksw_align2_wrap() is behaviorally
 * identical to the previous hand-duplicated version of its logic. */
void ksw_align2_wrap2(const mem_opt_t *opt, const bntseq_t *bns, align_arg_t *arg1, align_arg_t *arg2)
{
	ksw_align2_wrap(opt, bns, arg1);
	ksw_align2_wrap(opt, bns, arg2);
}
/* Queue mate-rescue Smith-Waterman jobs for one anchor alignment `a`.
 * For each pair orientation not already satisfied by a hit in `ma`, compute
 * the reference window where the mate is expected, fetch it, and append an
 * alignment job to `argv` (executed later in batches).
 * Returns the number of jobs queued (0 when a consistent pair already exists).
 * Ownership: each queued job owns its `ref` buffer and, when the mate is
 * reverse-complemented, its `seq` buffer; both are freed by ksw_align2_wrap(). */
int mem_matesw_new(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma, align_arg_v *argv)
{
	int64_t l_pac = bns->l_pac;
	int i, r, skip[4], n = 0, rid;
	for (r = 0; r < 4; ++r)
		skip[r] = pes[r].failed? 1 : 0;
	for (i = 0; i < ma->n; ++i) { // check which orientation has been found
		int64_t dist;
		r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
		if (dist >= pes[r].low && dist <= pes[r].high)
			skip[r] = 1;
	}
	if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW
	for (r = 0; r < 4; ++r) {
		int is_rev, is_larger;
		uint8_t *seq, *rev = 0, *ref = 0;
		int64_t rb, re;
		if (skip[r]) continue;
		is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
		is_larger = !(r>>1); // whether the mate has larger coordinate
		if (!is_rev) {
			rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high;
			re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length
		} else {
			rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands
			re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low;
		}
		if (rb < 0) rb = 0;
		if (re > l_pac<<1) re = l_pac<<1;
		// BUGFIX: rid was read uninitialized below when rb >= re skipped the fetch
		rid = -1;
		if (rb < re) ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid);
		if (a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening
			if (is_rev) {
				rev = malloc(l_ms); // this is the reverse complement of $ms
				for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4;
				seq = rev;
			} else seq = (uint8_t*)ms;
			int xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
			align_arg_t *arg = kv_pushp(align_arg_t, *argv);
			arg->is_rev = is_rev;
			arg->qlen = l_ms;
			arg->tlen = re - rb;
			arg->xtra = xtra;
			arg->rb = rb;
			arg->seq = seq;
			arg->ref = ref;
			arg->a = a;
			arg->ma = ma;
			++n;
		} else {
			free(ref); // window unusable; ref may be NULL (free(NULL) is a no-op)
		}
	}
	return n;
}
int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2])
{
pair64_v v, u;
@ -268,6 +467,81 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons
return ret;
}
// Create one work-list node that rescues mate `aidx` against the anchor
// candidate list barr[bidx] (skipped when that list is empty). Unhygienic
// macro: relies on s, a, barr and tail from the enclosing function scope.
#define NEW_NODE(bidx, aidx) \
	if (barr[bidx].n) { \
		matesw_node *node; \
		tail->next = malloc(sizeof(matesw_node)); \
		node = tail->next; \
		node->l_seq = s[aidx].l_seq; \
		node->b_offset = 0; \
		node->seq = (uint8_t*)s[aidx].seq; \
		node->b = barr + bidx; \
		node->a = a + aidx; \
		node->next = 0; \
		tail = node; \
	}
// Batched mate rescue over `data_size` reads (reads 2i and 2i+1 form a pair).
// For each read, the near-best alignments of its mate (within pen_unpaired of
// the top score, at most max_matesw) serve as rescue anchors. Pending reads
// form a singly linked work list; each round collects SW jobs into `argv`
// and runs them two at a time (ksw_align2_wrap2) until the list is empty.
void mem_matesw_batch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t *s, mem_alnreg_v *a, int data_size)
{
	int i, j;
	align_arg_v argv;
	mem_alnreg_v *barr = calloc(data_size, sizeof(mem_alnreg_v)); // anchor candidates per read
	matesw_node head; // dummy head simplifies node deletion
	matesw_node *tail = &head, *pre = &head;
	tail->next = 0;
	kv_init(argv);
	for (i = 0; i < data_size; i+=2) {
		// anchors from read i rescue mate i+1, and vice versa
		for (j = 0; j < a[i].n; ++j) {
			if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired && j < opt->max_matesw) {
				kv_push(mem_alnreg_t, barr[i], a[i].a[j]);
			}
		}
		NEW_NODE(i, i+1);
		for (j = 0; j < a[i+1].n; ++j) {
			if (a[i+1].a[j].score >= a[i+1].a[0].score - opt->pen_unpaired && j < opt->max_matesw) {
				kv_push(mem_alnreg_t, barr[i+1], a[i+1].a[j]);
			}
		}
		NEW_NODE(i+1, i);
	}
	tail = head.next;
	while(tail) { // repeat rounds until every node has exhausted its anchors
		argv.n = 0; // jobs queued this round
		while (tail) {
			int nn = 0;
			// advance this node's anchor cursor until a job is queued or anchors run out
			while (tail->b_offset < tail->b->n && !nn) {
				nn = mem_matesw_new(opt, bns, pac, pes, &tail->b->a[tail->b_offset], tail->l_seq, tail->seq, tail->a, &argv);
				++ tail->b_offset;
			}
			if (!nn || tail->b_offset == tail->b->n) { // delete node
				tail = tail->next;
				free(pre->next);
				pre->next = tail;
			} else {
				pre = tail;
				tail = tail->next;
			}
		}
		if (argv.n) {
			// run queued jobs pairwise; a trailing odd job runs singly
			for (i=0; i<argv.n-1; i+=2) {
				ksw_align2_wrap2(opt, bns, &argv.a[i], &argv.a[i+1]);
			}
			if (i < argv.n) {
				ksw_align2_wrap(opt, bns, &argv.a[i]);
			}
		}
		tail = head.next;
		pre = &head;
	}
	for (i=0; i<data_size; ++i) {
		free(barr[i].a);
	}
	free(barr);
	free(argv.a);
}
void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m);
void mem_reorder_primary5(int T, mem_alnreg_v *a);

14
ert.sh 100755
View File

@ -0,0 +1,14 @@
#!/bin/bash
# Benchmark wrapper: time ert-bwa-mem2 on the NA12878 paired-end dataset.
thread=32
n_r1=~/fastq/na12878/na12878_r1.fq
n_r2=~/fastq/na12878/na12878_r2.fq
#n_r1=~/fastq/ZY2105177532213010_L4_1.fq
#n_r2=~/fastq/ZY2105177532213010_L4_2.fq
reference=~/reference/ert/human_g1k_v37_decoy.fasta
out=/dev/null  # discard SAM output; only the runtime matters here
# Single-quote the read-group string so the literal \t reaches the aligner
# unmangled, and quote path variables so they survive whitespace.
time /home/zzh/work/ert-bwa-mem2/bwa-mem2 mem -t "$thread" -M -R '@RG\tID:normal\tSM:normal\tPL:illumina\tLB:normal\tPG:bwa' \
    -Z "$reference" \
    "$n_r1" \
    "$n_r2" \
    -o "$out"

View File

@ -183,7 +183,7 @@ int main_mem(int argc, char *argv[])
aux.opt = opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:b:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == '1') no_mt_io = 1;
else if (c == 'x') mode = optarg;
@ -193,6 +193,7 @@ int main_mem(int argc, char *argv[])
else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1;
else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1;
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
else if (c == 'b') opt->batch_size = atoi(optarg) >> 1 << 1, opt->batch_size = opt->batch_size > 1? opt->batch_size : 512;
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
else if (c == 'a') opt->flag |= MEM_F_ALL;
else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
@ -291,11 +292,15 @@ int main_mem(int argc, char *argv[])
}
if (opt->n_threads < 1) opt->n_threads = 1;
if (opt->batch_size < 1) opt->batch_size = 512;
fprintf(stderr, "batch_size: %d\n", opt->batch_size);
if (optind + 1 >= argc || optind + 3 < argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
fprintf(stderr, "Algorithm options:\n\n");
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
fprintf(stderr, " -b INT batch size of reads to process at one time [%d]\n", opt->batch_size);
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
@ -436,15 +441,23 @@ int main_mem(int argc, char *argv[])
}
#ifdef SHOW_PERF
int64_t avg_seed_1 = 0, avg_seed_2 = 0, avg_seed_3 = 0, avg_sa = 0, avg_bsw = 0;
for (i = 0; i < opt->n_threads; ++i) {
avg_seed_1 += aux.w->aux[i]->time_seed_1;
avg_seed_2 += aux.w->aux[i]->time_seed_2;
avg_seed_3 += aux.w->aux[i]->time_seed_3;
avg_sa += aux.w->aux[i]->time_sa;
avg_bsw += aux.w->aux[i]->time_bsw;
}
fprintf(stderr, "\n");
fprintf(stderr, "time_work1: %f s\n", time_work1 / 1000.0 / 1);
fprintf(stderr, "time_work2: %f s\n", time_work2 / 1000.0 / 1);
fprintf(stderr, "time_flt_chain: %f s\n", time_flt_chain / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_1: %f s\n", time_seed_1 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_2: %f s\n", time_seed_2 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_3: %f s\n", time_seed_3 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads);
fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_work1: %f s\n", time_work1 / 1000.0);
fprintf(stderr, "time_work2: %f s\n", time_work2 / 1000.0);
fprintf(stderr, "time_process_seq: %f s\n", (time_work1 + time_work2) / 1000.0);
fprintf(stderr, "time_seed_1: %f s\n", avg_seed_1 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_2: %f s\n", avg_seed_2 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_3: %f s\n", avg_seed_3 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_sa: %f s\n", avg_sa / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bsw: %f s\n", avg_bsw / 1000.0 / opt->n_threads);
fprintf(stderr, "\n");
fclose(gfp1);

View File

@ -1005,4 +1005,25 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
}
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
return sa;
}
/*
 * Compute how many backward LF steps are needed to move suffix-array row k
 * onto a sampled SA position, without resolving the SA value itself.
 *
 * fmt->sa_intv is assumed to be a power of two (mask arithmetic relies on
 * it — TODO confirm at index-build time).
 *
 * Returns the step count packed into the high 16 bits and the sampled-SA
 * bucket index (k / fmt->sa_intv) in the low 48 bits, so the caller can
 * later add the offset to the sampled SA entry (cf. fmt_sa()).
 */
bwtint_t fmt_sa_offset(const FMTIndex *fmt, bwtint_t k)
{
bwtint_t sa = 0, mask = fmt->sa_intv - 1;
bwtint_t k1, k2;
/* Walk backward until k is a multiple of the sampling interval. */
while (k & mask)
{
++sa;
/* NOTE(review): fmt_previous_line appears to yield the one-step (k1)
 * and two-step (k2) LF predecessors in a single call — semantics
 * inferred from the ++sa pattern below; verify against fmt.c. */
fmt_previous_line(fmt, k, &k1, &k2);
/* Prefetch the occurrence-table block for the two-step candidate,
 * which becomes the next iteration's row in the common case. */
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
if (!(k1 & mask))
{
/* One-step predecessor already lands on a sampled row; stop here. */
k = k1;
break;
}
/* Take the two-step predecessor: account for the second step. */
++sa;
k = k2;
}
/* Pack: high 16 bits = backward-step count, low 48 bits = SA bucket. */
sa = (sa << 48) | (k / fmt->sa_intv);
return sa;
}

View File

@ -112,5 +112,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k);
bwtint_t fmt_sa_offset(const FMTIndex *fmt, bwtint_t k);
#endif

View File

@ -436,7 +436,7 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长
SWAP_DATA_POINTER;
}
// free(mem);
//free(mem);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;

12
mem2.sh 100755
View File

@ -0,0 +1,12 @@
# Benchmark bwa-mem2 on the NA12878 paired-end data set.
# Alignments are discarded (-o /dev/null); `time` reports to stderr.
thread=32
n_r1=~/fastq/na12878/na12878_r1.fq
n_r2=~/fastq/na12878/na12878_r2.fq
reference=~/reference/mem2/human_g1k_v37_decoy.fasta
out=/dev/null

# Quote all expansions so paths containing spaces survive word splitting.
# The read-group string is single-quoted so bwa receives literal \t
# separators (identical to the previous unquoted \\t form).
time /home/zzh/work/bwa-mem2/bwa-mem2 mem -t "$thread" -M -R '@RG\tID:normal\tSM:normal\tPL:illumina\tLB:normal\tPG:bwa' \
	"$reference" \
	"$n_r1" \
	"$n_r2" \
	-o "$out"

4
na.sh 100755
View File

@ -0,0 +1,4 @@
# Run the four aligner benchmark wrappers on NA12878 in sequence,
# capturing each wrapper's stdout+stderr into its own log file.
# `time` output for each run still goes to this shell's stderr.
for tool in bwa sbwa mem2 ert; do
	time "./${tool}.sh" &> "${tool}-na12878.log"
done

14
run.sh
View File

@ -1,10 +1,12 @@
thread=32
n_r1=~/fastq/ZY2105177532213010_L4_1.fq
n_r2=~/fastq/ZY2105177532213010_L4_2.fq
#n_r1=~/fastq/ZY2105177532213010_L4_1.fq
#n_r2=~/fastq/ZY2105177532213010_L4_2.fq
#n_r1=~/fastq/na12878/nas_1.fq
#n_r2=~/fastq/na12878/nas_2.fq
#n_r1=~/fastq/na12878/na_1.fq
#n_r2=~/fastq/na12878/na_2.fq
n_r1=~/data/fastq/na12878/na_1.fq
n_r2=~/data/fastq/na12878/na_2.fq
#n_r1=~/fastq/na12878/na12878_r1.fq
#n_r2=~/fastq/na12878/na12878_r2.fq
#n_r1=~/fastq/n_r1.fq
#n_r2=~/fastq/n_r2.fq
#n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq
@ -24,7 +26,7 @@ n_r2=~/fastq/ZY2105177532213010_L4_2.fq
#n_r2=~/fastq/diff_r2.fq
#n_r1=~/fastq/d_r1.fq
#n_r2=~/fastq/d_r2.fq
reference=~/reference/human_g1k_v37_decoy.fasta
reference=~/data/reference/human_g1k_v37_decoy.fasta
#out=./all.sam
#out=./sn.sam
#out=./ssn-x1.sam
@ -42,7 +44,7 @@ out=/dev/null
# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \
# -o /dev/null
time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
time ./bwa mem -t $thread -b 256 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
$reference \
$n_r1 \
$n_r2 \

12
sbwa.sh 100755
View File

@ -0,0 +1,12 @@
# Benchmark the sbwa build on the NA12878 paired-end data set, using a
# read batch size of 256 (-b 256). Alignments are discarded.
thread=32
n_r1=~/fastq/na12878/na12878_r1.fq
n_r2=~/fastq/na12878/na12878_r2.fq
reference=~/reference/human_g1k_v37_decoy.fasta
out=/dev/null

# Quote all expansions so paths containing spaces survive word splitting.
# The read-group string is single-quoted so bwa receives literal \t
# separators (identical to the previous unquoted \\t form).
time /home/zzh/work/sbwa/bwa mem -t "$thread" -b 256 -M -R '@RG\tID:normal\tSM:normal\tPL:illumina\tLB:normal\tPG:bwa' \
	"$reference" \
	"$n_r1" \
	"$n_r2" \
	-o "$out"

View File

@ -46,7 +46,7 @@ extern int64_t time_ksw_extend2,
time_bwt_sa,
time_bwt_sa_read,
time_bns,
time_work1,
time_work1,
time_work2,
time_flt_chain;