添加了SAM_EXACT宏定义,可以选择结果精确(与bwa一致),或者更快一点,容许不一致,其实没有损失精度,结果不一致只是bwa里的排序不稳定导致的
This commit is contained in:
parent
3dfff494f7
commit
2c1f58fb50
2
Makefile
2
Makefile
|
|
@ -9,7 +9,7 @@ FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
|
|||
USE_MT_READ= -DUSE_MT_READ
|
||||
|
||||
AR= ar
|
||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL
|
||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) $(SAM_EXACT) -DUSE_AVX2 -DKSW_EQUAL
|
||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
|
||||
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o yarn.o
|
||||
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
|
|
|
|||
145
paired_sam.c
145
paired_sam.c
|
|
@ -367,7 +367,6 @@ static int check_add_align(const mem_opt_t* opt, kswr_avx_t aln, int is_rev, int
|
|||
mem_alnreg_v* ma, int64_t rb) {
|
||||
int res = 0;
|
||||
if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
|
||||
int i, tmp;
|
||||
mem_alnreg_t b;
|
||||
memset(&b, 0, sizeof(mem_alnreg_t));
|
||||
b.rid = a->rid;
|
||||
|
|
@ -383,9 +382,10 @@ static int check_add_align(const mem_opt_t* opt, kswr_avx_t aln, int is_rev, int
|
|||
// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev,
|
||||
// is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re);
|
||||
kv_push(mem_alnreg_t, *ma, b); // make room for a new element
|
||||
|
||||
#ifdef SAM_EXACT
|
||||
// move b s.t. ma is sorted
|
||||
#if 1
|
||||
#if 0
|
||||
int i, tmp;
|
||||
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
|
||||
if (ma->a[i].score < b.score)
|
||||
break;
|
||||
|
|
@ -393,91 +393,8 @@ static int check_add_align(const mem_opt_t* opt, kswr_avx_t aln, int is_rev, int
|
|||
for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i - 1];
|
||||
ma->a[i] = b;
|
||||
ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
|
||||
#else
|
||||
// 改一下上边这个低效的排序合并
|
||||
|
||||
#if 0
|
||||
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
|
||||
if (ma->a[i].score < b.score)
|
||||
break;
|
||||
tmp = i;
|
||||
for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i - 1];
|
||||
ma->a[i] = b;
|
||||
|
||||
i = 0;
|
||||
mem_alnreg_t* p = &b;
|
||||
int need_merge = 0;
|
||||
while (i < ma->n) {
|
||||
if (i == tmp) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
mem_alnreg_t* q = &ma->a[i];
|
||||
if (p->rid == q->rid && (p->rb < q->re + opt->max_chain_gap || q->rb < p->re + opt->max_chain_gap)) {
|
||||
need_merge = 1;
|
||||
break;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if (need_merge) {
|
||||
ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
|
||||
}
|
||||
#else
|
||||
mem_alnreg_t* p = &ma->a[ma->n - 1];
|
||||
int merged = 0; // 是否合并的flag
|
||||
int unsort_idx = ma->n - 1; // 新加入的aln还没被排序
|
||||
for (i = 0; i < ma->n - 1; ++i) {
|
||||
mem_alnreg_t* q = &ma->a[i];
|
||||
mem_alnreg_t* pq = &ma->a[i]; // 待合并
|
||||
if (p->rid == q->rid && (p->rb < q->re + opt->max_chain_gap || q->rb < p->re + opt->max_chain_gap)) {
|
||||
merged = 1; // 有覆盖或交叉,需要合并
|
||||
unsort_idx = i;
|
||||
if (p->re < q->re) { // p在前,交换一下,需要q在前,p在后
|
||||
mem_alnreg_t* tmpp = p; p = q; q = tmpp;
|
||||
}
|
||||
int64_t or, oq, mr, mq; // ref overlap, query overlap, min ref len, min query len
|
||||
int score, w;
|
||||
or = q->re - p->rb; // overlap length on the reference
|
||||
oq = q->qb < p->qb ? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
|
||||
mr = q->re - q->rb < p->re - p->rb ? q->re - q->rb : p->re - p->rb; // min ref len in alignment
|
||||
mq = q->qe - q->qb < p->qe - p->qb ? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment
|
||||
if (or > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant
|
||||
if (p->score < q->score) {
|
||||
if (pq != q)
|
||||
*pq = *q;
|
||||
} else {
|
||||
if (pq != p)
|
||||
*pq = *p;
|
||||
}
|
||||
} else if (q->rb < p->rb && (score = mem_patch_reg(opt, 0, 0, 0, q, p, &w)) > 0) { // then merge q into p
|
||||
pq->n_comp += q->n_comp + 1;
|
||||
pq->seedcov = p->seedcov > q->seedcov ? p->seedcov : q->seedcov;
|
||||
pq->sub = p->sub > q->sub ? p->sub : q->sub;
|
||||
pq->csub = p->csub > q->csub ? p->csub : q->csub;
|
||||
pq->qb = q->qb, pq->rb = q->rb;
|
||||
pq->truesc = pq->score = score;
|
||||
pq->w = w;
|
||||
}
|
||||
}
|
||||
}
|
||||
// 排序
|
||||
if (merged) {
|
||||
ma->n -= 1; // 因为合并了(覆盖或者交叉合并等)
|
||||
b = ma->a[unsort_idx];
|
||||
for (i = unsort_idx; i < ma->n - 1; ++i) {
|
||||
ma->a[i] = ma->a[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
|
||||
if (ma->a[i].score < b.score)
|
||||
break;
|
||||
tmp = i;
|
||||
for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i - 1];
|
||||
ma->a[i] = b;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
res = 1;
|
||||
}
|
||||
return res;
|
||||
|
|
@ -660,8 +577,11 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
endIdx = w->n_reads;
|
||||
|
||||
int id = 0, i = 0, j = 0, k = 0;
|
||||
|
||||
#ifdef SAM_EXACT
|
||||
mem_alnreg_v bb[2];
|
||||
kv_init(bb[0]); kv_init(bb[1]);
|
||||
#endif
|
||||
for (id = startIdx; id < endIdx; id += 2) {
|
||||
// 初始化变量
|
||||
bseq1_t* s = &w->seqs[id];
|
||||
|
|
@ -679,30 +599,41 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
#endif
|
||||
|
||||
int n[2] = {0};
|
||||
#ifndef SAM_EXACT
|
||||
int nn[2] = {0};
|
||||
//
|
||||
#endif
|
||||
|
||||
#ifdef SAM_EXACT
|
||||
_clear_vec(bb[0]);
|
||||
_clear_vec(bb[1]);
|
||||
|
||||
for (i = 0; i < 2; ++i)
|
||||
for (j = 0; j < s[!i].msw.n; ++j) {
|
||||
msw_task_t* t = s[!i].msw.a[j];
|
||||
kv_push(mem_alnreg_t, bb[i], a[i].a[t->aj]);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int si = !i;
|
||||
#ifndef SAM_EXACT
|
||||
int origin_n = a[si].n;
|
||||
#endif
|
||||
// 这里应该先给task排序,因为u8和i16是分开排序的,需要合在一起
|
||||
for (k = 0; k < s[si].msw.n; ++k) {
|
||||
msw_task_t* t = s[si].msw.a[k];
|
||||
//mem_alnreg_t* b = &a[i].a[t->aj];
|
||||
#ifdef SAM_EXACT
|
||||
mem_alnreg_t* b = &bb[i].a[k];
|
||||
#else
|
||||
mem_alnreg_t* b = &a[i].a[t->aj];
|
||||
#endif
|
||||
mem_alnreg_v* ma = &a[si];
|
||||
uint8_t skip = 0;
|
||||
if (n[si]) { // 添加了新的aln,需要检测skip
|
||||
//for (j = origin_n; j < ma->n; ++j){
|
||||
#ifdef SAM_EXACT
|
||||
for (j = 0; j < ma->n; ++j) { // check which orinentation has been found
|
||||
#else
|
||||
for (j = origin_n; j < ma->n; ++j) {
|
||||
#endif
|
||||
int64_t dist;
|
||||
int r;
|
||||
r = mem_infer_dir(bns->l_pac, b->rb, ma->a[j].rb, &dist);
|
||||
|
|
@ -710,9 +641,9 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
skip |= 1 << r;
|
||||
}
|
||||
}
|
||||
// 检查新添加的aln是不是把当前的任务skip掉了
|
||||
if ((t->skip | skip) == 15)
|
||||
continue;
|
||||
// 检查新添加的aln是不是把当前的任务skip掉了
|
||||
#if 0
|
||||
kswr_avx_t aln = t->aln; // fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id +
|
||||
// w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb);
|
||||
|
|
@ -721,37 +652,16 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
aln.qe, aln.score2, aln.te2, aln.tb, aln.qb); fflush(stderr);
|
||||
}
|
||||
#endif
|
||||
#ifndef SAM_EXACT
|
||||
nn[si] += 1;
|
||||
#endif
|
||||
n[si] += check_add_align(opt, t->aln, t->is_rev, bns->l_pac, b, s[si].l_seq, (uint8_t*)s[si].seq, ma, t->rb);
|
||||
}
|
||||
}
|
||||
// 处理完2个pair read之后再排序,因为上面操作有插入,排序后之前记录的索引就无效了,所以上面不能排序,要两个pair read处理完之后再排序
|
||||
#if 1
|
||||
#ifndef SAM_EXACT
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int tmp;
|
||||
if (nn[i] > 0) {
|
||||
#if 0
|
||||
int origin_n = a[i].n;
|
||||
int start_n = a[i].n - n[i];
|
||||
for (j = 0; j < n[i]; ++j) { // 在这里排序
|
||||
// move b s.t. ma is sorted
|
||||
int nidx = origin_n - n[i] + j;
|
||||
mem_alnreg_t b = a[i].a[nidx]; // 用指针一定要慎重,注意指向的地址内容会变
|
||||
for (k = 0; k < nidx; ++k) // find the insertion point
|
||||
if (a[i].a[k].score < b.score)
|
||||
break;
|
||||
tmp = k;
|
||||
for (k = start_n; k > tmp; --k) a[i].a[k] = a[i].a[k - 1];
|
||||
a[i].a[k] = b;
|
||||
// 每添加一个aln都要sort-dedup一次
|
||||
#if 0
|
||||
start_n = mem_sort_dedup_patch(opt, 0, 0, 0, start_n + 1, a[i].a);
|
||||
a[i].n = start_n;
|
||||
#else
|
||||
start_n += 1;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a);
|
||||
}
|
||||
}
|
||||
|
|
@ -761,8 +671,11 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
// 这里需要传入全局id
|
||||
generate_sam(opt, bns, pac, pes, (w->n_processed + id) >> 1, s, a, ss, w->n_processed, tid);
|
||||
}
|
||||
|
||||
#ifdef SAM_EXACT
|
||||
_destory_clear_vec(bb[0]);
|
||||
_destory_clear_vec(bb[1]);
|
||||
#endif
|
||||
}
|
||||
|
||||
// 划分matesw任务
|
||||
|
|
|
|||
Loading…
Reference in New Issue