msw添加aln时,尝试了新的高效率的方式,结果还是差一些,代码保留,等会儿clean一下
This commit is contained in:
parent
e335e79543
commit
3dfff494f7
1
Makefile
1
Makefile
|
|
@ -2,6 +2,7 @@ CC= gcc
|
|||
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw -O3
|
||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||
|
||||
SAM_EXACT= -DSAM_EXACT
|
||||
SHOW_PERF= -DSHOW_PERF
|
||||
SHOW_DATA_PERF= -DSHOW_DATA_PERF
|
||||
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
|
||||
|
|
|
|||
10
bwamem.c
10
bwamem.c
|
|
@ -482,15 +482,15 @@ int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_
|
|||
int m, i, j;
|
||||
if (n <= 1) return n;
|
||||
ks_introsort(mem_ars2, n, a); // sort by the END position, not START!
|
||||
for (i = 0; i < n; ++i) a[i].n_comp = 1;
|
||||
for (i = 0; i < n; ++i) a[i].n_comp = 1; // 设置包含的chain只有1个
|
||||
for (i = 1; i < n; ++i) {
|
||||
mem_alnreg_t *p = &a[i];
|
||||
if (p->rid != a[i-1].rid || p->rb >= a[i-1].re + opt->max_chain_gap) continue; // then no need to go into the loop below
|
||||
for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re + opt->max_chain_gap; --j) {
|
||||
mem_alnreg_t *q = &a[j];
|
||||
int64_t or, oq, mr, mq;
|
||||
int score, w;
|
||||
if (q->qe == q->qb) continue; // a[j] has been excluded
|
||||
mem_alnreg_t *q = &a[j]; // 遍历i之前的所有aln
|
||||
int64_t or, oq, mr, mq; // ref overlap, query overlap, min ref len, min query len
|
||||
int score, w;
|
||||
if (q->qe == q->qb) continue; // a[j] has been excluded
|
||||
or = q->re - p->rb; // overlap length on the reference
|
||||
oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
|
||||
mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment
|
||||
|
|
|
|||
115
paired_sam.c
115
paired_sam.c
|
|
@ -384,14 +384,101 @@ static int check_add_align(const mem_opt_t* opt, kswr_avx_t aln, int is_rev, int
|
|||
// is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re);
|
||||
kv_push(mem_alnreg_t, *ma, b); // make room for a new element
|
||||
// move b s.t. ma is sorted
|
||||
#if 1
|
||||
#if 0
|
||||
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
|
||||
if (ma->a[i].score < b.score)
|
||||
break;
|
||||
tmp = i;
|
||||
for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i - 1];
|
||||
ma->a[i] = b;
|
||||
res = 1;
|
||||
ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
|
||||
#else
|
||||
// 改一下上边这个低效的排序合并
|
||||
|
||||
#if 0
|
||||
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
|
||||
if (ma->a[i].score < b.score)
|
||||
break;
|
||||
tmp = i;
|
||||
for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i - 1];
|
||||
ma->a[i] = b;
|
||||
|
||||
i = 0;
|
||||
mem_alnreg_t* p = &b;
|
||||
int need_merge = 0;
|
||||
while (i < ma->n) {
|
||||
if (i == tmp) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
mem_alnreg_t* q = &ma->a[i];
|
||||
if (p->rid == q->rid && (p->rb < q->re + opt->max_chain_gap || q->rb < p->re + opt->max_chain_gap)) {
|
||||
need_merge = 1;
|
||||
break;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if (need_merge) {
|
||||
ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
|
||||
}
|
||||
#else
|
||||
mem_alnreg_t* p = &ma->a[ma->n - 1];
|
||||
int merged = 0; // 是否合并的flag
|
||||
int unsort_idx = ma->n - 1; // 新加入的aln还没被排序
|
||||
for (i = 0; i < ma->n - 1; ++i) {
|
||||
mem_alnreg_t* q = &ma->a[i];
|
||||
mem_alnreg_t* pq = &ma->a[i]; // 待合并
|
||||
if (p->rid == q->rid && (p->rb < q->re + opt->max_chain_gap || q->rb < p->re + opt->max_chain_gap)) {
|
||||
merged = 1; // 有覆盖或交叉,需要合并
|
||||
unsort_idx = i;
|
||||
if (p->re < q->re) { // p在前,交换一下,需要q在前,p在后
|
||||
mem_alnreg_t* tmpp = p; p = q; q = tmpp;
|
||||
}
|
||||
int64_t or, oq, mr, mq; // ref overlap, query overlap, min ref len, min query len
|
||||
int score, w;
|
||||
or = q->re - p->rb; // overlap length on the reference
|
||||
oq = q->qb < p->qb ? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
|
||||
mr = q->re - q->rb < p->re - p->rb ? q->re - q->rb : p->re - p->rb; // min ref len in alignment
|
||||
mq = q->qe - q->qb < p->qe - p->qb ? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment
|
||||
if (or > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant
|
||||
if (p->score < q->score) {
|
||||
if (pq != q)
|
||||
*pq = *q;
|
||||
} else {
|
||||
if (pq != p)
|
||||
*pq = *p;
|
||||
}
|
||||
} else if (q->rb < p->rb && (score = mem_patch_reg(opt, 0, 0, 0, q, p, &w)) > 0) { // then merge q into p
|
||||
pq->n_comp += q->n_comp + 1;
|
||||
pq->seedcov = p->seedcov > q->seedcov ? p->seedcov : q->seedcov;
|
||||
pq->sub = p->sub > q->sub ? p->sub : q->sub;
|
||||
pq->csub = p->csub > q->csub ? p->csub : q->csub;
|
||||
pq->qb = q->qb, pq->rb = q->rb;
|
||||
pq->truesc = pq->score = score;
|
||||
pq->w = w;
|
||||
}
|
||||
}
|
||||
}
|
||||
// 排序
|
||||
if (merged) {
|
||||
ma->n -= 1; // 因为合并了(覆盖或者交叉合并等)
|
||||
b = ma->a[unsort_idx];
|
||||
for (i = unsort_idx; i < ma->n - 1; ++i) {
|
||||
ma->a[i] = ma->a[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
|
||||
if (ma->a[i].score < b.score)
|
||||
break;
|
||||
tmp = i;
|
||||
for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i - 1];
|
||||
ma->a[i] = b;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
res = 1;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
|
@ -581,7 +668,7 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
mem_alnreg_v* a = &w->regs[id];
|
||||
seq_sam_t* ss = &w->sams[id];
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
int cmp = strcmp("ERR194147.1629456", s[0].name) == 0;
|
||||
// int cmp = (id + w->n_processed) == 202924;
|
||||
if (cmp == 1) {
|
||||
|
|
@ -592,7 +679,7 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
#endif
|
||||
|
||||
int n[2] = {0};
|
||||
// , nn[2] = {0};
|
||||
int nn[2] = {0};
|
||||
//
|
||||
_clear_vec(bb[0]);
|
||||
_clear_vec(bb[1]);
|
||||
|
|
@ -609,11 +696,12 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
// 这里应该先给task排序,因为u8和i16是分开排序的,需要合在一起
|
||||
for (k = 0; k < s[si].msw.n; ++k) {
|
||||
msw_task_t* t = s[si].msw.a[k];
|
||||
// mem_alnreg_t* b = &a[i].a[t->aj];
|
||||
//mem_alnreg_t* b = &a[i].a[t->aj];
|
||||
mem_alnreg_t* b = &bb[i].a[k];
|
||||
mem_alnreg_v* ma = &a[si];
|
||||
uint8_t skip = 0;
|
||||
if (n[si]) { // 添加了新的aln,需要检测skip
|
||||
//for (j = origin_n; j < ma->n; ++j){
|
||||
for (j = 0; j < ma->n; ++j) { // check which orinentation has been found
|
||||
int64_t dist;
|
||||
int r;
|
||||
|
|
@ -622,11 +710,10 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
skip |= 1 << r;
|
||||
}
|
||||
}
|
||||
// if (skip == 15)
|
||||
if ((t->skip | skip) == 15)
|
||||
continue;
|
||||
// if ((t->skip | skip) != 15) { // 检查新添加的aln是不是把当前的任务skip掉了
|
||||
#if 1
|
||||
// 检查新添加的aln是不是把当前的任务skip掉了
|
||||
#if 0
|
||||
kswr_avx_t aln = t->aln; // fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id +
|
||||
// w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb);
|
||||
if (cmp) {
|
||||
|
|
@ -634,16 +721,16 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
aln.qe, aln.score2, aln.te2, aln.tb, aln.qb); fflush(stderr);
|
||||
}
|
||||
#endif
|
||||
// nn[si] += 1;
|
||||
nn[si] += 1;
|
||||
n[si] += check_add_align(opt, t->aln, t->is_rev, bns->l_pac, b, s[si].l_seq, (uint8_t*)s[si].seq, ma, t->rb);
|
||||
//}
|
||||
}
|
||||
}
|
||||
// 处理完2个pair read之后再排序,因为上面操作有插入,排序后之前记录的索引就无效了,所以上面不能排序,要两个pair read处理完之后再排序
|
||||
#if 0
|
||||
#if 1
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int tmp;
|
||||
if (nn[i] > 0) {
|
||||
#if 0
|
||||
int origin_n = a[i].n;
|
||||
int start_n = a[i].n - n[i];
|
||||
for (j = 0; j < n[i]; ++j) { // 在这里排序
|
||||
|
|
@ -657,15 +744,15 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
for (k = start_n; k > tmp; --k) a[i].a[k] = a[i].a[k - 1];
|
||||
a[i].a[k] = b;
|
||||
// 每添加一个aln都要sort-dedup一次
|
||||
#if 1
|
||||
#if 0
|
||||
start_n = mem_sort_dedup_patch(opt, 0, 0, 0, start_n + 1, a[i].a);
|
||||
a[i].n = start_n;
|
||||
#else
|
||||
start_n += 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
// a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a);
|
||||
#endif
|
||||
a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
@ -674,6 +761,8 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
// 这里需要传入全局id
|
||||
generate_sam(opt, bns, pac, pes, (w->n_processed + id) >> 1, s, a, ss, w->n_processed, tid);
|
||||
}
|
||||
_destory_clear_vec(bb[0]);
|
||||
_destory_clear_vec(bb[1]);
|
||||
}
|
||||
|
||||
// 划分matesw任务
|
||||
|
|
|
|||
17
run.sh
17
run.sh
|
|
@ -1,11 +1,14 @@
|
|||
thread=64
|
||||
thread=1
|
||||
|
||||
make clean; make -j 32
|
||||
|
||||
n1=~/data/dataset/D2/n1.fq.gz
|
||||
n2=~/data/dataset/D2/n2.fq.gz
|
||||
#n1=~/data/dataset/D2/d1.fq
|
||||
#n2=~/data/dataset/D2/d2.fq
|
||||
#n1=~/data/dataset/D2/n1.fq.gz
|
||||
#n2=~/data/dataset/D2/n2.fq.gz
|
||||
n1=~/data/dataset/D2/d1.fq
|
||||
n2=~/data/dataset/D2/d2.fq
|
||||
|
||||
#n1=~/data/SRR25735656_1.fastq.gz
|
||||
#n2=~/data/SRR25735656_2.fastq.gz
|
||||
|
||||
#n1=/file-data/zzh/dataset/D1/45mr1.fq.gz
|
||||
#n2=/file-data/zzh/dataset/D1/45mr2.fq.gz
|
||||
|
|
@ -14,8 +17,8 @@ reference=~/data/reference/fmt/human_g1k_v37_decoy.fasta
|
|||
|
||||
#out=/dev/null
|
||||
#out=./oldsam-D2-out.sam
|
||||
out=./D2-1.sam
|
||||
#out=./d.sam
|
||||
#out=./D2-1.sam
|
||||
out=./d.sam
|
||||
prog=./fastalign
|
||||
#prog=/home/zzh/fastbwa
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue