修正了几个bug,但是目前结果还是不一致,还得继续调试
This commit is contained in:
parent
e997699a47
commit
fb0cd0f663
2
Makefile
2
Makefile
|
|
@ -1,5 +1,5 @@
|
|||
CC= gcc
|
||||
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw #-O3
|
||||
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw -O3
|
||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||
|
||||
SHOW_PERF= -DSHOW_PERF
|
||||
|
|
|
|||
|
|
@ -16,7 +16,8 @@
|
|||
|
||||
// 需要做mate sw的read
|
||||
typedef struct {
|
||||
int is_rev; // seq是否在反向互补链上
|
||||
uint8_t is_rev; // seq是否在反向互补链上
|
||||
uint8_t skip; // 记录任务创建时候的skip状态
|
||||
int xtra;
|
||||
int64_t rb, re; // ref的起始截止位置,左闭右开
|
||||
int64_t seq_id; // 对应的当前数据块的seq id
|
||||
|
|
|
|||
148
paired_sam.c
148
paired_sam.c
|
|
@ -93,7 +93,7 @@ static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const ui
|
|||
return; // consistent pair exist; no need to perform SW
|
||||
int task_order = 0;
|
||||
for (r = 0; r < 4; ++r) {
|
||||
int is_rev, is_larger;
|
||||
uint8_t is_rev, is_larger;
|
||||
int64_t rb, re; // 左闭右开
|
||||
if (skip[r])
|
||||
continue;
|
||||
|
|
@ -124,6 +124,7 @@ static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const ui
|
|||
else
|
||||
p = kv_pushp(msw_task_t, *msw16);
|
||||
p->is_rev = is_rev;
|
||||
p->skip = skip[0] | skip[1] << 1 | skip[2] << 2 | skip[3] << 3;
|
||||
p->xtra = xtra;
|
||||
p->rb = rb;
|
||||
p->re = re;
|
||||
|
|
@ -396,64 +397,21 @@ static int check_add_align(kswr_avx_t aln, int min_seed_len, int is_rev, int64_t
|
|||
|
||||
#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
|
||||
|
||||
// 最后再计算并生成sam数据
|
||||
static void workder_gen_sam(void* data, int idx, int tid) {
|
||||
mem_worker_t* w = (mem_worker_t*)data;
|
||||
const mem_opt_t* opt = w->opt;
|
||||
const bntseq_t* bns = w->bns;
|
||||
const uint8_t* pac = w->pac;
|
||||
const mem_pestat_t *pes = w->pes;
|
||||
// 根据比对结果生成sam
|
||||
void generate_sam(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2],
|
||||
seq_sam_t ss[2], int64_t n_processed, int tid) {
|
||||
|
||||
int startIdx = idx * opt->batch_size;
|
||||
int endIdx = (idx + 1) * opt->batch_size;
|
||||
if (endIdx > w->n_reads)
|
||||
endIdx = w->n_reads;
|
||||
|
||||
int id = 0, i = 0, j = 0, k = 0;
|
||||
|
||||
for (id = startIdx; id < endIdx; id += 2) {
|
||||
// 初始化变量
|
||||
bseq1_t* s = &w->seqs[id];
|
||||
mem_alnreg_v* a = &w->regs[id];
|
||||
seq_sam_t* ss = &w->sams[id];
|
||||
int z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
|
||||
int i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
|
||||
kstring_t str;
|
||||
mem_aln_t h[2], g[2], aa[2][2];
|
||||
|
||||
// int cmp = strcmp("ERR194147.17699", s[0].name);
|
||||
|
||||
str.l = str.m = 0;
|
||||
str.s = 0;
|
||||
memset(h, 0, sizeof(mem_aln_t) * 2);
|
||||
memset(g, 0, sizeof(mem_aln_t) * 2);
|
||||
n_aa[0] = n_aa[1] = 0;
|
||||
|
||||
int n[2] = {0};
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int si = !i;
|
||||
// 这里应该先给task排序,因为u8和i16是分开排序的,需要合在一起
|
||||
for (k = 0; k < s[si].msw.n; ++k) {
|
||||
msw_task_t* t = s[si].msw.a[k];
|
||||
// kswr_avx_t aln = t->aln; fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id + w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb);
|
||||
|
||||
n[i] += check_add_align(t->aln, opt->min_seed_len, t->is_rev, bns->l_pac, &a[!si].a[t->aj], s[si].l_seq, (uint8_t*)s[si].seq, &a[si], t->rb);
|
||||
}
|
||||
}
|
||||
// 处理完2个pair read之后再排序,因为上面操作有插入,排序后之前记录的索引就无效了,所以上面不能排序,要两个pair read处理完之后再排序
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int tmp;
|
||||
if (n[i] > 0) {
|
||||
for (j = 0; j < n[i]; ++j) { // 在这里排序
|
||||
// move b s.t. ma is sorted
|
||||
int nidx = a[i].n - n[i] + j;
|
||||
mem_alnreg_t* b = &a[i].a[nidx];
|
||||
for (k = 0; k < nidx; ++k) // find the insertion point
|
||||
if (a[i].a[k].score < b->score)
|
||||
break;
|
||||
tmp = k;
|
||||
for (k = nidx; k > tmp; --k) a[i].a[k] = a[i].a[k - 1];
|
||||
a[i].a[k] = *b;
|
||||
}
|
||||
|
||||
a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a);
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id << 1 | 0);
|
||||
n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id << 1 | 1);
|
||||
if (opt->flag & MEM_F_PRIMARY5) {
|
||||
|
|
@ -543,7 +501,7 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
}
|
||||
}
|
||||
ss[0].sam.l = 0;
|
||||
//PROF_START(aln2sam);
|
||||
// PROF_START(aln2sam);
|
||||
for (i = 0; i < n_aa[0]; ++i) mem_aln2sam(opt, bns, &ss[0].sam, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
|
||||
ss[1].sam.l = 0;
|
||||
for (i = 0; i < n_aa[1]; ++i) mem_aln2sam(opt, bns, &ss[1].sam, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
|
||||
|
|
@ -561,7 +519,9 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
} else
|
||||
goto no_pairing;
|
||||
|
||||
no_pairing:
|
||||
goto end_clear;
|
||||
|
||||
no_pairing:
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int which = -1;
|
||||
if (a[i].n) {
|
||||
|
|
@ -590,11 +550,87 @@ static void workder_gen_sam(void* data, int idx, int tid) {
|
|||
free(h[0].cigar);
|
||||
free(h[1].cigar);
|
||||
|
||||
end_clear:
|
||||
_destory_clear_vec(s[0].msw);
|
||||
_destory_clear_vec(s[1].msw);
|
||||
|
||||
free(a[0].a);
|
||||
free(a[1].a);
|
||||
}
|
||||
|
||||
// Final stage: compute and generate the SAM data for one batch of reads.
//
// data: worker context, cast to mem_worker_t (opt, bns, pac, pes, seqs, regs, sams, n_reads, n_processed are read from it).
// idx:  batch index; the batch covers reads [idx * batch_size, min((idx + 1) * batch_size, n_reads)).
// tid:  only forwarded to generate_sam() (presumably the worker thread id -- confirm against the caller).
//
// Reads are visited two at a time, so s[0]/s[1], a[0]/a[1] and ss[0]/ss[1] are the two reads of a pair.
// For every pair the function
//   1. applies the stored mate-SW task results with check_add_align(), unless alignments added
//      earlier in this same step already cover all four orientations for that task,
//   2. moves the newly appended alignments into score order and calls mem_sort_dedup_patch(),
//   3. calls generate_sam() for the pair.
|
||||
static void workder_gen_sam(void* data, int idx, int tid) {
|
||||
mem_worker_t* w = (mem_worker_t*)data;
|
||||
const mem_opt_t* opt = w->opt;
|
||||
const bntseq_t* bns = w->bns;
|
||||
const uint8_t* pac = w->pac;
|
||||
const mem_pestat_t *pes = w->pes;
|
||||

||||
// Half-open read range of this batch, clamped to the number of reads.
int startIdx = idx * opt->batch_size;
|
||||
int endIdx = (idx + 1) * opt->batch_size;
|
||||
if (endIdx > w->n_reads)
|
||||
endIdx = w->n_reads;
|
||||

||||
int id = 0, i = 0, j = 0, k = 0;
|
||||

||||
// id advances by 2: s[0] and s[1] are accessed as the two reads of one pair.
for (id = startIdx; id < endIdx; id += 2) {
|
||||
// Initialize variables
|
||||
bseq1_t* s = &w->seqs[id];
|
||||
mem_alnreg_v* a = &w->regs[id];
|
||||
seq_sam_t* ss = &w->sams[id];
|
||||

||||
// int cmp = strcmp("ERR194147.1548", s[0].name);
|
||||

||||
// n[x]:  sum of check_add_align() return values for read x (used below as the number of
//        entries appended to a[x] -- presumably 0 or 1 per call; confirm in check_add_align).
// nn[x]: number of tasks of read x that were actually applied (not skipped).
int n[2] = {0}, nn[2] = {0};
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int si = !i;
|
||||
// Size of a[si] before any task of this read is applied; entries from here on are new.
int origin_n = a[si].n;
|
||||
// The tasks should be sorted first here: the u8 and i16 tasks are sorted separately and need to be merged
|
||||
for (k = 0; k < s[si].msw.n; ++k) {
|
||||
msw_task_t* t = s[si].msw.a[k];
|
||||
// b: the alignment of the other read (index i) that this task refers to via t->aj.
mem_alnreg_t* b = &a[i].a[t->aj];
|
||||
// ma: the alignment list of the read that owns the task; new alignments are added here.
mem_alnreg_v* ma = &a[si];
|
||||
// Orientation bits (1 << r) found among the alignments added since origin_n.
uint8_t skip = 0;
|
||||
if (n[si]) { // new alns have been added, so skip has to be re-checked
|
||||
for (j = origin_n; j < ma->n; ++j) { // check which orientation has been found
|
||||
int64_t dist;
|
||||
int r;
|
||||
r = mem_infer_dir(bns->l_pac, b->rb, ma->a[j].rb, &dist);
|
||||
if (dist >= pes[r].low && dist <= pes[r].high)
|
||||
skip |= 1 << r;
|
||||
}
|
||||
}
|
||||
// 15 == all four orientation bits set, i.e. nothing is left for this task to do.
if ((t->skip | skip) != 15) { // check whether the newly added alns cause the current task to be skipped
|
||||
// NOTE(review): this trace to gf[0] is unconditional debug output -- confirm it is meant to stay enabled.
kswr_avx_t aln = t->aln; fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id +
|
||||
w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb);
|
||||
nn[si] += 1;
|
||||
n[si] += check_add_align(t->aln, opt->min_seed_len, t->is_rev, bns->l_pac, b, s[si].l_seq, (uint8_t*)s[si].seq, ma, t->rb);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Sort only after both reads of the pair have been processed: the step above inserts entries, and sorting would invalidate the indices recorded earlier, so sorting must not happen above
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int tmp;
|
||||
// Gated on nn (tasks applied), while the loop below runs over n (entries appended):
// mem_sort_dedup_patch() is therefore called even when tasks ran but appended nothing.
if (nn[i] > 0) {
|
||||
for (j = 0; j < n[i]; ++j) { // sort here
|
||||
// move b s.t. ma is sorted
|
||||
// nidx walks over the last n[i] entries of a[i], i.e. the ones appended above.
int nidx = a[i].n - n[i] + j;
|
||||
mem_alnreg_t b = a[i].a[nidx]; // be very careful with pointers: the contents at the pointed-to address change, hence the copy
|
||||
for (k = 0; k < nidx; ++k) // find the insertion point
|
||||
if (a[i].a[k].score < b.score)
|
||||
break;
|
||||
tmp = k;
|
||||
// Shift [tmp, nidx) one slot to the right, then drop the copy into slot tmp.
for (k = nidx; k > tmp; --k) a[i].a[k] = a[i].a[k - 1];
|
||||
a[i].a[k] = b;
|
||||
}
|
||||

||||
a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a);
|
||||
}
|
||||
}
|
||||

||||
////////////////////////////////////////////////////////////
|
||||

||||
// id is a read index here, so id >> 1 is passed as the pair index.
generate_sam(opt, bns, pac, pes, id >> 1, s, a, ss, w->n_processed, tid);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
6
run.sh
6
run.sh
|
|
@ -1,6 +1,6 @@
|
|||
thread=64
|
||||
thread=1
|
||||
|
||||
make -j 16
|
||||
make clean; make -j 32
|
||||
|
||||
n1=~/data/dataset/D2/n1.fq.gz
|
||||
n2=~/data/dataset/D2/n2.fq.gz
|
||||
|
|
@ -11,7 +11,7 @@ n2=~/data/dataset/D2/n2.fq.gz
|
|||
reference=~/data/reference/fmt/human_g1k_v37_decoy.fasta
|
||||
|
||||
out=/dev/null
|
||||
#out=~/data/D2-out.sam
|
||||
#out=./oldsam-D2-out.sam
|
||||
prog=./fastalign
|
||||
#prog=/home/zzh/fastbwa
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue