修正了几个bug,但是目前结果还是不一致,还得继续调试

This commit is contained in:
zzh 2026-01-16 02:54:10 +08:00
parent e997699a47
commit fb0cd0f663
4 changed files with 196 additions and 159 deletions

View File

@ -1,5 +1,5 @@
CC= gcc
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw #-O3
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw -O3
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= -DSHOW_PERF

View File

@ -16,7 +16,8 @@
// 需要做mate sw的read
typedef struct {
int is_rev; // seq是否在反向互补链上
uint8_t is_rev; // seq是否在反向互补链上
uint8_t skip; // 记录任务创建时候的skip状态
int xtra;
int64_t rb, re; // ref的起始截止位置左闭右开
int64_t seq_id; // 对应的当前数据块的seq id

View File

@ -93,7 +93,7 @@ static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const ui
return; // consistent pair exist; no need to perform SW
int task_order = 0;
for (r = 0; r < 4; ++r) {
int is_rev, is_larger;
uint8_t is_rev, is_larger;
int64_t rb, re; // 左闭右开
if (skip[r])
continue;
@ -124,6 +124,7 @@ static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const ui
else
p = kv_pushp(msw_task_t, *msw16);
p->is_rev = is_rev;
p->skip = skip[0] | skip[1] << 1 | skip[2] << 2 | skip[3] << 3;
p->xtra = xtra;
p->rb = rb;
p->re = re;
@ -396,6 +397,167 @@ static int check_add_align(kswr_avx_t aln, int min_seed_len, int is_rev, int64_t
#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
// 根据比对结果生成sam
void generate_sam(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2],
seq_sam_t ss[2], int64_t n_processed, int tid) {
int i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
kstring_t str;
mem_aln_t h[2], g[2], aa[2][2];
// int cmp = strcmp("ERR194147.17699", s[0].name);
str.l = str.m = 0;
str.s = 0;
memset(h, 0, sizeof(mem_aln_t) * 2);
memset(g, 0, sizeof(mem_aln_t) * 2);
n_aa[0] = n_aa[1] = 0;
n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id << 1 | 0);
n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id << 1 | 1);
if (opt->flag & MEM_F_PRIMARY5) {
mem_reorder_primary5(opt->T, &a[0]);
mem_reorder_primary5(opt->T, &a[1]);
}
if (opt->flag & MEM_F_NOPAIRING)
goto no_pairing;
// pairing single-end hits
if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
int is_multi[2], q_pe, score_un, q_se[2];
char** XA[2];
// check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) {
for (j = 1; j < n_pri[i]; ++j)
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T)
break;
is_multi[i] = j < n_pri[i] ? 1 : 0;
}
if (is_multi[0] || is_multi[1])
goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
// compute mapQ for the best SE hit
score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
// q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
subo = subo > score_un ? subo : score_un;
q_pe = raw_mapq(o - subo, opt->a);
if (n_sub > 0)
q_pe -= (int)(4.343 * log(n_sub + 1) + .499);
if (q_pe < 0)
q_pe = 0;
if (q_pe > 60)
q_pe = 60;
q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499);
// the following assumes no split hits
if (o > score_un) { // paired alignment is preferred
mem_alnreg_t* c[2];
c[0] = &a[0].a[z[0]];
c[1] = &a[1].a[z[1]];
for (i = 0; i < 2; ++i) {
if (c[i]->secondary >= 0)
c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
q_se[i] = mem_approx_mapq_se(opt, c[i]);
}
q_se[0] = q_se[0] > q_pe ? q_se[0] : q_pe < q_se[0] + 40 ? q_pe : q_se[0] + 40;
q_se[1] = q_se[1] > q_pe ? q_se[1] : q_pe < q_se[1] + 40 ? q_pe : q_se[1] + 40;
extra_flag |= 2;
// cap at the tandem repeat score
q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a) ? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a) ? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
} else { // the unpaired alignment is preferred
z[0] = z[1] = 0;
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
}
for (i = 0; i < 2; ++i) {
int k = a[i].a[z[i]].secondary_all;
if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT
assert(a[i].a[k].secondary_all < 0);
for (j = 0; j < a[i].n; ++j)
if (a[i].a[j].secondary_all == k || j == k)
a[i].a[j].secondary_all = z[i];
a[i].a[z[i]].secondary_all = -1;
}
}
if (!(opt->flag & MEM_F_ALL)) {
for (i = 0; i < 2; ++i) XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);
} else
XA[0] = XA[1] = 0;
// write SAM
for (i = 0; i < 2; ++i) {
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
h[i].mapq = q_se[i];
h[i].flag |= 0x40 << i | extra_flag;
h[i].XA = XA[i] ? XA[i][z[i]] : 0;
aa[i][n_aa[i]++] = h[i];
if (n_pri[i] < a[i].n) { // the read has ALT hits
mem_alnreg_t* p = &a[i].a[n_pri[i]];
if (p->score < opt->T || p->secondary >= 0 || !p->is_alt)
continue;
g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p);
g[i].flag |= 0x800 | 0x40 << i | extra_flag;
g[i].XA = XA[i] ? XA[i][n_pri[i]] : 0;
aa[i][n_aa[i]++] = g[i];
}
}
ss[0].sam.l = 0;
// PROF_START(aln2sam);
for (i = 0; i < n_aa[0]; ++i) mem_aln2sam(opt, bns, &ss[0].sam, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
ss[1].sam.l = 0;
for (i = 0; i < n_aa[1]; ++i) mem_aln2sam(opt, bns, &ss[1].sam, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
if (strcmp(s[0].name, s[1].name) != 0)
err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
// free
for (i = 0; i < 2; ++i) {
free(h[i].cigar);
free(g[i].cigar);
if (XA[i] == 0)
continue;
for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
free(XA[i]);
}
} else
goto no_pairing;
goto end_clear;
no_pairing:
for (i = 0; i < 2; ++i) {
int which = -1;
if (a[i].n) {
if (a[i].a[0].score >= opt->T)
which = 0;
else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
which = n_pri[i];
}
if (which >= 0)
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
else
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
}
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid &&
h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
int64_t dist;
int d;
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high)
extra_flag |= 2;
}
mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41 | extra_flag, &h[1], &ss[0]);
mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81 | extra_flag, &h[0], &ss[1]);
if (strcmp(s[0].name, s[1].name) != 0)
err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar);
free(h[1].cigar);
end_clear:
_destory_clear_vec(s[0].msw);
_destory_clear_vec(s[1].msw);
free(a[0].a);
free(a[1].a);
}
// 最后再计算并生成sam数据
static void workder_gen_sam(void* data, int idx, int tid) {
mem_worker_t* w = (mem_worker_t*)data;
@ -416,37 +578,50 @@ static void workder_gen_sam(void* data, int idx, int tid) {
bseq1_t* s = &w->seqs[id];
mem_alnreg_v* a = &w->regs[id];
seq_sam_t* ss = &w->sams[id];
int z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
mem_aln_t h[2], g[2], aa[2][2];
memset(h, 0, sizeof(mem_aln_t) * 2);
memset(g, 0, sizeof(mem_aln_t) * 2);
n_aa[0] = n_aa[1] = 0;
int n[2] = {0};
// int cmp = strcmp("ERR194147.1548", s[0].name);
int n[2] = {0}, nn[2] = {0};
for (i = 0; i < 2; ++i) {
int si = !i;
int origin_n = a[si].n;
// 这里应该先给task排序因为u8和i16是分开排序的需要合在一起
for (k = 0; k < s[si].msw.n; ++k) {
msw_task_t* t = s[si].msw.a[k];
// kswr_avx_t aln = t->aln; fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id + w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb);
n[i] += check_add_align(t->aln, opt->min_seed_len, t->is_rev, bns->l_pac, &a[!si].a[t->aj], s[si].l_seq, (uint8_t*)s[si].seq, &a[si], t->rb);
mem_alnreg_t* b = &a[i].a[t->aj];
mem_alnreg_v* ma = &a[si];
uint8_t skip = 0;
if (n[si]) { // 添加了新的aln需要检测skip
for (j = origin_n; j < ma->n; ++j) { // check which orinentation has been found
int64_t dist;
int r;
r = mem_infer_dir(bns->l_pac, b->rb, ma->a[j].rb, &dist);
if (dist >= pes[r].low && dist <= pes[r].high)
skip |= 1 << r;
}
}
if ((t->skip | skip) != 15) { // 检查新添加的aln是不是把当前的任务skip掉了
kswr_avx_t aln = t->aln; fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id +
w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb);
nn[si] += 1;
n[si] += check_add_align(t->aln, opt->min_seed_len, t->is_rev, bns->l_pac, b, s[si].l_seq, (uint8_t*)s[si].seq, ma, t->rb);
}
}
}
// 处理完2个pair read之后再排序因为上面操作有插入排序后之前记录的索引就无效了所以上面不能排序要两个pair read处理完之后再排序
for (i = 0; i < 2; ++i) {
int tmp;
if (n[i] > 0) {
if (nn[i] > 0) {
for (j = 0; j < n[i]; ++j) { // 在这里排序
// move b s.t. ma is sorted
int nidx = a[i].n - n[i] + j;
mem_alnreg_t* b = &a[i].a[nidx];
mem_alnreg_t b = a[i].a[nidx]; // 用指针一定要慎重,注意指向的地址内容会变
for (k = 0; k < nidx; ++k) // find the insertion point
if (a[i].a[k].score < b->score)
if (a[i].a[k].score < b.score)
break;
tmp = k;
for (k = nidx; k > tmp; --k) a[i].a[k] = a[i].a[k - 1];
a[i].a[k] = *b;
a[i].a[k] = b;
}
a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a);
@ -454,147 +629,8 @@ static void workder_gen_sam(void* data, int idx, int tid) {
}
////////////////////////////////////////////////////////////
n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id << 1 | 0);
n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id << 1 | 1);
if (opt->flag & MEM_F_PRIMARY5) {
mem_reorder_primary5(opt->T, &a[0]);
mem_reorder_primary5(opt->T, &a[1]);
}
if (opt->flag & MEM_F_NOPAIRING)
goto no_pairing;
// pairing single-end hits
if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
int is_multi[2], q_pe, score_un, q_se[2];
char** XA[2];
// check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) {
for (j = 1; j < n_pri[i]; ++j)
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T)
break;
is_multi[i] = j < n_pri[i] ? 1 : 0;
}
if (is_multi[0] || is_multi[1])
goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
// compute mapQ for the best SE hit
score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
// q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
subo = subo > score_un ? subo : score_un;
q_pe = raw_mapq(o - subo, opt->a);
if (n_sub > 0)
q_pe -= (int)(4.343 * log(n_sub + 1) + .499);
if (q_pe < 0)
q_pe = 0;
if (q_pe > 60)
q_pe = 60;
q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499);
// the following assumes no split hits
if (o > score_un) { // paired alignment is preferred
mem_alnreg_t* c[2];
c[0] = &a[0].a[z[0]];
c[1] = &a[1].a[z[1]];
for (i = 0; i < 2; ++i) {
if (c[i]->secondary >= 0)
c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
q_se[i] = mem_approx_mapq_se(opt, c[i]);
}
q_se[0] = q_se[0] > q_pe ? q_se[0] : q_pe < q_se[0] + 40 ? q_pe : q_se[0] + 40;
q_se[1] = q_se[1] > q_pe ? q_se[1] : q_pe < q_se[1] + 40 ? q_pe : q_se[1] + 40;
extra_flag |= 2;
// cap at the tandem repeat score
q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a) ? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a) ? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
} else { // the unpaired alignment is preferred
z[0] = z[1] = 0;
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
}
for (i = 0; i < 2; ++i) {
int k = a[i].a[z[i]].secondary_all;
if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT
assert(a[i].a[k].secondary_all < 0);
for (j = 0; j < a[i].n; ++j)
if (a[i].a[j].secondary_all == k || j == k)
a[i].a[j].secondary_all = z[i];
a[i].a[z[i]].secondary_all = -1;
}
}
if (!(opt->flag & MEM_F_ALL)) {
for (i = 0; i < 2; ++i) XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);
} else
XA[0] = XA[1] = 0;
// write SAM
for (i = 0; i < 2; ++i) {
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
h[i].mapq = q_se[i];
h[i].flag |= 0x40 << i | extra_flag;
h[i].XA = XA[i] ? XA[i][z[i]] : 0;
aa[i][n_aa[i]++] = h[i];
if (n_pri[i] < a[i].n) { // the read has ALT hits
mem_alnreg_t* p = &a[i].a[n_pri[i]];
if (p->score < opt->T || p->secondary >= 0 || !p->is_alt)
continue;
g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p);
g[i].flag |= 0x800 | 0x40 << i | extra_flag;
g[i].XA = XA[i] ? XA[i][n_pri[i]] : 0;
aa[i][n_aa[i]++] = g[i];
}
}
ss[0].sam.l = 0;
//PROF_START(aln2sam);
for (i = 0; i < n_aa[0]; ++i) mem_aln2sam(opt, bns, &ss[0].sam, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
ss[1].sam.l = 0;
for (i = 0; i < n_aa[1]; ++i) mem_aln2sam(opt, bns, &ss[1].sam, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
if (strcmp(s[0].name, s[1].name) != 0)
err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
// free
for (i = 0; i < 2; ++i) {
free(h[i].cigar);
free(g[i].cigar);
if (XA[i] == 0)
continue;
for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
free(XA[i]);
}
} else
goto no_pairing;
no_pairing:
for (i = 0; i < 2; ++i) {
int which = -1;
if (a[i].n) {
if (a[i].a[0].score >= opt->T)
which = 0;
else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
which = n_pri[i];
}
if (which >= 0)
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
else
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
}
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid &&
h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
int64_t dist;
int d;
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high)
extra_flag |= 2;
}
mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41 | extra_flag, &h[1], &ss[0]);
mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81 | extra_flag, &h[0], &ss[1]);
if (strcmp(s[0].name, s[1].name) != 0)
err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar);
free(h[1].cigar);
_destory_clear_vec(s[0].msw);
_destory_clear_vec(s[1].msw);
free(a[0].a);
free(a[1].a);
generate_sam(opt, bns, pac, pes, id >> 1, s, a, ss, w->n_processed, tid);
}
}

6
run.sh
View File

@ -1,6 +1,6 @@
thread=64
thread=1
make -j 16
make clean; make -j 32
n1=~/data/dataset/D2/n1.fq.gz
n2=~/data/dataset/D2/n2.fq.gz
@ -11,7 +11,7 @@ n2=~/data/dataset/D2/n2.fq.gz
reference=~/data/reference/fmt/human_g1k_v37_decoy.fasta
out=/dev/null
#out=~/data/D2-out.sam
#out=./oldsam-D2-out.sam
prog=./fastalign
#prog=/home/zzh/fastbwa