diff --git a/Makefile b/Makefile index 5351642..33673f8 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CC= gcc -CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw #-O3 +CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw -O3 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF diff --git a/mate_sw.h b/mate_sw.h index 12a42ba..4f6b483 100644 --- a/mate_sw.h +++ b/mate_sw.h @@ -16,7 +16,8 @@ // 需要做mate sw的read typedef struct { - int is_rev; // seq是否在反向互补链上 + uint8_t is_rev; // seq是否在反向互补链上 + uint8_t skip; // 记录任务创建时候的skip状态 int xtra; int64_t rb, re; // ref的起始截止位置,左闭右开 int64_t seq_id; // 对应的当前数据块的seq id diff --git a/paired_sam.c b/paired_sam.c index bff1b8e..43698ec 100644 --- a/paired_sam.c +++ b/paired_sam.c @@ -93,7 +93,7 @@ static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const ui return; // consistent pair exist; no need to perform SW int task_order = 0; for (r = 0; r < 4; ++r) { - int is_rev, is_larger; + uint8_t is_rev, is_larger; int64_t rb, re; // 左闭右开 if (skip[r]) continue; @@ -124,6 +124,7 @@ static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const ui else p = kv_pushp(msw_task_t, *msw16); p->is_rev = is_rev; + p->skip = skip[0] | skip[1] << 1 | skip[2] << 2 | skip[3] << 3; p->xtra = xtra; p->rb = rb; p->re = re; @@ -396,6 +397,167 @@ static int check_add_align(kswr_avx_t aln, int min_seed_len, int is_rev, int64_t #define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) +// 根据比对结果生成sam +void generate_sam(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], + seq_sam_t ss[2], int64_t n_processed, int tid) { + + int i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2]; + kstring_t str; + mem_aln_t h[2], g[2], aa[2][2]; + + // int cmp = strcmp("ERR194147.17699", s[0].name); + + str.l = str.m = 0; + str.s = 0; + memset(h, 0, sizeof(mem_aln_t) * 2); + memset(g, 0, sizeof(mem_aln_t) * 2); + n_aa[0] = n_aa[1] = 0; + n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id << 1 | 0); + n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id << 1 | 1); + if (opt->flag & MEM_F_PRIMARY5) { + mem_reorder_primary5(opt->T, &a[0]); + mem_reorder_primary5(opt->T, &a[1]); + } + if (opt->flag & MEM_F_NOPAIRING) + goto no_pairing; + // pairing single-end hits + if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) { + int is_multi[2], q_pe, score_un, q_se[2]; + char** XA[2]; + // check if an end has multiple hits even after mate-SW + for (i = 0; i < 2; ++i) { + for (j = 1; j < n_pri[i]; ++j) + if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) + break; + is_multi[i] = j < n_pri[i] ? 1 : 0; + } + if (is_multi[0] || is_multi[1]) + goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score + // compute mapQ for the best SE hit + score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + // q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; + subo = subo > score_un ? subo : score_un; + q_pe = raw_mapq(o - subo, opt->a); + if (n_sub > 0) + q_pe -= (int)(4.343 * log(n_sub + 1) + .499); + if (q_pe < 0) + q_pe = 0; + if (q_pe > 60) + q_pe = 60; + q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499); + // the following assumes no split hits + if (o > score_un) { // paired alignment is preferred + mem_alnreg_t* c[2]; + c[0] = &a[0].a[z[0]]; + c[1] = &a[1].a[z[1]]; + for (i = 0; i < 2; ++i) { + if (c[i]->secondary >= 0) + c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; + q_se[i] = mem_approx_mapq_se(opt, c[i]); + } + q_se[0] = q_se[0] > q_pe ? q_se[0] : q_pe < q_se[0] + 40 ? q_pe : q_se[0] + 40; + q_se[1] = q_se[1] > q_pe ? q_se[1] : q_pe < q_se[1] + 40 ? q_pe : q_se[1] + 40; + extra_flag |= 2; + // cap at the tandem repeat score + q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a) ? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a); + q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a) ? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a); + } else { // the unpaired alignment is preferred + z[0] = z[1] = 0; + q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); + q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); + } + for (i = 0; i < 2; ++i) { + int k = a[i].a[z[i]].secondary_all; + if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT + assert(a[i].a[k].secondary_all < 0); + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].secondary_all == k || j == k) + a[i].a[j].secondary_all = z[i]; + a[i].a[z[i]].secondary_all = -1; + } + } + + if (!(opt->flag & MEM_F_ALL)) { + for (i = 0; i < 2; ++i) XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq); + } else + XA[0] = XA[1] = 0; + + // write SAM + + for (i = 0; i < 2; ++i) { + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]); + h[i].mapq = q_se[i]; + h[i].flag |= 0x40 << i | extra_flag; + h[i].XA = XA[i] ? XA[i][z[i]] : 0; + aa[i][n_aa[i]++] = h[i]; + if (n_pri[i] < a[i].n) { // the read has ALT hits + mem_alnreg_t* p = &a[i].a[n_pri[i]]; + if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) + continue; + g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p); + g[i].flag |= 0x800 | 0x40 << i | extra_flag; + g[i].XA = XA[i] ? XA[i][n_pri[i]] : 0; + aa[i][n_aa[i]++] = g[i]; + } + } + ss[0].sam.l = 0; + // PROF_START(aln2sam); + for (i = 0; i < n_aa[0]; ++i) mem_aln2sam(opt, bns, &ss[0].sam, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits + ss[1].sam.l = 0; + for (i = 0; i < n_aa[1]; ++i) mem_aln2sam(opt, bns, &ss[1].sam, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits + if (strcmp(s[0].name, s[1].name) != 0) + err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); + // free + for (i = 0; i < 2; ++i) { + free(h[i].cigar); + free(g[i].cigar); + if (XA[i] == 0) + continue; + for (j = 0; j < a[i].n; ++j) free(XA[i][j]); + free(XA[i]); + } + } else + goto no_pairing; + + goto end_clear; + +no_pairing: + for (i = 0; i < 2; ++i) { + int which = -1; + if (a[i].n) { + if (a[i].a[0].score >= opt->T) + which = 0; + else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T) + which = n_pri[i]; + } + if (which >= 0) + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]); + else + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); + } + if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && + h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. + int64_t dist; + int d; + d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); + if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) + extra_flag |= 2; + } + mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41 | extra_flag, &h[1], &ss[0]); + mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81 | extra_flag, &h[0], &ss[1]); + if (strcmp(s[0].name, s[1].name) != 0) + err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); + free(h[0].cigar); + free(h[1].cigar); + +end_clear: + _destory_clear_vec(s[0].msw); + _destory_clear_vec(s[1].msw); + + free(a[0].a); + free(a[1].a); +} + // 最后再计算并生成sam数据 static void workder_gen_sam(void* data, int idx, int tid) { mem_worker_t* w = (mem_worker_t*)data; @@ -416,37 +578,50 @@ static void workder_gen_sam(void* data, int idx, int tid) { bseq1_t* s = &w->seqs[id]; mem_alnreg_v* a = &w->regs[id]; seq_sam_t* ss = &w->sams[id]; - int z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2]; - mem_aln_t h[2], g[2], aa[2][2]; - memset(h, 0, sizeof(mem_aln_t) * 2); - memset(g, 0, sizeof(mem_aln_t) * 2); - n_aa[0] = n_aa[1] = 0; - int n[2] = {0}; + // int cmp = strcmp("ERR194147.1548", s[0].name); + + int n[2] = {0}, nn[2] = {0}; for (i = 0; i < 2; ++i) { int si = !i; + int origin_n = a[si].n; // 这里应该先给task排序,因为u8和i16是分开排序的,需要合在一起 for (k = 0; k < s[si].msw.n; ++k) { msw_task_t* t = s[si].msw.a[k]; - // kswr_avx_t aln = t->aln; fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id + w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb); - - n[i] += check_add_align(t->aln, opt->min_seed_len, t->is_rev, bns->l_pac, &a[!si].a[t->aj], s[si].l_seq, (uint8_t*)s[si].seq, &a[si], t->rb); + mem_alnreg_t* b = &a[i].a[t->aj]; + mem_alnreg_v* ma = &a[si]; + uint8_t skip = 0; + if (n[si]) { // 添加了新的aln,需要检测skip + for (j = origin_n; j < ma->n; ++j) { // check which orinentation has been found + int64_t dist; + int r; + r = mem_infer_dir(bns->l_pac, b->rb, ma->a[j].rb, &dist); + if (dist >= pes[r].low && dist <= pes[r].high) + skip |= 1 << r; + } + } + if ((t->skip | skip) != 15) { // 检查新添加的aln是不是把当前的任务skip掉了 + kswr_avx_t aln = t->aln; fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id + + w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb); + nn[si] += 1; + n[si] += check_add_align(t->aln, opt->min_seed_len, t->is_rev, bns->l_pac, b, s[si].l_seq, (uint8_t*)s[si].seq, ma, t->rb); + } } } // 处理完2个pair read之后再排序,因为上面操作有插入,排序后之前记录的索引就无效了,所以上面不能排序,要两个pair read处理完之后再排序 for (i = 0; i < 2; ++i) { int tmp; - if (n[i] > 0) { + if (nn[i] > 0) { for (j = 0; j < n[i]; ++j) { // 在这里排序 // move b s.t. ma is sorted int nidx = a[i].n - n[i] + j; - mem_alnreg_t* b = &a[i].a[nidx]; + mem_alnreg_t b = a[i].a[nidx]; // 用指针一定要慎重,注意指向的地址内容会变 for (k = 0; k < nidx; ++k) // find the insertion point - if (a[i].a[k].score < b->score) + if (a[i].a[k].score < b.score) break; tmp = k; for (k = nidx; k > tmp; --k) a[i].a[k] = a[i].a[k - 1]; - a[i].a[k] = *b; + a[i].a[k] = b; } a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a); @@ -454,147 +629,8 @@ static void workder_gen_sam(void* data, int idx, int tid) { } //////////////////////////////////////////////////////////// - n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id << 1 | 0); - n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id << 1 | 1); - if (opt->flag & MEM_F_PRIMARY5) { - mem_reorder_primary5(opt->T, &a[0]); - mem_reorder_primary5(opt->T, &a[1]); - } - if (opt->flag & MEM_F_NOPAIRING) - goto no_pairing; - // pairing single-end hits - if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) { - int is_multi[2], q_pe, score_un, q_se[2]; - char** XA[2]; - // check if an end has multiple hits even after mate-SW - for (i = 0; i < 2; ++i) { - for (j = 1; j < n_pri[i]; ++j) - if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) - break; - is_multi[i] = j < n_pri[i] ? 1 : 0; - } - if (is_multi[0] || is_multi[1]) - goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score - // compute mapQ for the best SE hit - score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; - // q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; - subo = subo > score_un ? subo : score_un; - q_pe = raw_mapq(o - subo, opt->a); - if (n_sub > 0) - q_pe -= (int)(4.343 * log(n_sub + 1) + .499); - if (q_pe < 0) - q_pe = 0; - if (q_pe > 60) - q_pe = 60; - q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499); - // the following assumes no split hits - if (o > score_un) { // paired alignment is preferred - mem_alnreg_t* c[2]; - c[0] = &a[0].a[z[0]]; - c[1] = &a[1].a[z[1]]; - for (i = 0; i < 2; ++i) { - if (c[i]->secondary >= 0) - c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; - q_se[i] = mem_approx_mapq_se(opt, c[i]); - } - q_se[0] = q_se[0] > q_pe ? q_se[0] : q_pe < q_se[0] + 40 ? q_pe : q_se[0] + 40; - q_se[1] = q_se[1] > q_pe ? q_se[1] : q_pe < q_se[1] + 40 ? q_pe : q_se[1] + 40; - extra_flag |= 2; - // cap at the tandem repeat score - q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a) ? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a); - q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a) ? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a); - } else { // the unpaired alignment is preferred - z[0] = z[1] = 0; - q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); - q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); - } - for (i = 0; i < 2; ++i) { - int k = a[i].a[z[i]].secondary_all; - if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT - assert(a[i].a[k].secondary_all < 0); - for (j = 0; j < a[i].n; ++j) - if (a[i].a[j].secondary_all == k || j == k) - a[i].a[j].secondary_all = z[i]; - a[i].a[z[i]].secondary_all = -1; - } - } - if (!(opt->flag & MEM_F_ALL)) { - for (i = 0; i < 2; ++i) XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq); - } else - XA[0] = XA[1] = 0; - - // write SAM - - for (i = 0; i < 2; ++i) { - h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]); - h[i].mapq = q_se[i]; - h[i].flag |= 0x40 << i | extra_flag; - h[i].XA = XA[i] ? XA[i][z[i]] : 0; - aa[i][n_aa[i]++] = h[i]; - if (n_pri[i] < a[i].n) { // the read has ALT hits - mem_alnreg_t* p = &a[i].a[n_pri[i]]; - if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) - continue; - g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p); - g[i].flag |= 0x800 | 0x40 << i | extra_flag; - g[i].XA = XA[i] ? XA[i][n_pri[i]] : 0; - aa[i][n_aa[i]++] = g[i]; - } - } - ss[0].sam.l = 0; - //PROF_START(aln2sam); - for (i = 0; i < n_aa[0]; ++i) mem_aln2sam(opt, bns, &ss[0].sam, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits - ss[1].sam.l = 0; - for (i = 0; i < n_aa[1]; ++i) mem_aln2sam(opt, bns, &ss[1].sam, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits - if (strcmp(s[0].name, s[1].name) != 0) - err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); - // free - for (i = 0; i < 2; ++i) { - free(h[i].cigar); - free(g[i].cigar); - if (XA[i] == 0) - continue; - for (j = 0; j < a[i].n; ++j) free(XA[i][j]); - free(XA[i]); - } - } else - goto no_pairing; - - no_pairing: - for (i = 0; i < 2; ++i) { - int which = -1; - if (a[i].n) { - if (a[i].a[0].score >= opt->T) - which = 0; - else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T) - which = n_pri[i]; - } - if (which >= 0) - h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]); - else - h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); - } - if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && - h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. - int64_t dist; - int d; - d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); - if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) - extra_flag |= 2; - } - mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41 | extra_flag, &h[1], &ss[0]); - mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81 | extra_flag, &h[0], &ss[1]); - if (strcmp(s[0].name, s[1].name) != 0) - err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); - free(h[0].cigar); - free(h[1].cigar); - - _destory_clear_vec(s[0].msw); - _destory_clear_vec(s[1].msw); - - free(a[0].a); - free(a[1].a); + generate_sam(opt, bns, pac, pes, id >> 1, s, a, ss, w->n_processed, tid); } } diff --git a/run.sh b/run.sh index 3ada1c6..762d4c4 100755 --- a/run.sh +++ b/run.sh @@ -1,6 +1,6 @@ -thread=64 +thread=1 -make -j 16 +make clean; make -j 32 n1=~/data/dataset/D2/n1.fq.gz n2=~/data/dataset/D2/n2.fq.gz @@ -11,7 +11,7 @@ n2=~/data/dataset/D2/n2.fq.gz reference=~/data/reference/fmt/human_g1k_v37_decoy.fasta out=/dev/null -#out=~/data/D2-out.sam +#out=./oldsam-D2-out.sam prog=./fastalign #prog=/home/zzh/fastbwa