diff --git a/Makefile b/Makefile index 00de261..be7c931 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ CC= gcc CFLAGS= -g -Wall -O2 -Wc++-compat -Wno-unused-function CPPFLAGS= -DHAVE_KALLOC INCLUDES= -I. -OBJS= kalloc.o kthread.o misc.o bseq.o sketch.o chain.o align.o sdust.o \ +OBJS= kalloc.o kthread.o misc.o bseq.o sketch.o chain.o align.o hit.o sdust.o \ index.o format.o map.o ksw2_extz2_sse.o PROG= minimap2 PROG_EXTRA= sdust @@ -42,6 +42,7 @@ align.o: minimap.h mmpriv.h bseq.h ksw2.h bseq.o: bseq.h kseq.h chain.o: minimap.h mmpriv.h bseq.h kalloc.h format.o: mmpriv.h minimap.h bseq.h +hit.o: mmpriv.h minimap.h bseq.h kalloc.h index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h kalloc.o: kalloc.h ksw2_extz2_sse.o: ksw2.h diff --git a/hit.c b/hit.c new file mode 100644 index 0000000..2df04a4 --- /dev/null +++ b/hit.c @@ -0,0 +1,156 @@ +#include +#include +#include "mmpriv.h" +#include "kalloc.h" + +mm_reg1_t *mm_gen_regs(int qlen, int n_u, uint64_t *u, mm128_t *a) // convert chains to hits +{ + mm_reg1_t *r; + int i, k; + r = (mm_reg1_t*)calloc(n_u, sizeof(mm_reg1_t)); + for (i = k = 0; i < n_u; ++i) { + mm_reg1_t *ri = &r[i]; + ri->id = i; + ri->parent = MM_PARENT_UNSET; + ri->subsc = 0; + ri->score = u[i]>>32; + ri->cnt = (int32_t)u[i]; + ri->as = k; + mm_reg_set_coor(ri, qlen, a); + k += ri->cnt; + } + return r; +} + +void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r) // and compute mm_reg1_t::subsc +{ + int i, j, k, *w; + if (n <= 0) return; + w = (int*)kmalloc(km, n * sizeof(int)); + w[0] = 0, r[0].parent = 0; + for (i = 1, k = 1; i < n; ++i) { + int si = r[i].qs, ei = r[i].qe; + for (j = 0; j < k; ++j) { + int sj = r[w[j]].qs, ej = r[w[j]].qe; + int min = ej - sj < ei - si? ej - sj : ei - si; + int ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); + if (ol > mask_level * min) { + r[i].parent = r[w[j]].parent; + if (r[w[j]].subsc < r[i].score) + r[w[j]].subsc = r[i].score; + break; + } + } + if (j == k) w[k++] = i, r[i].parent = i; + } + kfree(km, w); +} + +void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs) // keep mm_reg1_t::{id,parent} in sync; also reset id +{ + int *tmp, i, max_id = -1, n_tmp; + if (n_regs <= 0) return; + for (i = 0; i < n_regs; ++i) + max_id = max_id > regs[i].id? max_id : regs[i].id; + n_tmp = max_id + 1; + tmp = (int*)kmalloc(km, n_tmp * sizeof(int)); + for (i = 0; i < n_tmp; ++i) tmp[i] = -1; + for (i = 0; i < n_regs; ++i) + if (regs[i].id >= 0) tmp[regs[i].id] = i; + for (i = 0; i < n_regs; ++i) { + mm_reg1_t *r = ®s[i]; + r->id = i; + if (r->parent == MM_PARENT_TMP_PRI) + r->parent = i; + else if (r->parent >= 0 && tmp[r->parent] >= 0) + r->parent = tmp[r->parent]; + else r->parent = MM_PARENT_UNSET; + } + kfree(km, tmp); +} + +void mm_select_sub(void *km, float mask_level, float pri_ratio, int *n_, mm_reg1_t *r) +{ + if (pri_ratio > 0.0f && *n_ > 0) { + int i, k, n = *n_; + mm_set_parent(km, mask_level, n, r); + for (i = k = 0; i < n; ++i) + if (r[i].parent == i || r[i].score >= r[r[i].parent].score * pri_ratio) + r[k++] = r[i]; + if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync() + *n_ = k; + } +} + +int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a) +{ // squeeze out regions in a[] that are not referenced by regs[] + int i, n_aux, as = 0; + uint64_t *aux; + aux = (uint64_t*)kmalloc(km, n_regs * 8); + for (i = n_aux = 0; i < n_regs; ++i) + aux[n_aux++] = (uint64_t)regs[i].as << 32 | i; + for (i = 0; i < n_regs; ++i) { + mm_reg1_t *r = ®s[i]; + if (r->as != as) { + memmove(&a[as], &a[r->as], r->cnt * 16); + r->as = as; + } + as += r->cnt; + } + kfree(km, aux); + return as; +} + +void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int n_regs, mm_reg1_t *regs, mm128_t *a) +{ + int i, n_aux; + uint64_t *aux; + + if (n_regs < 2) return; // nothing to join + mm_squeeze_a(km, n_regs, regs, a); + + aux = (uint64_t*)kmalloc(km, n_regs * 8); + for (i = n_aux = 0; i < n_regs; ++i) + if (regs[i].parent == i || regs[i].parent < 0) + aux[n_aux++] = (uint64_t)regs[i].as << 32 | i; + radix_sort_64(aux, aux + n_aux); + + for (i = n_aux - 1; i >= 1; --i) { + mm_reg1_t *r0 = ®s[(int32_t)aux[i-1]], *r1 = ®s[(int32_t)aux[i]]; + mm128_t *a0e, *a1s; + int max_gap, min_gap; + + // test + if (r0->as + r0->cnt != r1->as) continue; // not adjacent in a[] + if (r0->rid != r1->rid || r0->rev != r1->rev) continue; // make sure on the same target and strand + if (r0->score < opt->min_join_flank_sc || r1->score < opt->min_join_flank_sc) continue; // require good flanking chains + a0e = &a[r0->as + r0->cnt - 1]; + a1s = &a[r1->as]; + if (a1s->x <= a0e->x || (int32_t)a1s->y <= (int32_t)a0e->y) continue; // keep colinearity + max_gap = min_gap = (int32_t)a1s->y - (int32_t)a0e->y; + max_gap = max_gap > a1s->x - a0e->x? max_gap : a1s->x - a0e->x; + min_gap = min_gap < a1s->x - a0e->x? min_gap : a1s->x - a0e->x; + if (max_gap > opt->max_join_long || min_gap > opt->max_join_short) continue; + + // all conditions satisfied; join + a[r1->as].y |= 1ULL<<40; + r0->cnt += r1->cnt, r0->score += r1->score; + mm_reg_set_coor(r0, qlen, a); + r1->cnt = 0; + } + kfree(km, aux); +} + +void mm_set_mapq(int n_regs, mm_reg1_t *regs) +{ + int i; + for (i = 0; i < n_regs; ++i) { + mm_reg1_t *r = ®s[i]; + if (r->parent == r->id) { + int mapq; + mapq = (int)(30.0 * (1. - (float)r->subsc / r->score) * logf(r->score)); + mapq = mapq > 0? mapq : 0; + r->mapq = mapq < 60? mapq : 60; + } else r->mapq = 0; + } +} diff --git a/map.c b/map.c index 815c8da..713ddff 100644 --- a/map.c +++ b/map.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include "kthread.h" #include "kvec.h" #include "kalloc.h" @@ -164,137 +164,6 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2) // printf("%d,%d; %d,%d\n", m[0]->qpos>>1, m[1]->qpos>>1, m[0]->n, m[1]->n); } #endif -mm_reg1_t *mm_gen_reg(int qlen, int n_u, uint64_t *u, mm128_t *a) -{ - mm_reg1_t *r; - int i, k; - r = (mm_reg1_t*)calloc(n_u, sizeof(mm_reg1_t)); - for (i = k = 0; i < n_u; ++i) { - mm_reg1_t *ri = &r[i]; - ri->id = i; - ri->parent = -1, ri->subsc = 0; - ri->score = u[i]>>32; - ri->cnt = (int32_t)u[i]; - ri->as = k; - mm_reg_set_coor(ri, qlen, a); - k += ri->cnt; - } - return r; -} - -static void mm_set_subsc(float mask_level, int n, mm_reg1_t *r, void *km) -{ - int i, j, k, *w; - if (n <= 0) return; - w = (int*)kmalloc(km, n * sizeof(int)); - w[0] = 0, r[0].parent = 0; - for (i = 1, k = 1; i < n; ++i) { - int si = r[i].qs, ei = r[i].qe; - for (j = 0; j < k; ++j) { - int sj = r[w[j]].qs, ej = r[w[j]].qe; - int min = ej - sj < ei - si? ej - sj : ei - si; - int ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); - if (ol > mask_level * min) { - r[i].parent = r[w[j]].parent; - if (r[w[j]].subsc < r[i].score) - r[w[j]].subsc = r[i].score; - break; - } - } - if (j == k) w[k++] = i, r[i].parent = i; - } - kfree(km, w); -} - -void mm_select_sub(float mask_level, float pri_ratio, int *n_, mm_reg1_t *r, void *km) -{ - if (pri_ratio > 0.0f && *n_ > 0) { - int i, k, *tmp, n = *n_; - mm_set_subsc(mask_level, n, r, km); - tmp = (int*)kmalloc(km, n * sizeof(int)); - for (i = 0; i < n; ++i) tmp[i] = -1; - for (i = k = 0; i < n; ++i) - if (r[i].parent == i || r[i].score >= r[r[i].parent].score * pri_ratio) - tmp[i] = k, r[k++] = r[i]; - n = k; - for (i = 0; i < n; ++i) // remap mm_reg1_t::parent, as some mappings may have been dropped - if (tmp[r[i].parent] >= 0) - r[i].parent = tmp[r[i].parent]; - kfree(km, tmp); - *n_ = n; - for (i = 0; i < n; ++i) r[i].id = i; // reset mm_reg1_t::id - } -} - -static int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a) -{ // squeeze out regions in a[] that are not referenced by regs[] - int i, n_aux, as = 0; - uint64_t *aux; - aux = (uint64_t*)kmalloc(km, n_regs * 8); - for (i = n_aux = 0; i < n_regs; ++i) - aux[n_aux++] = (uint64_t)regs[i].as << 32 | i; - for (i = 0; i < n_regs; ++i) { - mm_reg1_t *r = ®s[i]; - if (r->as != as) { - memmove(&a[as], &a[r->as], r->cnt * 16); - r->as = as; - } - as += r->cnt; - } - kfree(km, aux); - return as; -} - -void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int n_regs, mm_reg1_t *regs, mm128_t *a) -{ - int i, n_aux; - uint64_t *aux; - - if (n_regs < 2) return; // nothing to join - mm_squeeze_a(km, n_regs, regs, a); - - aux = (uint64_t*)kmalloc(km, n_regs * 8); - for (i = n_aux = 0; i < n_regs; ++i) - if (regs[i].parent == i || regs[i].parent < 0) - aux[n_aux++] = (uint64_t)regs[i].as << 32 | i; - radix_sort_64(aux, aux + n_aux); - - for (i = n_aux - 1; i >= 1; --i) { - mm_reg1_t *r0 = ®s[(int32_t)aux[i-1]], *r1 = ®s[(int32_t)aux[i]]; - mm128_t *a0e, *a1s; - int max_gap, min_gap; - - // test - if (r0->as + r0->cnt != r1->as) continue; // not adjacent in a[] - if (r0->rid != r1->rid || r0->rev != r1->rev) continue; // make sure on the same target and strand - if (r0->score < opt->min_join_flank_sc || r1->score < opt->min_join_flank_sc) continue; // require good flanking chains - a0e = &a[r0->as + r0->cnt - 1]; - a1s = &a[r1->as]; - if (a1s->x <= a0e->x || (int32_t)a1s->y <= (int32_t)a0e->y) continue; // keep colinearity - max_gap = min_gap = (int32_t)a1s->y - (int32_t)a0e->y; - max_gap = max_gap > a1s->x - a0e->x? max_gap : a1s->x - a0e->x; - min_gap = min_gap < a1s->x - a0e->x? min_gap : a1s->x - a0e->x; - if (max_gap > opt->max_join_long || min_gap > opt->max_join_short) continue; - - // all conditions satisfied; join - a[r1->as].y |= 1ULL<<40; - r0->cnt += r1->cnt, r0->score += r1->score; - mm_reg_set_coor(r0, qlen, a); - r1->cnt = 0; - } - kfree(km, aux); -} - -void mm_set_mapq(mm_reg1_t *r, int which) -{ - if (r->parent == which) { - int mapq; - mapq = (int)(30.0 * (1. - (float)r->subsc / r->score) * logf(r->score)); - mapq = mapq > 0? mapq : 0; - r->mapq = mapq < 60? mapq : 60; - } else r->mapq = 0; -} - mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen, const char *seq, int *n_regs) { int i, n = m_en - m_st, j, n_u; @@ -366,15 +235,15 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, kfree(b->km, m); n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km); - regs = mm_gen_reg(qlen, n_u, u, a); + regs = mm_gen_regs(qlen, n_u, u, a); *n_regs = n_u; if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap - mm_select_sub(opt->mask_level, opt->pri_ratio, n_regs, regs, b->km); + mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, n_regs, regs); mm_join_long(b->km, opt, qlen, *n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle } if (opt->flag & MM_F_CIGAR) regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); - for (i = 0; i < *n_regs; ++i) mm_set_mapq(®s[i], i); + mm_set_mapq(*n_regs, regs); // free kfree(b->km, a); diff --git a/mmpriv.h b/mmpriv.h index 4680d52..5b6ec57 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -5,6 +5,9 @@ #include "minimap.h" #include "bseq.h" +#define MM_PARENT_UNSET (-1) +#define MM_PARENT_TMP_PRI (-2) + #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif @@ -33,6 +36,12 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, bseq1_t *t, int which, mm_re int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); +mm_reg1_t *mm_gen_regs(int qlen, int n_u, uint64_t *u, mm128_t *a); +void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs); +void mm_select_sub(void *km, float mask_level, float pri_ratio, int *n_, mm_reg1_t *r); +void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int n_regs, mm_reg1_t *regs, mm128_t *a); +void mm_set_mapq(int n_regs, mm_reg1_t *regs); + #ifdef __cplusplus } #endif