diff --git a/main.c b/main.c index cb85276..dca31d3 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.2-r444-dirty" +#define MM_VERSION "2.2-r446-dirty" #ifdef __linux__ #include diff --git a/map.c b/map.c index a997f43..ed480a5 100644 --- a/map.c +++ b/map.c @@ -13,7 +13,6 @@ void mm_mapopt_init(mm_mapopt_t *opt) memset(opt, 0, sizeof(mm_mapopt_t)); opt->mid_occ_frac = 2e-4f; opt->sdust_thres = 0; // no SDUST masking - opt->pe_ori = 0; // FF opt->min_cnt = 3; opt->min_chain_score = 40; @@ -35,6 +34,9 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_dp_max = opt->min_chain_score * opt->a; opt->min_ksw_len = 200; opt->mini_batch_size = 200000000; + + opt->pe_ori = 0; // FF + opt->pe_bonus = 33; } void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) @@ -236,7 +238,7 @@ static void chain_post(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b); if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); - else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, opt->max_gap_ref, mi->k*2, opt->best_n * n_segs, n_segs, qlens, n_regs, regs); + else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, opt->max_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs); if (!(opt->flag & MM_F_SPLICE) && !(opt->flag & MM_F_SR)) mm_join_long(km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle } @@ -304,6 +306,8 @@ void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char * mm_set_mapq(n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len); } mm_seg_free(b->km, n_segs, seg); + if (n_segs == 2 && opt->pe_ori >= 0) + mm_pair(b->km, max_gap_ref, opt->pe_bonus, qlens, n_regs, regs); } kfree(b->km, a); diff --git a/minimap.h b/minimap.h index 4b2d9a6..5c379e6 100644 --- a/minimap.h +++ b/minimap.h @@ -68,6 +68,7 @@ typedef struct { int32_t as; // offset in the a[] array (for internal uses only) int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate) uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // mapQ; split pattern; if SAM primary; number of suboptimal mappings + uint64_t hash; mm_extra_t *p; } mm_reg1_t; @@ -81,7 +82,6 @@ typedef struct { typedef struct { int sdust_thres; // score threshold for SDUST; 0 to disable int flag; // see MM_F_* macros - int pe_ori; int bw; // bandwidth int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window @@ -102,6 +102,8 @@ typedef struct { int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold int min_ksw_len; + int pe_ori, pe_bonus; + float mid_occ_frac; // only used by mm_mapopt_update(); see below int32_t mid_occ; // ignore seeds with occurrences above this threshold int mini_batch_size; // size of a batch of query bases to process in parallel diff --git a/mmpriv.h b/mmpriv.h index 8208d77..7f9d8d4 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -80,6 +80,7 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, in mm_seg_t *mm_seg_gen(void *km, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a); void mm_seg_free(void *km, int n_segs, mm_seg_t *segs); +void mm_pair(void *km, int max_gap_ref, int dp_bonus, const int *qlens, int *n_regs, mm_reg1_t **regs); #ifdef __cplusplus } diff --git a/pe.c b/pe.c index 2c79106..90f2e8f 100644 --- a/pe.c +++ b/pe.c @@ -1,5 +1,6 @@ #include #include "mmpriv.h" +#include "kalloc.h" void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r) { @@ -43,29 +44,83 @@ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int #include "ksort.h" typedef struct { - int s; + int s, rev; uint64_t key; mm_reg1_t *r; } pair_arr_t; #define sort_key_pair(a) ((a).key) KRADIX_SORT_INIT(pair, pair_arr_t, sort_key_pair, 8) -/* -void mm_pair(void *km, int max_gap_ref, int *n_regs, mm_reg1_t **regs) + +void mm_pair(void *km, int max_gap_ref, int pe_bonus, const int *qlens, int *n_regs, mm_reg1_t **regs) { - int i, j, s, n, last = -1; + int i, j, s, n, last[2], dp_thres, segs = 0, n_pairs = 0, max_idx[2]; + int64_t max, max2; pair_arr_t *a; + a = (pair_arr_t*)kmalloc(km, (n_regs[0] + n_regs[1]) * sizeof(pair_arr_t)); - for (s = n = 0; s < 2; ++s) + for (s = n = 0, dp_thres = 0; s < 2; ++s) { + int max = 0; for (i = 0; i < n_regs[s]; ++i) { a[n].s = s; a[n].r = ®s[s][i]; - a[n++].key = (uint64_t)a[n].r->rid << 32 | a[n].r->rs; + a[n].rev = a[n].r->rev; + a[n].key = (uint64_t)a[n].r->rid << 32 | a[n].r->rs<<1 | (s^a[n].rev); + max = max > a[n].r->p->dp_max? max : a[n].r->p->dp_max; + ++n; + segs |= 1<rev : a[i].r->rev; + dp_thres += max; } + if (segs != 3) { + kfree(km, a); // only one end is mapped + return; + } + dp_thres -= pe_bonus; + if (dp_thres < 0) dp_thres = 0; + radix_sort_pair(a, a + n); + + max = max2 = -1; + max_idx[0] = max_idx[1] = -1; + last[0] = last[1] = -1; + for (i = 0; i < n; ++i) { + if (a[i].key & 1) { // reverse first read or forward second read + mm_reg1_t *q, *r; + if (last[a[i].rev] < 0) continue; + r = a[i].r; + q = a[last[a[i].rev]].r; + if (r->rid != q->rid || r->rs - q->re > max_gap_ref) continue; + for (j = last[a[i].rev]; j >= 0; --j) { + int sum; + if (a[j].rev != a[i].rev || a[j].s == a[i].s) continue; + q = a[j].r; + if (r->rid != q->rid || r->rs - q->re > max_gap_ref) break; + if (r->p->dp_max + q->p->dp_max < dp_thres) continue; + ++n_pairs; + sum = r->p->dp_max + q->p->dp_max; + if (sum > max) { + max_idx[a[j].s] = j, max_idx[a[i].s] = i; + max2 = max; + max = sum; + } else if (sum > max2) + max2 = sum; + } + } else { // forward first read or reverse second read + last[a[i].rev] = i; + } + } + + if (max > 0) { + for (s = 0; s < 2; ++s) { + mm_reg1_t *r = a[max_idx[s]].r; + if (r->id != r->parent) { + mm_reg1_t *p = ®s[s][r->parent]; + for (i = 0; i < n_regs[s]; ++i) + if (regs[s][i].parent == p->id) + regs[s][i].parent = r->id; + } + } + } + kfree(km, a); } -*/