diff --git a/hit.c b/hit.c index 12fbb01..5a608b4 100644 --- a/hit.c +++ b/hit.c @@ -3,6 +3,7 @@ #include #include "mmpriv.h" #include "kalloc.h" +#include "khash.h" static inline void mm_cal_fuzzy_len(mm_reg1_t *r, const mm128_t *a) { @@ -36,7 +37,19 @@ static inline void mm_reg_set_coor(mm_reg1_t *r, int32_t qlen, const mm128_t *a) mm_cal_fuzzy_len(r, a); } -mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a) // convert chains to hits +static inline uint64_t hash64(uint64_t key) +{ + key = (~key + (key << 21)); + key = key ^ key >> 24; + key = ((key + (key << 3)) + (key << 8)); + key = key ^ key >> 14; + key = ((key + (key << 2)) + (key << 4)); + key = key ^ key >> 28; + key = (key + (key << 31)); + return key; +} + +mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a) // convert chains to hits { mm128_t *z, tmp; mm_reg1_t *r; @@ -47,7 +60,9 @@ mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a) // // sort by score z = (mm128_t*)kmalloc(km, n_u * 16); for (i = k = 0; i < n_u; ++i) { - z[i].x = u[i] >> 32; + uint32_t h; + h = (uint32_t)hash64((hash64(a[k].x) + hash64(a[k].y)) ^ hash); + z[i].x = u[i] ^ h; // u[i] -- higher 32 bits: chain score; lower 32 bits: number of seeds in the chain z[i].y = (uint64_t)k << 32 | (int32_t)u[i]; k += (int32_t)u[i]; } @@ -61,7 +76,8 @@ mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a) // mm_reg1_t *ri = &r[i]; ri->id = i; ri->parent = MM_PARENT_UNSET; - ri->score = z[i].x; + ri->score = z[i].x >> 32; + ri->hash = (uint32_t)z[i].x; ri->cnt = (int32_t)z[i].y; ri->as = z[i].y >> 32; mm_reg_set_coor(ri, qlen, a); @@ -124,24 +140,25 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r) { int32_t i, n_aux, n = *n_regs; - uint64_t *aux; + mm128_t *aux; mm_reg1_t *t; if (n <= 1) return; - aux = (uint64_t*)kmalloc(km, n * 8); + aux = (mm128_t*)kmalloc(km, n * 16); t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t)); for (i = n_aux = 0; i < n; ++i) { if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted) assert(r[i].p); - aux[n_aux++] = (uint64_t)r[i].p->dp_max << 32 | i; + aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash; + aux[n_aux++].y = i; } else if (r[i].p) { free(r[i].p); r[i].p = 0; } } - radix_sort_64(aux, aux + n_aux); + radix_sort_128x(aux, aux + n_aux); for (i = n_aux - 1; i >= 0; --i) - t[n_aux - 1 - i] = r[(int32_t)aux[i]]; + t[n_aux - 1 - i] = r[aux[i].y]; memcpy(r, t, sizeof(mm_reg1_t) * n_aux); *n_regs = n_aux; kfree(km, aux); @@ -294,7 +311,7 @@ void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_r } } -mm_seg_t *mm_seg_gen(void *km, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a) +mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a) { int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0; mm_seg_t *seg; @@ -340,7 +357,7 @@ mm_seg_t *mm_seg_gen(void *km, int n_segs, const int *qlens, int n_regs0, const } } for (s = 0; s < n_segs; ++s) { - regs[s] = mm_gen_regs(km, qlens[s], seg[s].n_u, seg[s].u, seg[s].a); + regs[s] = mm_gen_regs(km, hash, qlens[s], seg[s].n_u, seg[s].u, seg[s].a); n_regs[s] = seg[s].n_u; for (i = 0; i < n_regs[s]; ++i) regs[s][i].seg_split = 1; diff --git a/main.c b/main.c index cf43b51..6b4bcc8 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.2-r460-dirty" +#define MM_VERSION "2.2-r461-dirty" #ifdef __linux__ #include @@ -37,8 +37,9 @@ static struct option long_options[] = { { "cost-non-gt-ag", required_argument, 0, 0 }, { "no-sam-sq", no_argument, 0, 0 }, { "sr", no_argument, 0, 0 }, - { "multi-seg", optional_argument, 0, 0 }, + { "multi", optional_argument, 0, 0 }, { "no-long-join", no_argument, 0, 0 }, + { "seed", required_argument, 0, 0 }, { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -118,7 +119,8 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr else if (c == 0 && long_idx ==15) opt.flag |= MM_F_NO_LJOIN; // --no-long-join - else if (c == 0 && long_idx ==14) { // --multi-seg + else if (c == 0 && long_idx ==16) opt.seed = atoi(optarg); // --seed + else if (c == 0 && long_idx ==14) { // --multi if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) opt.flag |= MM_F_MULTI_SEG; else opt.flag &= ~MM_F_MULTI_SEG; diff --git a/map.c b/map.c index 5492e8c..b6ea3d8 100644 --- a/map.c +++ b/map.c @@ -7,10 +7,12 @@ #include "sdust.h" #include "mmpriv.h" #include "bseq.h" +#include "khash.h" void mm_mapopt_init(mm_mapopt_t *opt) { memset(opt, 0, sizeof(mm_mapopt_t)); + opt->seed = 11; opt->mid_occ_frac = 2e-4f; opt->sdust_thres = 0; // no SDUST masking @@ -243,7 +245,7 @@ static void chain_post(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, opt->max_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs); if (!(opt->flag & MM_F_SPLICE) && !(opt->flag & MM_F_SR) && !(opt->flag & MM_F_NO_LJOIN)) - mm_join_long(km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle + mm_join_long(km, opt, qlen, n_regs, regs, a); } } @@ -262,6 +264,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) { int i, j, max_gap_ref, rep_len, qlen_sum, n_regs0, rechain = 0; + uint32_t hash; int64_t n_a; uint64_t *u; mm128_t *a; @@ -272,6 +275,10 @@ void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char * if (qlen_sum == 0 || n_segs <= 0 || n_segs > MM_MAX_SEG) return; + hash = qname? __ac_X31_hash_string(qname) : 0; + hash ^= __ac_Wang_hash(qlen_sum) + __ac_Wang_hash(opt->seed); + hash = __ac_Wang_hash(hash); + collect_minimizers(opt, mi, n_segs, qlens, seqs, b); a = collect_seed_hits(opt, opt->mid_occ, mi, qname, qlen_sum, &n_a, &rep_len, b); radix_sort_128x(a, a + n_a); @@ -308,7 +315,7 @@ void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char * } } - regs0 = mm_gen_regs(b->km, qlen_sum, n_regs0, u, a); + regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a); if (mm_dbg_flag & MM_DBG_PRINT_SEED) for (j = 0; j < n_regs0; ++j) @@ -324,7 +331,7 @@ void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char * n_regs[0] = n_regs0, regs[0] = regs0; } else { mm_seg_t *seg; - seg = mm_seg_gen(b->km, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); + seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); free(regs0); for (i = 0; i < n_segs; ++i) { mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b); diff --git a/minimap.h b/minimap.h index 349c8e9..ef93024 100644 --- a/minimap.h +++ b/minimap.h @@ -69,7 +69,7 @@ typedef struct { int32_t as; // offset in the a[] array (for internal uses only) int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate) uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // mapQ; split pattern; if SAM primary; number of suboptimal mappings - uint64_t hash; + uint32_t hash, dummy; mm_extra_t *p; } mm_reg1_t; @@ -81,6 +81,7 @@ typedef struct { } mm_idxopt_t; typedef struct { + int seed; int sdust_thres; // score threshold for SDUST; 0 to disable int flag; // see MM_F_* macros diff --git a/mmpriv.h b/mmpriv.h index 5e93fd6..6806a6b 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -66,7 +66,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); -mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a); +mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a); void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a); void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs); int mm_set_sam_pri(int n, mm_reg1_t *r); @@ -78,7 +78,7 @@ void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_re void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r); void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len); -mm_seg_t *mm_seg_gen(void *km, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a); +mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a); void mm_seg_free(void *km, int n_segs, mm_seg_t *segs); void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs); diff --git a/pe.c b/pe.c index bdf16b1..827be83 100644 --- a/pe.c +++ b/pe.c @@ -94,15 +94,15 @@ void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc q = a[last[a[i].rev]].r; if (r->rid != q->rid || r->rs - q->re > max_gap_ref) continue; for (j = last[a[i].rev]; j >= 0; --j) { - int64_t sum; + int64_t score; if (a[j].rev != a[i].rev || a[j].s == a[i].s) continue; q = a[j].r; if (r->rid != q->rid || r->rs - q->re > max_gap_ref) break; if (r->p->dp_max + q->p->dp_max < dp_thres) continue; - sum = r->p->dp_max + q->p->dp_max; - if (sum > max) - max = sum, max_idx[a[j].s] = j, max_idx[a[i].s] = i; - kv_push(uint64_t, km, sc, sum); + score = (int64_t)(r->p->dp_max + q->p->dp_max) | (r->hash + q->hash); + if (score > max) + max = score, max_idx[a[j].s] = j, max_idx[a[i].s] = i; + kv_push(uint64_t, km, sc, score); } } else { // forward first read or reverse second read last[a[i].rev] = i; @@ -138,7 +138,7 @@ void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc if (sc.n == 1) { if (r[0]->mapq < 2) r[0]->mapq = 2; if (r[1]->mapq < 2) r[1]->mapq = 2; - } else if (max > sc.a[sc.n - 2]) { + } else if (max>>32 > sc.a[sc.n - 2]>>32) { if (r[0]->mapq < 1) r[0]->mapq = 1; if (r[1]->mapq < 1) r[1]->mapq = 1; }