From 782449975db11f8390b6738f8155f55e608bc33a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 9 Jul 2017 12:14:20 -0400 Subject: [PATCH] r168: fixed a bug in long join: a[] not sorted Also added length requirement for long join and changed -g in the ava mode --- chain.c | 28 ++++++++++++++++++++++------ hit.c | 4 +++- main.c | 6 +++--- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/chain.c b/chain.c index 8f95cb5..586fc95 100644 --- a/chain.c +++ b/chain.c @@ -23,8 +23,8 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int { // TODO: make sure this works when n has more than 32 bits int32_t st = 0, j, k, *f, *p, *t, *v, n_u, n_v; int64_t i; - uint64_t *u; - mm128_t *b; + uint64_t *u, *u2; + mm128_t *b, *w; if (_u) *_u = 0; f = (int32_t*)kmalloc(km, n * 4); @@ -99,19 +99,35 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int } if (k0 == k) n_v = n_v0; // no new chain added, reset } - n_u = k, *_u = u; + n_u = k, *_u = u; // NB: note that u[] may not be sorted by score here // free kfree(km, f); kfree(km, p); kfree(km, t); - // write the result to _a_ + // write the result to b[] b = (mm128_t*)kmalloc(km, n_v * sizeof(mm128_t)); for (i = 0, k = 0; i < n_u; ++i) { int32_t k0 = k, ni = (int32_t)u[i]; for (j = 0; j < ni; ++j) b[k] = a[v[k0 + (ni - j - 1)]], ++k; } - memcpy(a, b, n_v * sizeof(mm128_t)); - kfree(km, v); kfree(km, b); + kfree(km, v); + + // sort u[] and a[] by a[].x, such that adjacent chains may be joined (required by mm_join_long) + w = (mm128_t*)kmalloc(km, n_u * sizeof(mm128_t)); + for (i = k = 0; i < n_u; ++i) { + w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; + k += (int32_t)u[i]; + } + radix_sort_128x(w, w + n_u); + u2 = (uint64_t*)kmalloc(km, n_u * 8); + for (i = k = 0; i < n_u; ++i) { + int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; + u2[i] = u[j]; + memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t)); + k += n; + } + memcpy(u, u2, n_u * 8); + kfree(km, b); kfree(km, w); kfree(km, u2); return n_u; } diff --git a/hit.c b/hit.c index 7fa3903..c58ebcd 100644 --- a/hit.c +++ b/hit.c @@ -247,8 +247,10 @@ void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_r max_gap = max_gap > a1s->x - a0e->x? max_gap : a1s->x - a0e->x; min_gap = min_gap < a1s->x - a0e->x? min_gap : a1s->x - a0e->x; if (max_gap > opt->max_join_long || min_gap > opt->max_join_short) continue; - sc_thres = (int)((float)opt->min_join_flank_sc / opt->max_join_long * max_gap + .499); // TODO: also require length + sc_thres = (int)((float)opt->min_join_flank_sc / opt->max_join_long * max_gap + .499); if (r0->score < sc_thres || r1->score < sc_thres) continue; // require good flanking chains + if (r0->re - r0->rs < max_gap>>1 || r0->qe - r0->qs < max_gap>>1) continue; // require enough flanking length + if (r1->re - r1->rs < max_gap>>1 || r1->qe - r1->qs < max_gap>>1) continue; // all conditions satisfied; join a[r1->as].y |= 1ULL<<40; diff --git a/main.c b/main.c index bee2a89..55cab15 100644 --- a/main.c +++ b/main.c @@ -10,7 +10,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r167-pre" +#define MM_VERSION "2.0-r168-pre" void liftrlimit() { @@ -120,7 +120,7 @@ int main(int argc, char *argv[]) } else if (c == 'x') { if (strcmp(optarg, "ava10k") == 0) { opt.flag |= MM_F_AVA | MM_F_NO_SELF; - opt.min_chain_score = 100, opt.pri_ratio = 0.0f, opt.min_seedcov_ratio = 0.05f; + opt.min_chain_score = 100, opt.pri_ratio = 0.0f, opt.min_seedcov_ratio = 0.05f, opt.max_gap = 10000; is_hpc = 1, k = 19, w = 5; } else if (strcmp(optarg, "map10k") == 0) { is_hpc = 1, k = 19; @@ -155,7 +155,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); fprintf(stderr, " -D FLOAT min fraction of minimizer matches [%g]\n", opt.min_seedcov_ratio); fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); - fprintf(stderr, " ava10k: -Hk19 -w5 -Xp0 -m100 -D.05 (PacBio/ONT all-vs-all read mapping)\n"); + fprintf(stderr, " ava10k: -Hk19 -w5 -Xp0 -m100 -D.05 -g10000 (PacBio/ONT all-vs-all read mapping)\n"); fprintf(stderr, " map10k: -Hk19 (PacBio/ONT vs reference mapping)\n"); fprintf(stderr, " asm1m: -k19 -w19 (intra-species assembly to ref mapping)\n"); fprintf(stderr, " Alignment:\n");