r212: better heuristic to fix wrong seeding

but not good enough. Will explore more.
This commit is contained in:
Heng Li 2017-07-27 11:24:51 -04:00
parent 371e20cc7c
commit b927838495
3 changed files with 24 additions and 12 deletions

24
align.c
View File

@ -253,16 +253,20 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
for (i = 1; i < cnt1; ++i) { // gap filling
int gap, to_patch = 0;
mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
gap = (qe - qs) - (re - rs);
if (gap < -40 || gap > 40) {
int g, j, i_next = -1;
for (j = i + 1; j < cnt1; ++j) {
if ((int32_t)a[as1 + j].y - qs > opt->max_gap || (int32_t)a[as1 + j].x - rs > opt->max_gap) break;
g = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (a[as1 + j].x - a[as1 + j - 1].x);
if (abs(gap + g) < 0.5f * abs(gap - g)) {
i_next = j;
break;
}
gap = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - (a[as1 + i].x - a[as1 + i - 1].x);
if (gap < -20 || gap > 20) {
int i_next = -1, j, n_ins = 0, n_del = 0, diff, max_diff = 0;
if (gap < 0) n_del += -gap;
else n_ins += gap;
for (j = i + 1; j < cnt1 && j - i < 20; ++j) {
if ((int32_t)a[as1 + j].y - qs > opt->max_gap>>1 || (int32_t)a[as1 + j].x - rs > opt->max_gap>>1) break;
gap = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (a[as1 + j].x - a[as1 + j - 1].x);
if (gap < 0) n_del += -gap;
else n_ins += gap;
diff = n_ins + n_del - abs(n_ins - n_del);
if (diff >= 40) {
if (diff >= max_diff) max_diff = diff, i_next = j;
} else if (i_next > i) break;
}
if (i_next > i) {
i = i_next, to_patch = 1;

2
main.c
View File

@ -8,7 +8,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r211-dirty"
#define MM_VERSION "2.0-r212-dirty"
void liftrlimit()
{

10
map.c
View File

@ -237,11 +237,19 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
if (mm_dbg_flag & MM_DBG_PRINT_SEED)
for (i = 0; i < n_a; ++i)
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff));
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km);
regs = mm_gen_regs(b->km, qlen, n_u, u, a);
*n_regs = n_u;
if (mm_dbg_flag & MM_DBG_PRINT_SEED)
for (j = 0; j < n_u; ++j)
for (i = regs[j].as; i < regs[j].as + regs[j].cnt; ++i)
fprintf(stderr, "CN\t%d\t%s\t%d\t%c\t%d\t%d\t%d\n", j, mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
i == regs[j].as? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);