diff --git a/align.c b/align.c index b526a09..d18cfed 100644 --- a/align.c +++ b/align.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "minimap.h" #include "mmpriv.h" #include "ksw2.h" @@ -341,6 +342,48 @@ static void mm_max_stretch(const mm_mapopt_t *opt, const mm_reg1_t *r, const mm1 *as = max_i, *cnt = max_len; } +static int mm_seed_ext_score(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a) +{ + uint8_t *qseq, *tseq; + int q_span = a->y>>32&0xff, qs, qe, rs, re, rid, score, q_off, t_off, ext_len = opt->anchor_ext_len; + void *qp; + rid = a->x<<1>>33; + re = (uint32_t)a->x + 1, rs = re - q_span; + qe = (uint32_t)a->y + 1, qs = qe - q_span; + rs = rs - ext_len > 0? rs - ext_len : 0; + qs = qs - ext_len > 0? qs - ext_len : 0; + re = re + ext_len < mi->seq[rid].len? re + ext_len : mi->seq[rid].len; + qe = qe + ext_len < qlen? qe + ext_len : qlen; + tseq = (uint8_t*)kmalloc(km, re - rs); + mm_idx_getseq(mi, rid, rs, re, tseq); + qseq = qseq0[a->x>>63] + qs; + qp = ksw_ll_qinit(km, 2, qe - qs, qseq, 5, mat); + score = ksw_ll_i16(qp, re - rs, tseq, opt->q, opt->e, &q_off, &t_off); + kfree(km, tseq); + kfree(km, qp); + return score; +} + +static void mm_fix_bad_ends_splice(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const mm_reg1_t *r, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a, int *as1, int *cnt1) +{ // this assumes a very crude k-mer based mode; it is not necessary to use a good model just for filtering bounary exons + int score; + double log_gap; + *as1 = r->as, *cnt1 = r->cnt; + if (r->cnt < 3) return; + log_gap = log((int32_t)a[r->as + 1].x - (int32_t)a[r->as].x); + if ((a[r->as].y>>32&0xff) < log_gap + opt->anchor_ext_shift) { + score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as]); + if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift) // a more exact format is "score < log_4(gap) + shift" + ++(*as1), --(*cnt1); + } + log_gap = log((int32_t)a[r->as + r->cnt - 1].x - (int32_t)a[r->as + r->cnt - 2].x); + if ((a[r->as + r->cnt - 1].y>>32&0xff) < log_gap + opt->anchor_ext_shift) { + score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as + r->cnt - 1]); + if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift) + --(*cnt1); + } +} + static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag) { int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE); @@ -365,9 +408,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int re = (int32_t)a[as1+cnt1-1].x + 1; qe = (int32_t)a[as1+cnt1-1].y + 1; } else { - if (!is_splice) + if (is_splice) { + mm_fix_bad_ends_splice(km, opt, mi, r, mat, qlen, qseq0, a, &as1, &cnt1); + } else { mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1); - else as1 = r->as, cnt1 = r->cnt; + } mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs); mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe); diff --git a/esterr.c b/esterr.c index e3b6e85..733c762 100644 --- a/esterr.c +++ b/esterr.c @@ -44,7 +44,11 @@ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const r->div = -1.0f; if (r->cnt == 0) continue; st = en = get_mini_idx(qlen, r->rev? &a[r->as + r->cnt - 1] : &a[r->as], n, mini_pos); - assert(st >= 0); + if (st < 0) { + if (mm_verbose >= 2) + fprintf(stderr, "[WARNING] logic inconsistency in mm_est_err(). Please contact the developer.\n"); + continue; + } l_ref = mi->seq[r->rid].len; for (k = 1, j = st + 1, n_match = 1; j < n && k < r->cnt; ++j) { int32_t x; diff --git a/main.c b/main.c index 8ec4428..f2774ac 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.5-r617-dirty" +#define MM_VERSION "2.5-r618-dirty" #ifdef __linux__ #include @@ -44,6 +44,7 @@ static struct option long_options[] = { { "no-pairing", no_argument, 0, 0 }, { "splice-flank", optional_argument, 0, 0 }, { "idx-no-seq", no_argument, 0, 0 }, + { "end-seed-pen", required_argument, 0, 0 }, // 21 { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -140,6 +141,7 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus else if (c == 0 && long_idx ==18) opt.flag |= MM_F_INDEPEND_SEG; // --no-pairing else if (c == 0 && long_idx ==20) ipt.flag |= MM_I_NO_SEQ; // --idx-no-seq + else if (c == 0 && long_idx ==21) opt.anchor_ext_shift = atoi(optarg); // --end-seed-pen else if (c == 0 && long_idx == 14) { // --frag if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) opt.flag |= MM_F_FRAG_MODE; diff --git a/map.c b/map.c index 89bddf2..2bdd830 100644 --- a/map.c +++ b/map.c @@ -36,6 +36,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->end_bonus = -1; opt->min_dp_max = opt->min_chain_score * opt->a; opt->min_ksw_len = 200; + opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6; opt->mini_batch_size = 500000000; opt->pe_ori = 0; // FF diff --git a/minimap.h b/minimap.h index b33ffb3..b2783cb 100644 --- a/minimap.h +++ b/minimap.h @@ -116,6 +116,7 @@ typedef struct { int end_bonus; int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold int min_ksw_len; + int anchor_ext_len, anchor_ext_shift; int pe_ori, pe_bonus; diff --git a/minimap2.1 b/minimap2.1 index 8696bed..b6366fe 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -272,6 +272,18 @@ However, the SIRV control does not honor this trend on SIRV data, please add .B --splice-flank=no to the command line. +.TP +.BI --end-seed-pen \ INT +Drop a terminal anchor if +.IR s