diff --git a/align.c b/align.c index 523c4bb..fc97368 100644 --- a/align.c +++ b/align.c @@ -237,7 +237,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t r->p = p; } -static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int b2) +static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap) { uint32_t k, l; int32_t qshift, tshift, toff = 0, qoff = 0; @@ -255,7 +255,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts int cq = qseq[qoff + l], ct = tseq[toff + l]; if (ct > 3 || cq > 3) ++n_ambi; else if (ct != cq) ++n_diff; - s += b2 <= 0? mat[ct * 5 + cq] : (ct > 3 || cq > 3)? 0 : ct == cq? 1 : -b2; + s += mat[ct * 5 + cq]; if (s < 0) s = 0; else max = max > s? max : s; } @@ -266,8 +266,8 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts for (l = 0; l < len; ++l) if (qseq[qoff + l] > 3) ++n_ambi; r->blen += len - n_ambi, p->n_ambi += n_ambi; - if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len); - else s -= q + e * len; + if (log_gap) s -= q + (double)e * mg_log2(1.0 + len); + else s -= q + e; if (s < 0) s = 0; qoff += len; } else if (op == MM_CIGAR_DEL) { @@ -275,8 +275,8 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts for (l = 0; l < len; ++l) if (tseq[toff + l] > 3) ++n_ambi; r->blen += len - n_ambi, p->n_ambi += n_ambi; - if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len); - else s -= q + e * len; + if (log_gap) s -= q + (double)e * mg_log2(1.0 + len); + else s -= q + e; if (s < 0) s = 0; toff += len; } else if (op == MM_CIGAR_N_SKIP) { @@ -284,7 +284,6 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts } } p->dp_max = (int32_t)(max + .499); - if (b2 > 0) p->dp_max *= mat[0]; // for compatibility with mm_set_mapq() assert(qoff == r->qe - r->qs && toff == r->re - r->rs); if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned } @@ -817,7 +816,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_idx_getseq(mi, rid, rs1, re1, tseq); qseq = &qseq0[r->rev][qs1]; } - mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2); + mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR)); if (rev && r->p->trans_strand) r->p->trans_strand ^= 3; // flip to the read strand } @@ -876,7 +875,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i } r_inv->rs = r1->re + t_off; r_inv->re = r_inv->rs + ez->max_t + 1; - mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2); + mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR)); ret = 1; end_align1_inv: kfree(km, tseq); @@ -893,6 +892,70 @@ static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, m return regs; } +static inline void mm_count_gaps(const mm_reg1_t *r, int32_t *n_gap_, int32_t *n_gapo_) +{ + uint32_t i; + int32_t n_gapo = 0, n_gap = 0; + *n_gap_ = *n_gapo_ = -1; + if (r->p == 0) return; + for (i = 0; i < r->p->n_cigar; ++i) { + int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4; + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) + ++n_gapo, n_gap += len; + } + *n_gap_ = n_gap, *n_gapo_ = n_gapo; +} + +double mm_event_identity(const mm_reg1_t *r) +{ + int32_t n_gap, n_gapo; + if (r->p == 0) return -1.0f; + mm_count_gaps(r, &n_gap, &n_gapo); + return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo); +} + +static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc) +{ + uint32_t i; + int32_t n_gap = 0, n_gapo = 0, n_mis; + double gap_cost = 0.0; + if (r->p == 0) return -1; + for (i = 0; i < r->p->n_cigar; ++i) { + int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4; + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) { + gap_cost += b2 + (double)mg_log2(1.0 + len); + ++n_gapo, n_gap += len; + } + } + n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap; + return (int32_t)(match_sc * (r->mlen - b2 * n_mis - gap_cost) + .499); +} + +static void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b) +{ + int32_t max = -1, max2 = -1, i, max_i = -1; + double div, b2; + if (n_regs < 2) return; + for (i = 0; i < n_regs; ++i) { + mm_reg1_t *r = ®s[i]; + if (r->p == 0) continue; + if (r->p->dp_max > max) max2 = max, max = r->p->dp_max, max_i = i; + else if (r->p->dp_max > max2) max2 = r->p->dp_max; + } + if (max_i < 0 || max < 0 || max2 < 0) return; + if (regs[max_i].qe - regs[max_i].qs < (double)qlen * frac) return; + if (max2 < (double)max * frac) return; + div = 1. - mm_event_identity(®s[max_i]); + if (div < 0.02) div = 0.02; + b2 = 0.5 / div; // max value: 25 + if (b2 * a < b) b2 = (double)a / b; + for (i = 0; i < n_regs; ++i) { + mm_reg1_t *r = ®s[i]; + if (r->p == 0) continue; + r->p->dp_max = mm_recal_max_dp(r, b2, a); + } +} + mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a) { extern unsigned char seq_nt4_table[256]; @@ -947,6 +1010,8 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m kfree(km, qseq0[0]); kfree(km, ez.cigar); mm_filter_regs(opt, qlen, n_regs_, regs); + if (!(opt->flag&MM_F_SR) && qlen >= opt->rank_min_len) + mm_update_dp_max(qlen, *n_regs_, regs, opt->rank_frac, opt->a, opt->b); mm_hit_sort(km, n_regs_, regs, opt->alt_drop); return regs; } diff --git a/format.c b/format.c index d2a538b..2c8d65f 100644 --- a/format.c +++ b/format.c @@ -271,18 +271,6 @@ int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_r return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 1, 0, 0); } -double mm_event_identity(const mm_reg1_t *r) -{ - int32_t i, n_gapo = 0, n_gap = 0; - if (r->p == 0) return -1.0f; - for (i = 0; i < r->p->n_cigar; ++i) { - int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4; - if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) - ++n_gapo, n_gap += len; - } - return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo); -} - static inline void write_tags(kstring_t *s, const mm_reg1_t *r) { int type; diff --git a/main.c b/main.c index d4ce8b8..f4fb078 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.21-dev-r1078-dirty" +#define MM_VERSION "2.21-dev-r1079-dirty" #ifdef __linux__ #include @@ -116,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:b:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -182,7 +182,6 @@ int main(int argc, char *argv[]) else if (c == 'R') rg = o.arg; else if (c == 'h') fp_help = stdout; else if (c == '2') opt.flag |= MM_F_2_IO_THREADS; - else if (c == 'b') opt.b2 = atoi(o.arg); else if (c == 'o') { if (strcmp(o.arg, "-") != 0) { if (freopen(o.arg, "wb", stdout) == NULL) { @@ -332,7 +331,6 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -B INT mismatch penalty (larger value for lower divergence) [%d]\n", opt.b); fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2); fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); - fprintf(fp_help, " -b INT ranking penalty (independent of A/B/O/E; 0 to disable) [%d]\n", opt.b2); fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv); fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); diff --git a/minimap.h b/minimap.h index d87a083..0b200ad 100644 --- a/minimap.h +++ b/minimap.h @@ -147,7 +147,6 @@ typedef struct { float alt_drop; int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties - int b2; // used for re-ranking hits int sc_ambi; // score when one or both bases are "N" int noncan; // cost of non-canonical splicing sites int junc_bonus; @@ -158,6 +157,9 @@ typedef struct { int anchor_ext_len, anchor_ext_shift; float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio + int rank_min_len; + float rank_frac; + int pe_ori, pe_bonus; float mid_occ_frac; // only used by mm_mapopt_update(); see below diff --git a/mmpriv.h b/mmpriv.h index 6d0ed04..cc3f483 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -62,6 +62,7 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); +double mm_event_identity(const mm_reg1_t *r); int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len); diff --git a/options.c b/options.c index 8da26b6..97347be 100644 --- a/options.c +++ b/options.c @@ -43,7 +43,6 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->alt_drop = 0.15f; opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; - opt->b2 = 5; opt->sc_ambi = 1; opt->zdrop = 400, opt->zdrop_inv = 200; opt->end_bonus = -1; @@ -54,6 +53,9 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->mini_batch_size = 500000000; opt->max_sw_mat = 100000000; + opt->rank_min_len = 500; + opt->rank_frac = 0.9f; + opt->pe_ori = 0; // FF opt->pe_bonus = 33; } @@ -69,11 +71,6 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ) opt->mid_occ = opt->max_mid_occ; } - if (opt->b2 > 0 && opt->b2 * opt->a < opt->b) { - opt->b2 = (int)opt->b / opt->a; - if (mm_verbose >= 3) - fprintf(stderr, "[M::%s] addjusted the ranking penalty to %d\n", __func__, opt->b2); - } if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); } @@ -131,7 +128,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag = 0, io->k = 21, io->w = 11; mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT; mo->pe_ori = 0<<1|1; // FR - mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1, mo->b2 = 0; + mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1; mo->zdrop = mo->zdrop_inv = 100; mo->end_bonus = 10; mo->max_frag_len = 800; @@ -150,7 +147,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; mo->max_sw_mat = 0; mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000; - mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0, mo->b2 = 0; + mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; mo->noncan = 9; mo->junc_bonus = 9; mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved