diff --git a/align.c b/align.c index 65b64f5..523c4bb 100644 --- a/align.c +++ b/align.c @@ -237,7 +237,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t r->p = p; } -static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap) +static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int b2) { uint32_t k, l; int32_t qshift, tshift, toff = 0, qoff = 0; @@ -255,7 +255,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts int cq = qseq[qoff + l], ct = tseq[toff + l]; if (ct > 3 || cq > 3) ++n_ambi; else if (ct != cq) ++n_diff; - s += mat[ct * 5 + cq]; + s += b2 <= 0? mat[ct * 5 + cq] : (ct > 3 || cq > 3)? 0 : ct == cq? 1 : -b2; if (s < 0) s = 0; else max = max > s? max : s; } @@ -266,7 +266,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts for (l = 0; l < len; ++l) if (qseq[qoff + l] > 3) ++n_ambi; r->blen += len - n_ambi, p->n_ambi += n_ambi; - if (log_gap) s -= q + (double)e * mg_log2(1.0 + len); + if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len); else s -= q + e * len; if (s < 0) s = 0; qoff += len; @@ -275,7 +275,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts for (l = 0; l < len; ++l) if (tseq[toff + l] > 3) ++n_ambi; r->blen += len - n_ambi, p->n_ambi += n_ambi; - if (log_gap) s -= q + (double)e * mg_log2(1.0 + len); + if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len); else s -= q + e * len; if (s < 0) s = 0; toff += len; @@ -284,6 +284,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts } } p->dp_max = (int32_t)(max + .499); + if (b2 > 0) p->dp_max *= mat[0]; // for compatibility with mm_set_mapq() assert(qoff == r->qe - r->qs && toff == r->re - r->rs); if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned } @@ -816,7 +817,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_idx_getseq(mi, rid, rs1, re1, tseq); qseq = &qseq0[r->rev][qs1]; } - mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag&MM_F_NO_LOG_GAP)); + mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2); if (rev && r->p->trans_strand) r->p->trans_strand ^= 3; // flip to the read strand } @@ -875,7 +876,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i } r_inv->rs = r1->re + t_off; r_inv->re = r_inv->rs + ez->max_t + 1; - mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag&MM_F_NO_LOG_GAP)); + mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2); ret = 1; end_align1_inv: kfree(km, tseq); diff --git a/main.c b/main.c index 53656f7..d4ce8b8 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.21-dev-r1076-dirty" +#define MM_VERSION "2.21-dev-r1078-dirty" #ifdef __linux__ #include @@ -73,7 +73,6 @@ static ko_longopt_t long_options[] = { { "mask-len", ko_required_argument, 346 }, { "rmq", ko_optional_argument, 347 }, { "qstrand", ko_no_argument, 348 }, - { "log-gap", ko_required_argument, 349 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -117,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:b:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -183,6 +182,7 @@ int main(int argc, char *argv[]) else if (c == 'R') rg = o.arg; else if (c == 'h') fp_help = stdout; else if (c == '2') opt.flag |= MM_F_2_IO_THREADS; + else if (c == 'b') opt.b2 = atoi(o.arg); else if (c == 'o') { if (strcmp(o.arg, "-") != 0) { if (freopen(o.arg, "wb", stdout) == NULL) { @@ -253,8 +253,6 @@ int main(int argc, char *argv[]) yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0); } else if (c == 347) { // --rmq yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1); - } else if (c == 349) { // --log-gap - yes_or_no(&opt, MM_F_NO_LOG_GAP, o.longidx, o.arg, 0); } else if (c == 'S') { opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; if (mm_verbose >= 2) @@ -331,9 +329,10 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); fprintf(fp_help, " Alignment:\n"); fprintf(fp_help, " -A INT matching score [%d]\n", opt.a); - fprintf(fp_help, " -B INT mismatch penalty [%d]\n", opt.b); + fprintf(fp_help, " -B INT mismatch penalty (larger value for lower divergence) [%d]\n", opt.b); fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2); fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); + fprintf(fp_help, " -b INT ranking penalty (independent of A/B/O/E; 0 to disable) [%d]\n", opt.b2); fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv); fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); diff --git a/minimap.h b/minimap.h index f2cdfb4..d87a083 100644 --- a/minimap.h +++ b/minimap.h @@ -39,7 +39,6 @@ #define MM_F_RMQ (0x80000000LL) #define MM_F_QSTRAND (0x100000000LL) #define MM_F_NO_INV (0x200000000LL) -#define MM_F_NO_LOG_GAP (0x400000000LL) #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 @@ -148,6 +147,7 @@ typedef struct { float alt_drop; int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties + int b2; // used for re-ranking hits int sc_ambi; // score when one or both bases are "N" int noncan; // cost of non-canonical splicing sites int junc_bonus; diff --git a/minimap2.1 b/minimap2.1 index a1c0fb8..96dd422 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -573,7 +573,7 @@ Up to 20% sequence divergence. .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 +.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -b0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 .BR --splice-flank=yes ). In the splice mode, 1) long deletions are taken as introns and represented as the @@ -592,7 +592,7 @@ Long-read splice alignment for PacBio CCS reads .B sr Short single-end reads without splicing .RB ( -k21 -.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 +.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 .B -s40 -g100 -2K50m --heap-sort=yes .BR --secondary=no ). .TP diff --git a/options.c b/options.c index a567fb2..8da26b6 100644 --- a/options.c +++ b/options.c @@ -43,6 +43,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->alt_drop = 0.15f; opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; + opt->b2 = 5; opt->sc_ambi = 1; opt->zdrop = 400, opt->zdrop_inv = 200; opt->end_bonus = -1; @@ -68,6 +69,11 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ) opt->mid_occ = opt->max_mid_occ; } + if (opt->b2 > 0 && opt->b2 * opt->a < opt->b) { + opt->b2 = (int)opt->b / opt->a; + if (mm_verbose >= 3) + fprintf(stderr, "[M::%s] addjusted the ranking penalty to %d\n", __func__, opt->b2); + } if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); } @@ -125,7 +131,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag = 0, io->k = 21, io->w = 11; mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT; mo->pe_ori = 0<<1|1; // FR - mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1; + mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1, mo->b2 = 0; mo->zdrop = mo->zdrop_inv = 100; mo->end_bonus = 10; mo->max_frag_len = 800; @@ -144,7 +150,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; mo->max_sw_mat = 0; mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000; - mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; + mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0, mo->b2 = 0; mo->noncan = 9; mo->junc_bonus = 9; mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved