diff --git a/align.c b/align.c index 9b3e62b..f56be42 100644 --- a/align.c +++ b/align.c @@ -186,7 +186,7 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // } } -static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int flag, ksw_extz_t *ez) +static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int end_bonus, int flag, ksw_extz_t *ez) { int zdrop = opt->zdrop; if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) { @@ -202,7 +202,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, flag, ez); else - ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, flag, ez); + ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez); } static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x) @@ -378,12 +378,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int rs0 = (int32_t)a[r->as].x + 1 - (int32_t)(a[r->as].y>>32&0xff); qs0 = (int32_t)a[r->as].y + 1 - (int32_t)(a[r->as].y>>32&0xff); if (r->as > 0 && a[r->as - 1].x>>32 == a[r->as].x>>32) { - rs1 = (int32_t)a[r->as - 1].x + 1; - qs1 = (int32_t)a[r->as - 1].y + 1; - } else rs1 = qs1 = 0; // no adjacent previous chain on the same chr + rs1 = (int32_t)a[r->as - 1].x + 1 - (int32_t)(a[r->as - 1].y>>32&0xff); + } else rs1 = 0; // no adjacent previous chain on the same chr if (qs > 0 && rs > 0) { // actually this is always true l = qs < max_end_ext? qs : max_end_ext; - qs1 = qs1 > qs - l? qs1 : qs - l; // choose between the max_end_ext bound and the previous seed bound + qs1 = qs - l; qs0 = qs0 < qs1? qs0 : qs1; // at least include qs0 l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0; l = l < max_end_ext? l : max_end_ext; @@ -396,12 +395,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int re0 = (int32_t)a[r->as + r->cnt - 1].x + 1; qe0 = (int32_t)a[r->as + r->cnt - 1].y + 1; if (r->as + r->cnt < n_a && a[r->as + r->cnt].x>>32 == a[r->as + r->cnt - 1].x>>32) { - re1 = (int32_t)a[r->as + r->cnt].x + 1 - (int32_t)(a[r->as + r->cnt].y>>32&0xff); - qe1 = (int32_t)a[r->as + r->cnt].y + 1 - (int32_t)(a[r->as + r->cnt].y>>32&0xff); - } else re1 = mi->seq[rid].len, qe1 = qlen; // no adjacent next chain on the same chr + re1 = (int32_t)a[r->as + r->cnt].x + 1; + } else re1 = mi->seq[rid].len; // no adjacent next chain on the same chr if (qe < qlen && re < mi->seq[rid].len) { l = qlen - qe < max_end_ext? qlen - qe : max_end_ext; - qe1 = qe1 < qe + l? qe1 : qe + l; // choose between the max_end_ext bound and the next seed bound + qe1 = qe + l; qe0 = qe0 > qe1? qe0 : qe1; // at least include qe0 l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0; l = l < max_end_ext? l : max_end_ext; @@ -418,13 +416,13 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_idx_getseq(mi, rid, rs0, rs, tseq); mm_seq_rev(qs - qs0, qseq); mm_seq_rev(rs - rs0, tseq); - mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); + mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, opt->end_bonus, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; } - rs1 = rs - (ez->max_t + 1); - qs1 = qs - (ez->max_q + 1); + rs1 = rs - (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1); + qs1 = qs - (ez->reach_end? qs - qs0 : ez->max_q + 1); mm_seq_rev(qs - qs0, qseq); } else rs1 = rs, qs1 = qs; re1 = rs, qe1 = qs; @@ -452,10 +450,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int } ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs); } else { // perform normal gapped alignment - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop } if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop)) - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez); // second pass: lift approximate + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, extra_flag, ez); // second pass: lift approximate if (ez->n_cigar > 0) mm_append_cigar(r, ez->n_cigar, ez->cigar); if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle. @@ -477,13 +475,13 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (!dropped && qe < qe0 && re < re0) { // right extension qseq = &qseq0[rev][qe]; mm_idx_getseq(mi, rid, re, re0, tseq); - mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY, ez); + mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, opt->end_bonus, extra_flag|KSW_EZ_EXTZ_ONLY, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; } - re1 = re + (ez->max_t + 1); - qe1 = qe + (ez->max_q + 1); + re1 = re + (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1); + qe1 = qe + (ez->reach_end? qe0 - qe : ez->max_q + 1); } assert(qe1 <= qlen); @@ -533,7 +531,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i mm_seq_rev(tl, tseq); if (score < opt->min_dp_max) goto end_align1_inv; q_off = ql - (q_off + 1), t_off = tl - (t_off + 1); - mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, mat, (int)(opt->bw * 1.5), KSW_EZ_EXTZ_ONLY, ez); + mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, mat, (int)(opt->bw * 1.5), -1, KSW_EZ_EXTZ_ONLY, ez); if (ez->n_cigar == 0) goto end_align1_inv; // should never be here mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); r_inv->p->dp_score = ez->max; diff --git a/ksw2.h b/ksw2.h index fa22fe6..43047c2 100644 --- a/ksw2.h +++ b/ksw2.h @@ -26,6 +26,7 @@ typedef struct { int mte, mte_q; // max score when reaching the end of target int score; // max score reaching both ends; may be KSW_NEG_INF int m_cigar, n_cigar; + int reach_end; uint32_t *cigar; } ksw_extz_t; @@ -46,14 +47,17 @@ typedef struct { * @param flag flag (see KSW_EZ_* macros) * @param ez (out) scores and cigar */ -void ksw_extz(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); -void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); +void ksw_extz(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); + +void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez); void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez); + int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez); void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); @@ -149,7 +153,7 @@ static inline void ksw_reset_extz(ksw_extz_t *ez) { ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1; ez->max = 0, ez->score = ez->mqe = ez->mte = KSW_NEG_INF; - ez->n_cigar = 0, ez->zdropped = 0; + ez->n_cigar = 0, ez->zdropped = 0, ez->reach_end = 0; } static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, int b, int zdrop, int8_t e) diff --git a/ksw2_dispatch.c b/ksw2_dispatch.c index b125392..56fbd63 100644 --- a/ksw2_dispatch.c +++ b/ksw2_dispatch.c @@ -64,18 +64,18 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin } void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) { extern void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez); + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez); extern void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez); + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez); unsigned simd; simd = x86_simd(); if (simd & SIMD_SSE4_1) - ksw_extd2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, flag, ez); + ksw_extd2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, end_bonus, flag, ez); else if (simd & SIMD_SSE2) - ksw_extd2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, flag, ez); + ksw_extd2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, end_bonus, flag, ez); else abort(); } diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index c678a11..628f37f 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -17,14 +17,14 @@ #ifdef KSW_CPU_DISPATCH #ifdef __SSE4_1__ void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) #else void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) #endif #else void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) #endif // ~KSW_CPU_DISPATCH { #define __dp_code_block1 \ @@ -380,10 +380,14 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (!approx_max) kfree(km, H); if (with_cigar) { // backtrack int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); - if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) + if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) { ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); - else if (ez->max_t >= 0 && ez->max_q >= 0) + } else if (!ez->zdropped && (flag&KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > ez->max) { + ez->reach_end = 1; + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->mqe_t, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + } else if (ez->max_t >= 0 && ez->max_q >= 0) { ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + } kfree(km, mem2); kfree(km, off); } } diff --git a/main.c b/main.c index a03c5c8..39574c5 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.2-r498-dirty" +#define MM_VERSION "2.2-r499-dirty" #ifdef __linux__ #include @@ -40,6 +40,7 @@ static struct option long_options[] = { { "frag", optional_argument, 0, 0 }, { "print-2nd", optional_argument, 0, 0 }, { "cs", optional_argument, 0, 0 }, + { "end-bonus", required_argument, 0, 0 }, { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -130,6 +131,7 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_LJOIN; // --no-long-join else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr + else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus else if (c == 0 && long_idx == 14) { // --frag if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) opt.flag |= MM_F_FRAG_MODE; diff --git a/map.c b/map.c index da09ec4..ac66e81 100644 --- a/map.c +++ b/map.c @@ -33,6 +33,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; opt->zdrop = 400; + opt->end_bonus = -1; opt->min_dp_max = opt->min_chain_score * opt->a; opt->min_ksw_len = 200; opt->mini_batch_size = 200000000; @@ -86,6 +87,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->pe_ori = 0<<1|1; // FR mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 32, mo->e2 = 1; mo->zdrop = 100; + mo->end_bonus = 11; mo->max_frag_len = 800; mo->max_gap = 100; mo->bw = 100; diff --git a/minimap.h b/minimap.h index 4b44111..281bf83 100644 --- a/minimap.h +++ b/minimap.h @@ -104,6 +104,7 @@ typedef struct { int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties int noncan; // cost of non-canonical splicing sites int zdrop; // break alignment if alignment score drops too fast along the diagonal + int end_bonus; int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold int min_ksw_len;