diff --git a/align.c b/align.c index c3b04d3..60bc08a 100644 --- a/align.c +++ b/align.c @@ -291,7 +291,7 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // } } -static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int end_bonus, int zdrop, int flag, ksw_extz_t *ez) +static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const uint8_t *junc, const int8_t *mat, int w, int end_bonus, int zdrop, int flag, ksw_extz_t *ez) { if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) { int i; @@ -305,7 +305,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint ksw_reset_extz(ez); ez->zdropped = 1; } else if (opt->flag & MM_F_SPLICE) - ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, flag, ez); + ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez); else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez); else @@ -547,7 +547,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int { int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE); int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1; - uint8_t *tseq, *qseq; + uint8_t *tseq, *qseq, *junc; int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; int32_t rs, re, qs, qe; int32_t rs1, qs1, re1, qe1; @@ -666,13 +666,16 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int assert(re0 > rs0); tseq = (uint8_t*)kmalloc(km, re0 - rs0); + junc = (uint8_t*)kmalloc(km, re0 - rs0); if (qs > 0 && rs > 0) { // left extension; probably the condition can be changed to "qs > qs0 && rs > rs0" qseq = &qseq0[rev][qs0]; mm_idx_getseq(mi, rid, rs0, rs, tseq); + mm_idx_bed_junc(mi, rid, rs0, rs, junc); mm_seq_rev(qs - qs0, qseq); mm_seq_rev(rs - rs0, tseq); - mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, opt->end_bonus, r->split_inv? opt->zdrop_inv : opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); + mm_seq_rev(rs - rs0, junc); + mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, junc, mat, bw, opt->end_bonus, r->split_inv? opt->zdrop_inv : opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; @@ -698,6 +701,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int // perform alignment qseq = &qseq0[rev][qs]; mm_idx_getseq(mi, rid, rs, re, tseq); + mm_idx_bed_junc(mi, rid, rs, re, junc); if (is_sr) { // perform ungapped alignment assert(qe - qs == re - rs); ksw_reset_extz(ez); @@ -707,11 +711,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int } ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs); } else { // perform normal gapped alignment - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop } // test Z-drop and inversion Z-drop if ((zdrop_code = mm_test_zdrop(km, opt, qseq, tseq, ez->n_cigar, ez->cigar, mat)) != 0) - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, zdrop_code == 2? opt->zdrop_inv : opt->zdrop, extra_flag, ez); // second pass: lift approximate + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, zdrop_code == 2? opt->zdrop_inv : opt->zdrop, extra_flag, ez); // second pass: lift approximate // update CIGAR if (ez->n_cigar > 0) mm_append_cigar(r, ez->n_cigar, ez->cigar); @@ -737,7 +741,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (!dropped && qe < qe0 && re < re0) { // right extension qseq = &qseq0[rev][qe]; mm_idx_getseq(mi, rid, re, re0, tseq); - mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, opt->end_bonus, opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY, ez); + mm_idx_bed_junc(mi, rid, re, re0, junc); + mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, junc, mat, bw, opt->end_bonus, opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; @@ -760,6 +765,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int } kfree(km, tseq); + kfree(km, junc); } static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez) @@ -793,7 +799,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i mm_seq_rev(tl, tseq); if (score < opt->min_dp_max) goto end_align1_inv; q_off = ql - (q_off + 1), t_off = tl - (t_off + 1); - mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, mat, (int)(opt->bw * 1.5), -1, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez); + mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, 0, mat, (int)(opt->bw * 1.5), -1, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez); if (ez->n_cigar == 0) goto end_align1_inv; // should never be here mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); r_inv->p->dp_score = ez->max; diff --git a/index.c b/index.c index d842ec4..4b92fe6 100644 --- a/index.c +++ b/index.c @@ -668,7 +668,7 @@ int mm_idx_bed_read(mm_idx_t *mi, const char *fn) return mi->I? 0 : -1; } -int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, int32_t strand, int8_t *s) +int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s) { int32_t i, left, right; mm_idx_intv_t *r; @@ -682,7 +682,7 @@ int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, int else left = mid + 1; } for (i = left; i < r->n; ++i) { - if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0 && strand * r->a[i].strand >= 0) { + if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0) { if (r->a[i].strand > 0) { s[r->a[i].st] |= 1, s[r->a[i].en - 1] |= 2; } else { diff --git a/ksw2.h b/ksw2.h index 213c27f..c700136 100644 --- a/ksw2.h +++ b/ksw2.h @@ -61,7 +61,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez); void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); + int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez); void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez); diff --git a/ksw2_dispatch.c b/ksw2_dispatch.c index 22ca9bc..792b267 100644 --- a/ksw2_dispatch.c +++ b/ksw2_dispatch.c @@ -80,17 +80,17 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin } void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez) + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez) { extern void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez); extern void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez); if (ksw_simd < 0) ksw_simd = x86_simd(); if (ksw_simd & SIMD_SSE4_1) - ksw_exts2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, flag, ez); + ksw_exts2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, junc_bonus, flag, junc, ez); else if (ksw_simd & SIMD_SSE2) - ksw_exts2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, flag, ez); + ksw_exts2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, junc_bonus, flag, junc, ez); else abort(); } #endif diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 308f491..f025838 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -17,14 +17,14 @@ #ifdef KSW_CPU_DISPATCH #ifdef __SSE4_1__ void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez) + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez) #else void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez) + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez) #endif #else void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez) + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez) #endif // ~KSW_CPU_DISPATCH { #define __dp_code_block1 \ @@ -120,6 +120,10 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (can_type && (target[t+3] == 0 || target[t+3] == 2)) can_type = 2; if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost; } + if (junc) + for (t = 0; t < tlen - 1; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&2))) + ((int8_t*)donor)[t] += junc_bonus; memset(acceptor, -noncan, tlen_ * 16); for (t = 2; t < tlen; ++t) { int can_type = 0; @@ -128,6 +132,10 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (can_type && (target[t-2] == 1 || target[t-2] == 3)) can_type = 2; if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost; } + if (junc) + for (t = 0; t < tlen; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&1))) + ((int8_t*)acceptor)[t] += junc_bonus; } for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { diff --git a/minimap.h b/minimap.h index 947efe1..358cad7 100644 --- a/minimap.h +++ b/minimap.h @@ -128,6 +128,7 @@ typedef struct { int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties int sc_ambi; // score when one or both bases are "N" int noncan; // cost of non-canonical splicing sites + int junc_bonus; int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal int end_bonus; int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold @@ -367,7 +368,7 @@ int mm_idx_name2id(const mm_idx_t *mi, const char *name); int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); int mm_idx_bed_read(mm_idx_t *mi, const char *fn); -int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, int32_t strand, int8_t *s); +int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s); // deprecated APIs for backward compatibility void mm_mapopt_init(mm_mapopt_t *opt); diff --git a/options.c b/options.c index eac73e3..ac1e5e1 100644 --- a/options.c +++ b/options.c @@ -126,6 +126,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; mo->noncan = 9; + mo->junc_bonus = 9; mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved } else return -1; return 0; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 1bbe749..ba1871d 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -29,6 +29,7 @@ cdef extern from "minimap.h": int a, b, q, e, q2, e2 int sc_ambi int noncan + int junc_bonus int zdrop, zdrop_inv int end_bonus int min_dp_max