diff --git a/align.c b/align.c index ddbb0bd..f23114e 100644 --- a/align.c +++ b/align.c @@ -326,9 +326,11 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) { ksw_reset_extz(ez); ez->zdropped = 1; - } else if (opt->flag & MM_F_SPLICE) - ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez); - else if (opt->q == opt->q2 && opt->e == opt->e2) + } else if (opt->flag & MM_F_SPLICE) { + int flag_tmp = flag; + if (opt->flag & MM_F_SPLICE_CMPLX) flag_tmp |= KSW_EZ_SPLICE_CMPLX; + ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag_tmp, junc, ez); + } else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez); else ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez); diff --git a/ksw2.h b/ksw2.h index cbd1ddc..1f94c6f 100644 --- a/ksw2.h +++ b/ksw2.h @@ -15,6 +15,7 @@ #define KSW_EZ_SPLICE_FOR 0x100 #define KSW_EZ_SPLICE_REV 0x200 #define KSW_EZ_SPLICE_FLANK 0x400 +#define KSW_EZ_SPLICE_CMPLX 0x800 // The subset of CIGAR operators used by ksw code. // Use MM_CIGAR_* from minimap.h if you need the full list. 
diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 4157e38..c5f9e76 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -62,6 +62,8 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin b = _mm_sub_epi8(b, tmp); \ a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_)); + const int sp[4] = { 4, 7, 10, 15 }; +// const int sp[4] = { 3, 5, 7, 10 }; int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff; int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); int32_t *H = 0, H0 = 0, last_H0_t = 0; @@ -71,6 +73,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin ksw_reset_extz(ez); if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return; + assert((flag & KSW_EZ_SPLICE_FOR) == 0 || (flag & KSW_EZ_SPLICE_REV) == 0); // can't be both set zero_ = _mm_set1_epi8(0); q_ = _mm_set1_epi8(q); @@ -117,7 +120,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin memcpy(sf, target, tlen); // set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding! - if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) { + if ((flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) && !(flag & KSW_EZ_SPLICE_CMPLX)) { int semi_cost = flag&KSW_EZ_SPLICE_FLANK? 
-noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272 memset(donor, -noncan, tlen_ * 16); memset(acceptor, -noncan, tlen_ * 16); @@ -168,6 +171,92 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8))) ((int8_t*)acceptor)[t] += junc_bonus; } + } else if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) { + memset(donor, -sp[3], tlen_ * 16); + memset(acceptor, -sp[3], tlen_ * 16); + if (!(flag & KSW_EZ_REV_CIGAR)) { + for (t = 0; t < tlen - 4; ++t) { + int x = 3, y = 3, z; + if (flag & KSW_EZ_SPLICE_FOR) { + if (target[t+1] == 2 && target[t+2] == 3) // GT. + x = target[t+3] == 0 || target[t+3] == 2? -1 : 0; + else if (target[t+1] == 2 && target[t+2] == 1) x = 1; // GC. + else if (target[t+1] == 0 && target[t+2] == 3) x = 2; // AT. + } + if (flag & KSW_EZ_SPLICE_REV) { + if (target[t+1] == 1 && target[t+2] == 3) // CT. (revcomp of .AG) + y = target[t+3] == 0 || target[t+3] == 2? -1 : 0; + else if (target[t+1] == 2 && target[t+2] == 3) y = 2; // GT. (revcomp of .AC) + } + z = x < y? x : y; + ((int8_t*)donor)[t] = z < 0? 0 : -sp[z]; + } + if (junc) + for (t = 0; t < tlen - 1; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&8))) + ((int8_t*)donor)[t] += junc_bonus; + for (t = 2; t < tlen; ++t) { + int x = 3, y = 3, z; + if (flag & KSW_EZ_SPLICE_FOR) { + if (target[t-1] == 0 && target[t] == 2) // .AG + x = target[t-2] == 1 || target[t-2] == 3? -1 : 0; + else if (target[t-1] == 0 && target[t] == 1) x = 2; // .AC + } + if (flag & KSW_EZ_SPLICE_REV) { + if (target[t-1] == 0 && target[t] == 1) // .AC (revcomp of GT.) + y = target[t-2] == 1 || target[t-2] == 3? -1 : 0; + else if (target[t-1] == 2 && target[t] == 1) y = 1; // .GC (revcomp of GC.) + else if (target[t-1] == 0 && target[t] == 3) y = 2; // .AT (revcomp of AT.) + } + z = x < y? x : y; + ((int8_t*)acceptor)[t] = z < 0? 
0 : -sp[z]; + } + if (junc) + for (t = 0; t < tlen; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&4))) + ((int8_t*)acceptor)[t] += junc_bonus; + } else { + for (t = 0; t < tlen - 4; ++t) { + int x = 3, y = 3, z; + if (flag & KSW_EZ_SPLICE_FOR) { + if (target[t+1] == 2 && target[t+2] == 0) // GA. (rev of .AG) + x = target[t+3] == 1 || target[t+3] == 3? -1 : 0; + else if (target[t+1] == 1 && target[t+2] == 0) x = 2; // CA. (rev of .AC) + } + if (flag & KSW_EZ_SPLICE_REV) { + if (target[t+1] == 1 && target[t+2] == 0) // CA. (comp of GT.) + y = target[t+3] == 1 || target[t+3] == 3? -1 : 0; + else if (target[t+1] == 1 && target[t+2] == 2) y = 1; // CG. (comp of GC.) + else if (target[t+1] == 3 && target[t+2] == 0) y = 2; // TA. (comp of AT.) + } + z = x < y? x : y; + ((int8_t*)donor)[t] = z < 0? 0 : -sp[z]; + } + if (junc) + for (t = 0; t < tlen - 1; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&4))) + ((int8_t*)donor)[t] += junc_bonus; + for (t = 2; t < tlen; ++t) { + int x = 3, y = 3, z; + if (flag & KSW_EZ_SPLICE_FOR) { + if (target[t-1] == 3 && target[t] == 2) // .TG (rev of GT.) + x = target[t-2] == 0 || target[t-2] == 2? -1 : 0; + else if (target[t-1] == 1 && target[t] == 2) x = 1; // .CG + else if (target[t-1] == 3 && target[t] == 0) x = 2; // .TA + } + if (flag & KSW_EZ_SPLICE_REV) { + if (target[t-1] == 3 && target[t] == 1) // .TC (comp of .AG) + y = target[t-2] == 0 || target[t-2] == 2? -1 : 0; + else if (target[t-1] == 3 && target[t] == 2) y = 2; // .TG + } + z = x < y? x : y; + ((int8_t*)acceptor)[t] = z < 0? 
0 : -sp[z]; + } + if (junc) + for (t = 0; t < tlen; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8))) + ((int8_t*)acceptor)[t] += junc_bonus; + } } for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { diff --git a/main.c b/main.c index 0be9933..87be537 100644 --- a/main.c +++ b/main.c @@ -35,6 +35,7 @@ static ko_longopt_t long_options[] = { { "min-dp-len", ko_required_argument, 308 }, { "print-aln-seq", ko_no_argument, 309 }, { "splice", ko_no_argument, 310 }, + { "splice-model", ko_no_argument, 311 }, { "cost-non-gt-ag", ko_required_argument, 'C' }, { "no-long-join", ko_no_argument, 312 }, { "sr", ko_no_argument, 313 }, @@ -205,6 +206,7 @@ int main(int argc, char *argv[]) else if (c == 308) opt.min_ksw_len = atoi(o.arg); // --min-dp-len else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice + else if (c == 311) opt.flag |= MM_F_SPLICE_CMPLX; // --splice-model else if (c == 312) opt.flag |= MM_F_NO_LJOIN; // --no-long-join else if (c == 313) opt.flag |= MM_F_SR; // --sr else if (c == 317) opt.end_bonus = atoi(o.arg); // --end-bonus diff --git a/minimap.h b/minimap.h index 13e12e0..9b24ed4 100644 --- a/minimap.h +++ b/minimap.h @@ -40,6 +40,7 @@ #define MM_F_QSTRAND (0x100000000LL) #define MM_F_NO_INV (0x200000000LL) #define MM_F_NO_HASH_NAME (0x400000000LL) +#define MM_F_SPLICE_CMPLX (0x800000000LL) #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 diff --git a/minimap2.1 b/minimap2.1 index 4d40c59..8a759db 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -588,7 +588,7 @@ Up to 20% sequence divergence. .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -b0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 +.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 .BR --splice-flank=yes ). 
In the splice mode, 1) long deletions are taken as introns and represented as the diff --git a/misc/paftools.js b/misc/paftools.js index c599d41..b2d4b28 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -3308,8 +3308,10 @@ function paf_gff2junc(args) { if (t[0][0] == '#') continue; if (t[2].toLowerCase() != feat.toLowerCase()) continue; //print(t.join("\t")); - if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) - throw Error("Can't find Parent"); + if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) { + warn("Can't find Parent"); + continue; + } t[3] = parseInt(t[3]) - 1; t[4] = parseInt(t[4]); t.unshift(m[1]);