miniprot-like splice model

slightly worse on iso-seq and slightly better on direct-RNA
This commit is contained in:
Heng Li 2022-10-06 09:10:17 -04:00
parent 5aa4355ca8
commit 6c2cbf7903
7 changed files with 104 additions and 7 deletions

View File

@ -326,9 +326,11 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) { if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
ksw_reset_extz(ez); ksw_reset_extz(ez);
ez->zdropped = 1; ez->zdropped = 1;
} else if (opt->flag & MM_F_SPLICE) } else if (opt->flag & MM_F_SPLICE) {
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez); int flag_tmp = flag;
else if (opt->q == opt->q2 && opt->e == opt->e2) if (opt->flag & MM_F_SPLICE_CMPLX) flag_tmp |= KSW_EZ_SPLICE_CMPLX;
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag_tmp, junc, ez);
} else if (opt->q == opt->q2 && opt->e == opt->e2)
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez); ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez);
else else
ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez); ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez);

1
ksw2.h
View File

@ -15,6 +15,7 @@
#define KSW_EZ_SPLICE_FOR 0x100 #define KSW_EZ_SPLICE_FOR 0x100
#define KSW_EZ_SPLICE_REV 0x200 #define KSW_EZ_SPLICE_REV 0x200
#define KSW_EZ_SPLICE_FLANK 0x400 #define KSW_EZ_SPLICE_FLANK 0x400
#define KSW_EZ_SPLICE_CMPLX 0x800
// The subset of CIGAR operators used by ksw code. // The subset of CIGAR operators used by ksw code.
// Use MM_CIGAR_* from minimap.h if you need the full list. // Use MM_CIGAR_* from minimap.h if you need the full list.

View File

@ -62,6 +62,8 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
b = _mm_sub_epi8(b, tmp); \ b = _mm_sub_epi8(b, tmp); \
a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_)); a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_));
const int sp[4] = { 4, 7, 10, 15 };
// const int sp[4] = { 3, 5, 7, 10 };
int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff; int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff;
int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
int32_t *H = 0, H0 = 0, last_H0_t = 0; int32_t *H = 0, H0 = 0, last_H0_t = 0;
@ -71,6 +73,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
ksw_reset_extz(ez); ksw_reset_extz(ez);
if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return; if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return;
assert((flag & KSW_EZ_SPLICE_FOR) == 0 || (flag & KSW_EZ_SPLICE_REV) == 0); // can't be both set
zero_ = _mm_set1_epi8(0); zero_ = _mm_set1_epi8(0);
q_ = _mm_set1_epi8(q); q_ = _mm_set1_epi8(q);
@ -117,7 +120,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
memcpy(sf, target, tlen); memcpy(sf, target, tlen);
// set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding! // set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) { if ((flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) && !(flag & KSW_EZ_SPLICE_CMPLX)) {
int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272 int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272
memset(donor, -noncan, tlen_ * 16); memset(donor, -noncan, tlen_ * 16);
memset(acceptor, -noncan, tlen_ * 16); memset(acceptor, -noncan, tlen_ * 16);
@ -168,6 +171,92 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8))) if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8)))
((int8_t*)acceptor)[t] += junc_bonus; ((int8_t*)acceptor)[t] += junc_bonus;
} }
} else if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
memset(donor, -sp[3], tlen_ * 16);
memset(acceptor, -sp[3], tlen_ * 16);
if (!(flag & KSW_EZ_REV_CIGAR)) {
for (t = 0; t < tlen - 4; ++t) {
int x = 3, y = 3, z;
if (flag & KSW_EZ_SPLICE_FOR) {
if (target[t+1] == 2 && target[t+2] == 3) // GT.
x = target[t+3] == 0 || target[t+3] == 2? -1 : 0;
else if (target[t+1] == 2 && target[t+2] == 1) x = 1; // GC.
else if (target[t+1] == 0 && target[t+2] == 3) x = 2; // AT.
}
if (flag & KSW_EZ_SPLICE_REV) {
if (target[t+1] == 1 && target[t+2] == 3) // CT. (revcomp of .AG)
y = target[t+3] == 0 || target[t+3] == 2? -1 : 0;
else if (target[t+1] == 2 && target[t+2] == 3) y = 2; // GT. (revcomp of .AC)
}
z = x < y? x : y;
((int8_t*)donor)[t] = z < 0? 0 : -sp[z];
}
if (junc)
for (t = 0; t < tlen - 1; ++t)
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&8)))
((int8_t*)donor)[t] += junc_bonus;
for (t = 2; t < tlen; ++t) {
int x = 3, y = 3, z;
if (flag & KSW_EZ_SPLICE_FOR) {
if (target[t-1] == 0 && target[t] == 2) // .AG
x = target[t-2] == 1 || target[t-2] == 3? -1 : 0;
else if (target[t-1] == 0 && target[t] == 1) x = 2; // .AC
}
if (flag & KSW_EZ_SPLICE_REV) {
if (target[t-1] == 0 && target[t] == 1) // .AC (revcomp of GT.)
y = target[t-2] == 1 || target[t-2] == 3? -1 : 0;
else if (target[t-1] == 1 && target[t] == 2) y = 1; // .CG
else if (target[t-1] == 0 && target[t] == 3) y = 1; // .AT
}
z = x < y? x : y;
((int8_t*)acceptor)[t] = z < 0? 0 : -sp[z];
}
if (junc)
for (t = 0; t < tlen; ++t)
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&4)))
((int8_t*)acceptor)[t] += junc_bonus;
} else {
for (t = 0; t < tlen - 4; ++t) {
int x = 3, y = 3, z;
if (flag & KSW_EZ_SPLICE_FOR) {
if (target[t+1] == 2 && target[t+2] == 0) // GA. (rev of .AG)
x = target[t+3] == 1 || target[t+3] == 3? -1 : 0;
else if (target[t+1] == 1 && target[t+2] == 0) x = 2; // CA. (rev of .AC)
}
if (flag & KSW_EZ_SPLICE_REV) {
if (target[t+1] == 1 && target[t+2] == 0) // CA. (comp of GT.)
y = target[t+3] == 1 || target[t+3] == 3? -1 : 0;
else if (target[t+1] == 1 && target[t+2] == 2) y = 1; // CG. (comp of GC.)
else if (target[t+1] == 3 && target[t+2] == 0) y = 2; // TA. (comp of AT.)
}
z = x < y? x : y;
((int8_t*)donor)[t] = z < 0? 0 : -sp[z];
}
if (junc)
for (t = 0; t < tlen - 1; ++t)
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&4)))
((int8_t*)donor)[t] += junc_bonus;
for (t = 2; t < tlen; ++t) {
int x = 3, y = 3, z;
if (flag & KSW_EZ_SPLICE_FOR) {
if (target[t-1] == 3 && target[t] == 2) // .TG (rev of GT.)
x = target[t-2] == 0 || target[t-2] == 2? -1 : 0;
else if (target[t-1] == 1 && target[t] == 2) x = 1; // .CG
else if (target[t-1] == 3 && target[t] == 0) x = 2; // .TA
}
if (flag & KSW_EZ_SPLICE_REV) {
if (target[t-1] == 3 && target[t] == 1) // .TC (comp of .AG)
y = target[t-2] == 0 || target[t-2] == 2? -1 : 0;
else if (target[t-1] == 3 && target[t] == 2) y = 2; // .TG
}
z = x < y? x : y;
((int8_t*)acceptor)[t] = z < 0? 0 : -sp[z];
}
if (junc)
for (t = 0; t < tlen; ++t)
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8)))
((int8_t*)acceptor)[t] += junc_bonus;
}
} }
for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {

2
main.c
View File

@ -35,6 +35,7 @@ static ko_longopt_t long_options[] = {
{ "min-dp-len", ko_required_argument, 308 }, { "min-dp-len", ko_required_argument, 308 },
{ "print-aln-seq", ko_no_argument, 309 }, { "print-aln-seq", ko_no_argument, 309 },
{ "splice", ko_no_argument, 310 }, { "splice", ko_no_argument, 310 },
{ "splice-model", ko_no_argument, 311 },
{ "cost-non-gt-ag", ko_required_argument, 'C' }, { "cost-non-gt-ag", ko_required_argument, 'C' },
{ "no-long-join", ko_no_argument, 312 }, { "no-long-join", ko_no_argument, 312 },
{ "sr", ko_no_argument, 313 }, { "sr", ko_no_argument, 313 },
@ -205,6 +206,7 @@ int main(int argc, char *argv[])
else if (c == 308) opt.min_ksw_len = atoi(o.arg); // --min-dp-len else if (c == 308) opt.min_ksw_len = atoi(o.arg); // --min-dp-len
else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq
else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice
else if (c == 311) opt.flag |= MM_F_SPLICE_CMPLX; // --splice-model
else if (c == 312) opt.flag |= MM_F_NO_LJOIN; // --no-long-join else if (c == 312) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
else if (c == 313) opt.flag |= MM_F_SR; // --sr else if (c == 313) opt.flag |= MM_F_SR; // --sr
else if (c == 317) opt.end_bonus = atoi(o.arg); // --end-bonus else if (c == 317) opt.end_bonus = atoi(o.arg); // --end-bonus

View File

@ -40,6 +40,7 @@
#define MM_F_QSTRAND (0x100000000LL) #define MM_F_QSTRAND (0x100000000LL)
#define MM_F_NO_INV (0x200000000LL) #define MM_F_NO_INV (0x200000000LL)
#define MM_F_NO_HASH_NAME (0x400000000LL) #define MM_F_NO_HASH_NAME (0x400000000LL)
#define MM_F_SPLICE_CMPLX (0x800000000LL)
#define MM_I_HPC 0x1 #define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2 #define MM_I_NO_SEQ 0x2

View File

@ -588,7 +588,7 @@ Up to 20% sequence divergence.
.B splice .B splice
Long-read spliced alignment Long-read spliced alignment
.RB ( -k15 .RB ( -k15
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -b0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 .B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
.BR --splice-flank=yes ). .BR --splice-flank=yes ).
In the splice mode, 1) long deletions are taken as introns and represented as In the splice mode, 1) long deletions are taken as introns and represented as
the the

View File

@ -3308,8 +3308,10 @@ function paf_gff2junc(args) {
if (t[0][0] == '#') continue; if (t[0][0] == '#') continue;
if (t[2].toLowerCase() != feat.toLowerCase()) continue; if (t[2].toLowerCase() != feat.toLowerCase()) continue;
//print(t.join("\t")); //print(t.join("\t"));
if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) {
throw Error("Can't find Parent"); warn("Can't find Parent");
continue;
}
t[3] = parseInt(t[3]) - 1; t[3] = parseInt(t[3]) - 1;
t[4] = parseInt(t[4]); t[4] = parseInt(t[4]);
t.unshift(m[1]); t.unshift(m[1]);