miniprot-like splice model

slightly worse on iso-seq and slightly better on direct-RNA
This commit is contained in:
Heng Li 2022-10-06 09:10:17 -04:00
parent 5aa4355ca8
commit 6c2cbf7903
7 changed files with 104 additions and 7 deletions

View File

@ -326,9 +326,11 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
ksw_reset_extz(ez);
ez->zdropped = 1;
} else if (opt->flag & MM_F_SPLICE)
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez);
else if (opt->q == opt->q2 && opt->e == opt->e2)
} else if (opt->flag & MM_F_SPLICE) {
int flag_tmp = flag;
if (opt->flag & MM_F_SPLICE_CMPLX) flag_tmp |= KSW_EZ_SPLICE_CMPLX;
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag_tmp, junc, ez);
} else if (opt->q == opt->q2 && opt->e == opt->e2)
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez);
else
ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez);

1
ksw2.h
View File

@ -15,6 +15,7 @@
#define KSW_EZ_SPLICE_FOR 0x100
#define KSW_EZ_SPLICE_REV 0x200
#define KSW_EZ_SPLICE_FLANK 0x400
#define KSW_EZ_SPLICE_CMPLX 0x800
// The subset of CIGAR operators used by ksw code.
// Use MM_CIGAR_* from minimap.h if you need the full list.

View File

@ -62,6 +62,8 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
b = _mm_sub_epi8(b, tmp); \
a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_));
const int sp[4] = { 4, 7, 10, 15 };
// const int sp[4] = { 3, 5, 7, 10 };
int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff;
int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
int32_t *H = 0, H0 = 0, last_H0_t = 0;
@ -71,6 +73,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
ksw_reset_extz(ez);
if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return;
assert((flag & KSW_EZ_SPLICE_FOR) == 0 || (flag & KSW_EZ_SPLICE_REV) == 0); // can't be both set
zero_ = _mm_set1_epi8(0);
q_ = _mm_set1_epi8(q);
@ -117,7 +120,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
memcpy(sf, target, tlen);
// set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
if ((flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) && !(flag & KSW_EZ_SPLICE_CMPLX)) {
int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272
memset(donor, -noncan, tlen_ * 16);
memset(acceptor, -noncan, tlen_ * 16);
@ -168,6 +171,92 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8)))
((int8_t*)acceptor)[t] += junc_bonus;
}
} else if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
memset(donor, -sp[3], tlen_ * 16);
memset(acceptor, -sp[3], tlen_ * 16);
if (!(flag & KSW_EZ_REV_CIGAR)) {
for (t = 0; t < tlen - 4; ++t) {
int x = 3, y = 3, z;
if (flag & KSW_EZ_SPLICE_FOR) {
if (target[t+1] == 2 && target[t+2] == 3) // GT.
x = target[t+3] == 0 || target[t+3] == 2? -1 : 0;
else if (target[t+1] == 2 && target[t+2] == 1) x = 1; // GC.
else if (target[t+1] == 0 && target[t+2] == 3) x = 2; // AT.
}
if (flag & KSW_EZ_SPLICE_REV) {
if (target[t+1] == 1 && target[t+2] == 3) // CT. (revcomp of .AG)
y = target[t+3] == 0 || target[t+3] == 2? -1 : 0;
else if (target[t+1] == 2 && target[t+2] == 3) y = 2; // GT. (revcomp of .AC)
}
z = x < y? x : y;
((int8_t*)donor)[t] = z < 0? 0 : -sp[z];
}
if (junc)
for (t = 0; t < tlen - 1; ++t)
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&8)))
((int8_t*)donor)[t] += junc_bonus;
for (t = 2; t < tlen; ++t) {
int x = 3, y = 3, z;
if (flag & KSW_EZ_SPLICE_FOR) {
if (target[t-1] == 0 && target[t] == 2) // .AG
x = target[t-2] == 1 || target[t-2] == 3? -1 : 0;
else if (target[t-1] == 0 && target[t] == 1) x = 2; // .AC
}
if (flag & KSW_EZ_SPLICE_REV) {
if (target[t-1] == 0 && target[t] == 1) // .AC (revcomp of GT.)
y = target[t-2] == 1 || target[t-2] == 3? -1 : 0;
else if (target[t-1] == 1 && target[t] == 2) y = 1; // .CG
else if (target[t-1] == 0 && target[t] == 3) y = 1; // .AT
}
z = x < y? x : y;
((int8_t*)acceptor)[t] = z < 0? 0 : -sp[z];
}
if (junc)
for (t = 0; t < tlen; ++t)
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&4)))
((int8_t*)acceptor)[t] += junc_bonus;
} else {
for (t = 0; t < tlen - 4; ++t) {
int x = 3, y = 3, z;
if (flag & KSW_EZ_SPLICE_FOR) {
if (target[t+1] == 2 && target[t+2] == 0) // GA. (rev of .AG)
x = target[t+3] == 1 || target[t+3] == 3? -1 : 0;
else if (target[t+1] == 1 && target[t+2] == 0) x = 2; // CA. (rev of .AC)
}
if (flag & KSW_EZ_SPLICE_REV) {
if (target[t+1] == 1 && target[t+2] == 0) // CA. (comp of GT.)
y = target[t+3] == 1 || target[t+3] == 3? -1 : 0;
else if (target[t+1] == 1 && target[t+2] == 2) y = 1; // CG. (comp of GC.)
else if (target[t+1] == 3 && target[t+2] == 0) y = 2; // TA. (comp of AT.)
}
z = x < y? x : y;
((int8_t*)donor)[t] = z < 0? 0 : -sp[z];
}
if (junc)
for (t = 0; t < tlen - 1; ++t)
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&4)))
((int8_t*)donor)[t] += junc_bonus;
for (t = 2; t < tlen; ++t) {
int x = 3, y = 3, z;
if (flag & KSW_EZ_SPLICE_FOR) {
if (target[t-1] == 3 && target[t] == 2) // .TG (rev of GT.)
x = target[t-2] == 0 || target[t-2] == 2? -1 : 0;
else if (target[t-1] == 1 && target[t] == 2) x = 1; // .CG
else if (target[t-1] == 3 && target[t] == 0) x = 2; // .TA
}
if (flag & KSW_EZ_SPLICE_REV) {
if (target[t-1] == 3 && target[t] == 1) // .TC (comp of .AG)
y = target[t-2] == 0 || target[t-2] == 2? -1 : 0;
else if (target[t-1] == 3 && target[t] == 2) y = 2; // .TG
}
z = x < y? x : y;
((int8_t*)acceptor)[t] = z < 0? 0 : -sp[z];
}
if (junc)
for (t = 0; t < tlen; ++t)
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8)))
((int8_t*)acceptor)[t] += junc_bonus;
}
}
for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {

2
main.c
View File

@ -35,6 +35,7 @@ static ko_longopt_t long_options[] = {
{ "min-dp-len", ko_required_argument, 308 },
{ "print-aln-seq", ko_no_argument, 309 },
{ "splice", ko_no_argument, 310 },
{ "splice-model", ko_no_argument, 311 },
{ "cost-non-gt-ag", ko_required_argument, 'C' },
{ "no-long-join", ko_no_argument, 312 },
{ "sr", ko_no_argument, 313 },
@ -205,6 +206,7 @@ int main(int argc, char *argv[])
else if (c == 308) opt.min_ksw_len = atoi(o.arg); // --min-dp-len
else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq
else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice
else if (c == 311) opt.flag |= MM_F_SPLICE_CMPLX; // --splice-model
else if (c == 312) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
else if (c == 313) opt.flag |= MM_F_SR; // --sr
else if (c == 317) opt.end_bonus = atoi(o.arg); // --end-bonus

View File

@ -40,6 +40,7 @@
#define MM_F_QSTRAND (0x100000000LL)
#define MM_F_NO_INV (0x200000000LL)
#define MM_F_NO_HASH_NAME (0x400000000LL)
#define MM_F_SPLICE_CMPLX (0x800000000LL)
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2

View File

@ -588,7 +588,7 @@ Up to 20% sequence divergence.
.B splice
Long-read spliced alignment
.RB ( -k15
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -b0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
.BR --splice-flank=yes ).
In the splice mode, 1) long deletions are taken as introns and represented as
the

View File

@ -3308,8 +3308,10 @@ function paf_gff2junc(args) {
if (t[0][0] == '#') continue;
if (t[2].toLowerCase() != feat.toLowerCase()) continue;
//print(t.join("\t"));
if ((m = /\bParent=([^;]+)/.exec(t[8])) == null)
throw Error("Can't find Parent");
if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) {
warn("Can't find Parent");
continue;
}
t[3] = parseInt(t[3]) - 1;
t[4] = parseInt(t[4]);
t.unshift(m[1]);