miniprot-like splice model
The new model is slightly worse on Iso-Seq reads and slightly better on direct-RNA reads.
This commit is contained in:
parent
5aa4355ca8
commit
6c2cbf7903
8
align.c
8
align.c
|
|
@ -326,9 +326,11 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
|
|||
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
|
||||
ksw_reset_extz(ez);
|
||||
ez->zdropped = 1;
|
||||
} else if (opt->flag & MM_F_SPLICE)
|
||||
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez);
|
||||
else if (opt->q == opt->q2 && opt->e == opt->e2)
|
||||
} else if (opt->flag & MM_F_SPLICE) {
|
||||
int flag_tmp = flag;
|
||||
if (opt->flag & MM_F_SPLICE_CMPLX) flag_tmp |= KSW_EZ_SPLICE_CMPLX;
|
||||
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag_tmp, junc, ez);
|
||||
} else if (opt->q == opt->q2 && opt->e == opt->e2)
|
||||
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez);
|
||||
else
|
||||
ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez);
|
||||
|
|
|
|||
1
ksw2.h
1
ksw2.h
|
|
@ -15,6 +15,7 @@
|
|||
#define KSW_EZ_SPLICE_FOR 0x100
|
||||
#define KSW_EZ_SPLICE_REV 0x200
|
||||
#define KSW_EZ_SPLICE_FLANK 0x400
|
||||
#define KSW_EZ_SPLICE_CMPLX 0x800
|
||||
|
||||
// The subset of CIGAR operators used by ksw code.
|
||||
// Use MM_CIGAR_* from minimap.h if you need the full list.
|
||||
|
|
|
|||
|
|
@ -62,6 +62,8 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
b = _mm_sub_epi8(b, tmp); \
|
||||
a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_));
|
||||
|
||||
const int sp[4] = { 4, 7, 10, 15 };
|
||||
// const int sp[4] = { 3, 5, 7, 10 };
|
||||
int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff;
|
||||
int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
|
||||
int32_t *H = 0, H0 = 0, last_H0_t = 0;
|
||||
|
|
@ -71,6 +73,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
|
||||
ksw_reset_extz(ez);
|
||||
if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return;
|
||||
assert((flag & KSW_EZ_SPLICE_FOR) == 0 || (flag & KSW_EZ_SPLICE_REV) == 0); // can't be both set
|
||||
|
||||
zero_ = _mm_set1_epi8(0);
|
||||
q_ = _mm_set1_epi8(q);
|
||||
|
|
@ -117,7 +120,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
memcpy(sf, target, tlen);
|
||||
|
||||
// set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
|
||||
if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
|
||||
if ((flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) && !(flag & KSW_EZ_SPLICE_CMPLX)) {
|
||||
int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272
|
||||
memset(donor, -noncan, tlen_ * 16);
|
||||
memset(acceptor, -noncan, tlen_ * 16);
|
||||
|
|
@ -168,6 +171,92 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8)))
|
||||
((int8_t*)acceptor)[t] += junc_bonus;
|
||||
}
|
||||
} else if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
|
||||
memset(donor, -sp[3], tlen_ * 16);
|
||||
memset(acceptor, -sp[3], tlen_ * 16);
|
||||
if (!(flag & KSW_EZ_REV_CIGAR)) {
|
||||
for (t = 0; t < tlen - 4; ++t) {
|
||||
int x = 3, y = 3, z;
|
||||
if (flag & KSW_EZ_SPLICE_FOR) {
|
||||
if (target[t+1] == 2 && target[t+2] == 3) // GT.
|
||||
x = target[t+3] == 0 || target[t+3] == 2? -1 : 0;
|
||||
else if (target[t+1] == 2 && target[t+2] == 1) x = 1; // GC.
|
||||
else if (target[t+1] == 0 && target[t+2] == 3) x = 2; // AT.
|
||||
}
|
||||
if (flag & KSW_EZ_SPLICE_REV) {
|
||||
if (target[t+1] == 1 && target[t+2] == 3) // CT. (revcomp of .AG)
|
||||
y = target[t+3] == 0 || target[t+3] == 2? -1 : 0;
|
||||
else if (target[t+1] == 2 && target[t+2] == 3) y = 2; // GT. (revcomp of .AC)
|
||||
}
|
||||
z = x < y? x : y;
|
||||
((int8_t*)donor)[t] = z < 0? 0 : -sp[z];
|
||||
}
|
||||
if (junc)
|
||||
for (t = 0; t < tlen - 1; ++t)
|
||||
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&8)))
|
||||
((int8_t*)donor)[t] += junc_bonus;
|
||||
for (t = 2; t < tlen; ++t) {
|
||||
int x = 3, y = 3, z;
|
||||
if (flag & KSW_EZ_SPLICE_FOR) {
|
||||
if (target[t-1] == 0 && target[t] == 2) // .AG
|
||||
x = target[t-2] == 1 || target[t-2] == 3? -1 : 0;
|
||||
else if (target[t-1] == 0 && target[t] == 1) x = 2; // .AC
|
||||
}
|
||||
if (flag & KSW_EZ_SPLICE_REV) {
|
||||
if (target[t-1] == 0 && target[t] == 1) // .AC (revcomp of GT.)
|
||||
y = target[t-2] == 1 || target[t-2] == 3? -1 : 0;
|
||||
else if (target[t-1] == 1 && target[t] == 2) y = 1; // .CG
|
||||
else if (target[t-1] == 0 && target[t] == 3) y = 1; // .AT
|
||||
}
|
||||
z = x < y? x : y;
|
||||
((int8_t*)acceptor)[t] = z < 0? 0 : -sp[z];
|
||||
}
|
||||
if (junc)
|
||||
for (t = 0; t < tlen; ++t)
|
||||
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&4)))
|
||||
((int8_t*)acceptor)[t] += junc_bonus;
|
||||
} else {
|
||||
for (t = 0; t < tlen - 4; ++t) {
|
||||
int x = 3, y = 3, z;
|
||||
if (flag & KSW_EZ_SPLICE_FOR) {
|
||||
if (target[t+1] == 2 && target[t+2] == 0) // GA. (rev of .AG)
|
||||
x = target[t+3] == 1 || target[t+3] == 3? -1 : 0;
|
||||
else if (target[t+1] == 1 && target[t+2] == 0) x = 2; // CA. (rev of .AC)
|
||||
}
|
||||
if (flag & KSW_EZ_SPLICE_REV) {
|
||||
if (target[t+1] == 1 && target[t+2] == 0) // CA. (comp of GT.)
|
||||
y = target[t+3] == 1 || target[t+3] == 3? -1 : 0;
|
||||
else if (target[t+1] == 1 && target[t+2] == 2) y = 1; // CG. (comp of GC.)
|
||||
else if (target[t+1] == 3 && target[t+2] == 0) y = 2; // TA. (comp of AT.)
|
||||
}
|
||||
z = x < y? x : y;
|
||||
((int8_t*)donor)[t] = z < 0? 0 : -sp[z];
|
||||
}
|
||||
if (junc)
|
||||
for (t = 0; t < tlen - 1; ++t)
|
||||
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&4)))
|
||||
((int8_t*)donor)[t] += junc_bonus;
|
||||
for (t = 2; t < tlen; ++t) {
|
||||
int x = 3, y = 3, z;
|
||||
if (flag & KSW_EZ_SPLICE_FOR) {
|
||||
if (target[t-1] == 3 && target[t] == 2) // .TG (rev of GT.)
|
||||
x = target[t-2] == 0 || target[t-2] == 2? -1 : 0;
|
||||
else if (target[t-1] == 1 && target[t] == 2) x = 1; // .CG
|
||||
else if (target[t-1] == 3 && target[t] == 0) x = 2; // .TA
|
||||
}
|
||||
if (flag & KSW_EZ_SPLICE_REV) {
|
||||
if (target[t-1] == 3 && target[t] == 1) // .TC (comp of .AG)
|
||||
y = target[t-2] == 0 || target[t-2] == 2? -1 : 0;
|
||||
else if (target[t-1] == 3 && target[t] == 2) y = 2; // .TG
|
||||
}
|
||||
z = x < y? x : y;
|
||||
((int8_t*)acceptor)[t] = z < 0? 0 : -sp[z];
|
||||
}
|
||||
if (junc)
|
||||
for (t = 0; t < tlen; ++t)
|
||||
if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8)))
|
||||
((int8_t*)acceptor)[t] += junc_bonus;
|
||||
}
|
||||
}
|
||||
|
||||
for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
|
||||
|
|
|
|||
2
main.c
2
main.c
|
|
@ -35,6 +35,7 @@ static ko_longopt_t long_options[] = {
|
|||
{ "min-dp-len", ko_required_argument, 308 },
|
||||
{ "print-aln-seq", ko_no_argument, 309 },
|
||||
{ "splice", ko_no_argument, 310 },
|
||||
{ "splice-model", ko_no_argument, 311 },
|
||||
{ "cost-non-gt-ag", ko_required_argument, 'C' },
|
||||
{ "no-long-join", ko_no_argument, 312 },
|
||||
{ "sr", ko_no_argument, 313 },
|
||||
|
|
@ -205,6 +206,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 308) opt.min_ksw_len = atoi(o.arg); // --min-dp-len
|
||||
else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq
|
||||
else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice
|
||||
else if (c == 311) opt.flag |= MM_F_SPLICE_CMPLX; // --splice-model
|
||||
else if (c == 312) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
|
||||
else if (c == 313) opt.flag |= MM_F_SR; // --sr
|
||||
else if (c == 317) opt.end_bonus = atoi(o.arg); // --end-bonus
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@
|
|||
#define MM_F_QSTRAND (0x100000000LL)
|
||||
#define MM_F_NO_INV (0x200000000LL)
|
||||
#define MM_F_NO_HASH_NAME (0x400000000LL)
|
||||
#define MM_F_SPLICE_CMPLX (0x800000000LL)
|
||||
|
||||
#define MM_I_HPC 0x1
|
||||
#define MM_I_NO_SEQ 0x2
|
||||
|
|
|
|||
|
|
@ -588,7 +588,7 @@ Up to 20% sequence divergence.
|
|||
.B splice
|
||||
Long-read spliced alignment
|
||||
.RB ( -k15
|
||||
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -b0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
|
||||
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
|
||||
.BR --splice-flank=yes ).
|
||||
In the splice mode, 1) long deletions are taken as introns and represented as
|
||||
the
|
||||
|
|
|
|||
|
|
@ -3308,8 +3308,10 @@ function paf_gff2junc(args) {
|
|||
if (t[0][0] == '#') continue;
|
||||
if (t[2].toLowerCase() != feat.toLowerCase()) continue;
|
||||
//print(t.join("\t"));
|
||||
if ((m = /\bParent=([^;]+)/.exec(t[8])) == null)
|
||||
throw Error("Can't find Parent");
|
||||
if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) {
|
||||
warn("Can't find Parent");
|
||||
continue;
|
||||
}
|
||||
t[3] = parseInt(t[3]) - 1;
|
||||
t[4] = parseInt(t[4]);
|
||||
t.unshift(m[1]);
|
||||
|
|
|
|||
Loading…
Reference in New Issue