r301: separate introns from deletions
When an intron is adjacent to a deletion, the old code count both as introns, which lead to an inaccurate exon boundary.
This commit is contained in:
parent
64c1389e1a
commit
993a2bb521
41
align.c
41
align.c
|
|
@ -53,7 +53,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
|
|||
} else if (op == 1) {
|
||||
score -= q + e * len, j += len;
|
||||
if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
|
||||
} else if (op == 2) {
|
||||
} else if (op == 2 || op == 3) {
|
||||
score -= q + e * len, i += len;
|
||||
if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
|
||||
}
|
||||
|
|
@ -64,12 +64,11 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
|
|||
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron)
|
||||
{
|
||||
uint32_t k, l, toff = 0, qoff = 0;
|
||||
int32_t s = 0, max = 0, min_intron_len, n_gtag = 0, n_ctac = 0;
|
||||
min_intron_len = mm_min_intron_len(q, e, q_intron);
|
||||
int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0;
|
||||
if (p == 0) return;
|
||||
for (k = 0; k < p->n_cigar; ++k) {
|
||||
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
|
||||
if (op == 0) {
|
||||
if (op == 0) { // match/mismatch
|
||||
for (l = 0; l < len; ++l) {
|
||||
int cq = qseq[qoff + l], ct = tseq[toff + l];
|
||||
if (ct > 3 || cq > 3) ++p->n_ambi;
|
||||
|
|
@ -79,7 +78,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
|
|||
else max = max > s? max : s;
|
||||
}
|
||||
toff += len, qoff += len, p->blen += len;
|
||||
} else if (op == 1) {
|
||||
} else if (op == 1) { // insertion
|
||||
int n_ambi = 0;
|
||||
for (l = 0; l < len; ++l)
|
||||
if (qseq[qoff + l] > 3) ++n_ambi;
|
||||
|
|
@ -87,23 +86,21 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
|
|||
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
|
||||
s -= q + e * len;
|
||||
if (s < 0) s = 0;
|
||||
} else if (op == 2) {
|
||||
} else if (op == 2) { // deletion
|
||||
int n_ambi = 0;
|
||||
if (len < min_intron_len) {
|
||||
for (l = 0; l < len; ++l)
|
||||
if (tseq[toff + l] > 3) ++n_ambi;
|
||||
toff += len, p->blen += len;
|
||||
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
|
||||
s -= q + e * len;
|
||||
if (s < 0) s = 0;
|
||||
} else { // intron
|
||||
uint8_t b[4];
|
||||
b[0] = tseq[toff], b[1] = tseq[toff+1];
|
||||
b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1];
|
||||
if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag;
|
||||
else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac;
|
||||
toff += len, p->blen += len;
|
||||
}
|
||||
for (l = 0; l < len; ++l)
|
||||
if (tseq[toff + l] > 3) ++n_ambi;
|
||||
toff += len, p->blen += len;
|
||||
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
|
||||
s -= q + e * len;
|
||||
if (s < 0) s = 0;
|
||||
} else if (op == 3) { // intron
|
||||
uint8_t b[4];
|
||||
b[0] = tseq[toff], b[1] = tseq[toff+1];
|
||||
b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1];
|
||||
if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag;
|
||||
else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac;
|
||||
toff += len, p->blen += len;
|
||||
}
|
||||
}
|
||||
p->dp_max = max;
|
||||
|
|
@ -337,7 +334,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
mm_idx_getseq(mi, rid, rs, re, tseq);
|
||||
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez);
|
||||
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
|
||||
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez);
|
||||
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez);
|
||||
if (ez->n_cigar > 0)
|
||||
mm_append_cigar(r, ez->n_cigar, ez->cigar);
|
||||
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
|
||||
|
|
|
|||
20
format.c
20
format.c
|
|
@ -182,7 +182,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
|||
}
|
||||
}
|
||||
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres)
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag)
|
||||
{
|
||||
s->l = 0;
|
||||
mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
|
||||
|
|
@ -196,12 +196,8 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
if (r->p && (opt_flag & MM_F_OUT_CG)) {
|
||||
uint32_t k;
|
||||
mm_sprintf_lite(s, "\tcg:Z:");
|
||||
for (k = 0; k < r->p->n_cigar; ++k) {
|
||||
int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
|
||||
if (intron_thres > 0 && op == 2 && len >= intron_thres)
|
||||
mm_sprintf_lite(s, "%dN", len);
|
||||
else mm_sprintf_lite(s, "%d%c", len, "MID"[op]);
|
||||
}
|
||||
for (k = 0; k < r->p->n_cigar; ++k)
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
|
||||
}
|
||||
if (r->p && (opt_flag & MM_F_OUT_CS))
|
||||
write_cs(km, s, mi, t, r);
|
||||
|
|
@ -239,7 +235,7 @@ static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
|
|||
} else str_copy(s, seq, seq + l);
|
||||
}
|
||||
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres)
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs)
|
||||
{
|
||||
int flag = 0;
|
||||
s->l = 0;
|
||||
|
|
@ -258,12 +254,8 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
uint32_t k, clip_len = r->rev? t->l_seq - r->qe : r->qs;
|
||||
int clip_char = (flag&0x800)? 'H' : 'S';
|
||||
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
|
||||
for (k = 0; k < r->p->n_cigar; ++k) {
|
||||
int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
|
||||
if (intron_thres > 0 && op == 2 && len >= intron_thres)
|
||||
mm_sprintf_lite(s, "%dN", len);
|
||||
else mm_sprintf_lite(s, "%d%c", len, "MID"[op]);
|
||||
}
|
||||
for (k = 0; k < r->p->n_cigar; ++k)
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
|
||||
clip_len = r->rev? r->qs : t->l_seq - r->qe;
|
||||
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
|
||||
} else mm_sprintf_lite(s, "*");
|
||||
|
|
|
|||
6
ksw2.h
6
ksw2.h
|
|
@ -111,7 +111,8 @@ static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uin
|
|||
// bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
|
||||
// bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
|
||||
// bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
|
||||
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
|
||||
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int with_N, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0,
|
||||
int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
|
||||
{ // p[] - lower 3 bits: which type gets the max; bit
|
||||
int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
|
||||
uint32_t *cigar = *cigar_, tmp;
|
||||
|
|
@ -132,7 +133,8 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t
|
|||
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
|
||||
if (force_state >= 0) state = force_state;
|
||||
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
|
||||
else if (state == 1 || state == 3) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
|
||||
else if (state == 1 || (state == 3 && !with_N)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
|
||||
else if (state == 3 && with_N) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
|
||||
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
|
||||
}
|
||||
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion
|
||||
|
|
|
|||
|
|
@ -365,9 +365,9 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (with_cigar) { // backtrack
|
||||
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
|
||||
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
else if (ez->max_t >= 0 && ez->max_q >= 0)
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
kfree(km, mem2); kfree(km, off);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -348,9 +348,9 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (with_cigar) { // backtrack
|
||||
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
|
||||
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
else if (ez->max_t >= 0 && ez->max_q >= 0)
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
kfree(km, mem2); kfree(km, off);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -278,9 +278,9 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (with_cigar) { // backtrack
|
||||
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
|
||||
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
else if (ez->max_t >= 0 && ez->max_q >= 0)
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
kfree(km, mem2); kfree(km, off);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
2
main.c
2
main.c
|
|
@ -8,7 +8,7 @@
|
|||
#include "minimap.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
#define MM_VERSION "2.0-r297-dirty"
|
||||
#define MM_VERSION "2.0-r301-dirty"
|
||||
|
||||
void liftrlimit()
|
||||
{
|
||||
|
|
|
|||
9
map.c
9
map.c
|
|
@ -346,24 +346,21 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
void *km = 0;
|
||||
step_t *s = (step_t*)in;
|
||||
const mm_idx_t *mi = p->mi;
|
||||
int intron_thres = -1;
|
||||
for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]);
|
||||
free(s->buf);
|
||||
if (p->opt->flag & MM_F_SPLICE)
|
||||
intron_thres = mm_min_intron_len(p->opt->q, p->opt->e, p->opt->q2);
|
||||
if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init();
|
||||
for (i = 0; i < s->n_seq; ++i) {
|
||||
mm_bseq1_t *t = &s->seq[i];
|
||||
for (j = 0; j < s->n_reg[i]; ++j) {
|
||||
mm_reg1_t *r = &s->reg[i][j];
|
||||
if (p->opt->flag & MM_F_OUT_SAM)
|
||||
mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i], intron_thres);
|
||||
mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
|
||||
else
|
||||
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag, intron_thres);
|
||||
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
|
||||
puts(p->str.s);
|
||||
}
|
||||
if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) {
|
||||
mm_write_sam(&p->str, 0, t, 0, 0, 0, 0);
|
||||
mm_write_sam(&p->str, 0, t, 0, 0, 0);
|
||||
puts(p->str.s);
|
||||
}
|
||||
for (j = 0; j < s->n_reg[i]; ++j) free(s->reg[i][j].p);
|
||||
|
|
|
|||
10
mmpriv.h
10
mmpriv.h
|
|
@ -42,8 +42,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
|||
|
||||
void mm_set_rg(const char *s);
|
||||
void sam_write_sam_header(const mm_idx_t *idx);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres);
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
|
||||
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
|
||||
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
||||
|
||||
|
|
@ -62,10 +62,4 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc);
|
|||
}
|
||||
#endif
|
||||
|
||||
static inline int32_t mm_min_intron_len(int32_t q, int32_t e, int32_t q_intron)
|
||||
{
|
||||
int32_t x = q_intron > q? (int)((float)(q_intron - q) / e + .999) : INT32_MAX;
|
||||
return x > 4? x : 4;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in New Issue