r301: separate introns from deletions

When an intron is adjacent to a deletion, the old code count both as introns,
which lead to an inaccurate exon boundary.
This commit is contained in:
Heng Li 2017-08-18 15:31:15 +08:00
parent 64c1389e1a
commit 993a2bb521
9 changed files with 41 additions and 59 deletions

41
align.c
View File

@ -53,7 +53,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
} else if (op == 1) {
score -= q + e * len, j += len;
if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
} else if (op == 2) {
} else if (op == 2 || op == 3) {
score -= q + e * len, i += len;
if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
}
@ -64,12 +64,11 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron)
{
uint32_t k, l, toff = 0, qoff = 0;
int32_t s = 0, max = 0, min_intron_len, n_gtag = 0, n_ctac = 0;
min_intron_len = mm_min_intron_len(q, e, q_intron);
int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0;
if (p == 0) return;
for (k = 0; k < p->n_cigar; ++k) {
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
if (op == 0) {
if (op == 0) { // match/mismatch
for (l = 0; l < len; ++l) {
int cq = qseq[qoff + l], ct = tseq[toff + l];
if (ct > 3 || cq > 3) ++p->n_ambi;
@ -79,7 +78,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
else max = max > s? max : s;
}
toff += len, qoff += len, p->blen += len;
} else if (op == 1) {
} else if (op == 1) { // insertion
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (qseq[qoff + l] > 3) ++n_ambi;
@ -87,23 +86,21 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
s -= q + e * len;
if (s < 0) s = 0;
} else if (op == 2) {
} else if (op == 2) { // deletion
int n_ambi = 0;
if (len < min_intron_len) {
for (l = 0; l < len; ++l)
if (tseq[toff + l] > 3) ++n_ambi;
toff += len, p->blen += len;
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
s -= q + e * len;
if (s < 0) s = 0;
} else { // intron
uint8_t b[4];
b[0] = tseq[toff], b[1] = tseq[toff+1];
b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1];
if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag;
else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac;
toff += len, p->blen += len;
}
for (l = 0; l < len; ++l)
if (tseq[toff + l] > 3) ++n_ambi;
toff += len, p->blen += len;
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
s -= q + e * len;
if (s < 0) s = 0;
} else if (op == 3) { // intron
uint8_t b[4];
b[0] = tseq[toff], b[1] = tseq[toff+1];
b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1];
if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag;
else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac;
toff += len, p->blen += len;
}
}
p->dp_max = max;
@ -337,7 +334,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
mm_idx_getseq(mi, rid, rs, re, tseq);
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez);
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez);
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez);
if (ez->n_cigar > 0)
mm_append_cigar(r, ez->n_cigar, ez->cigar);
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.

View File

@ -182,7 +182,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
}
}
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres)
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag)
{
s->l = 0;
mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
@ -196,12 +196,8 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
if (r->p && (opt_flag & MM_F_OUT_CG)) {
uint32_t k;
mm_sprintf_lite(s, "\tcg:Z:");
for (k = 0; k < r->p->n_cigar; ++k) {
int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (intron_thres > 0 && op == 2 && len >= intron_thres)
mm_sprintf_lite(s, "%dN", len);
else mm_sprintf_lite(s, "%d%c", len, "MID"[op]);
}
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
}
if (r->p && (opt_flag & MM_F_OUT_CS))
write_cs(km, s, mi, t, r);
@ -239,7 +235,7 @@ static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
} else str_copy(s, seq, seq + l);
}
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres)
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs)
{
int flag = 0;
s->l = 0;
@ -258,12 +254,8 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
uint32_t k, clip_len = r->rev? t->l_seq - r->qe : r->qs;
int clip_char = (flag&0x800)? 'H' : 'S';
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
for (k = 0; k < r->p->n_cigar; ++k) {
int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (intron_thres > 0 && op == 2 && len >= intron_thres)
mm_sprintf_lite(s, "%dN", len);
else mm_sprintf_lite(s, "%d%c", len, "MID"[op]);
}
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
clip_len = r->rev? r->qs : t->l_seq - r->qe;
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
} else mm_sprintf_lite(s, "*");

6
ksw2.h
View File

@ -111,7 +111,8 @@ static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uin
// bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
// bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
// bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int with_N, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0,
int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
{ // p[] - lower 3 bits: which type gets the max; bit
int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
uint32_t *cigar = *cigar_, tmp;
@ -132,7 +133,8 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
if (force_state >= 0) state = force_state;
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
else if (state == 1 || state == 3) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
else if (state == 1 || (state == 3 && !with_N)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
else if (state == 3 && with_N) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
}
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion

View File

@ -365,9 +365,9 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (with_cigar) { // backtrack
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
else if (ez->max_t >= 0 && ez->max_q >= 0)
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
kfree(km, mem2); kfree(km, off);
}
}

View File

@ -348,9 +348,9 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (with_cigar) { // backtrack
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
else if (ez->max_t >= 0 && ez->max_q >= 0)
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
kfree(km, mem2); kfree(km, off);
}
}

View File

@ -278,9 +278,9 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (with_cigar) { // backtrack
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
else if (ez->max_t >= 0 && ez->max_q >= 0)
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
kfree(km, mem2); kfree(km, off);
}
}

2
main.c
View File

@ -8,7 +8,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r297-dirty"
#define MM_VERSION "2.0-r301-dirty"
void liftrlimit()
{

9
map.c
View File

@ -346,24 +346,21 @@ static void *worker_pipeline(void *shared, int step, void *in)
void *km = 0;
step_t *s = (step_t*)in;
const mm_idx_t *mi = p->mi;
int intron_thres = -1;
for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]);
free(s->buf);
if (p->opt->flag & MM_F_SPLICE)
intron_thres = mm_min_intron_len(p->opt->q, p->opt->e, p->opt->q2);
if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init();
for (i = 0; i < s->n_seq; ++i) {
mm_bseq1_t *t = &s->seq[i];
for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j];
if (p->opt->flag & MM_F_OUT_SAM)
mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i], intron_thres);
mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
else
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag, intron_thres);
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
puts(p->str.s);
}
if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) {
mm_write_sam(&p->str, 0, t, 0, 0, 0, 0);
mm_write_sam(&p->str, 0, t, 0, 0, 0);
puts(p->str.s);
}
for (j = 0; j < s->n_reg[i]; ++j) free(s->reg[i][j].p);

View File

@ -42,8 +42,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_set_rg(const char *s);
void sam_write_sam_header(const mm_idx_t *idx);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
@ -62,10 +62,4 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc);
}
#endif
static inline int32_t mm_min_intron_len(int32_t q, int32_t e, int32_t q_intron)
{
int32_t x = q_intron > q? (int)((float)(q_intron - q) / e + .999) : INT32_MAX;
return x > 4? x : 4;
}
#endif