From d11049eb32e2db912ac03cdbdf11f33ee597849a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 30 Jun 2017 14:21:44 -0400 Subject: [PATCH] r120: use max-scoring seg to control output much better now --- align.c | 53 +++++++++++++++++++++++++++++++---------------------- format.c | 2 +- main.c | 6 +++--- map.c | 2 +- minimap.h | 4 ++-- 5 files changed, 38 insertions(+), 29 deletions(-) diff --git a/align.c b/align.c index 5723ba3..0dc6dc8 100644 --- a/align.c +++ b/align.c @@ -61,18 +61,21 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c return 0; } -static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, uint32_t n_cigar, uint32_t *cigar, int rev_cigar) +static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e) { - uint32_t l, toff = 0, qoff = 0; - int32_t k, step, st, en; - if (rev_cigar) step = -1, st = n_cigar - 1, en = -1; - else step = 1, st = 0, en = n_cigar; - for (k = st; k != en; k += step) { - uint32_t op = cigar[k]&0xf, len = cigar[k]>>4; + uint32_t k, l, toff = 0, qoff = 0; + int32_t s = 0, max = 0; + if (p == 0) return; + for (k = 0; k < p->n_cigar; ++k) { + uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; if (op == 0) { for (l = 0; l < len; ++l) { - if (tseq[toff + l] > 3 || qseq[qoff + l] > 3) ++p->n_ambi; - else if (tseq[toff + l] != qseq[qoff + l]) ++p->n_diff; + int cq = qseq[qoff + l], ct = tseq[toff + l]; + if (ct > 3 || cq > 3) ++p->n_ambi; + else if (ct != cq) ++p->n_diff; + s += mat[ct * 5 + cq]; + if (s < 0) s = 0; + else max = max > s? max : s; } toff += len, qoff += len, p->blen += len; } else if (op == 1) { @@ -81,14 +84,19 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t if (qseq[qoff + l] > 3) ++n_ambi; qoff += len, p->blen += len; p->n_ambi += n_ambi, p->n_diff += len - n_ambi; + s -= q + e * len; + if (s < 0) s = 0; } else if (op == 2) { int n_ambi = 0; for (l = 0; l < len; ++l) if (tseq[toff + l] > 3) ++n_ambi; toff += len, p->blen += len; p->n_ambi += n_ambi, p->n_diff += len - n_ambi; + s -= q + e * len; + if (s < 0) s = 0; } } + p->dp_max = max; } static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc() @@ -183,7 +191,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int re0 = re + l; } else re0 = re, qe0 = qe; - tseq = (uint8_t*)kmalloc(km, re0 - rs0); // TODO: we can allocate a smaller size + assert(re0 > rs0); + tseq = (uint8_t*)kmalloc(km, re0 - rs0); if (qs > 0 && rs > 0) { // left extension qseq = &qseq0[rev][qs0]; @@ -193,8 +202,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int ksw_extz2_sse(km, qs - qs0, qseq, rs - rs0, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); - mm_update_extra(r->p, qseq, tseq, ez->n_cigar, ez->cigar, 1); - r->p->score += ez->max; + r->p->dp_score += ez->max; } rs1 = rs - (ez->max_t + 1); qs1 = qs - (ez->max_q + 1); @@ -220,17 +228,15 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int ksw_extz2_sse(km, qe - qs, qseq, re - rs, tseq, 5, mat, opt->q, opt->e, bw1, opt->zdrop, KSW_EZ_APPROX_MAX, ez); if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop)) ksw_extz2_sse(km, qe - qs, qseq, re - rs, tseq, 5, mat, opt->q, opt->e, bw1, opt->zdrop, 0, ez); - if (ez->n_cigar > 0) { + if (ez->n_cigar > 0) mm_append_cigar(r, ez->n_cigar, ez->cigar); - mm_update_extra(r->p, qseq, tseq, ez->n_cigar, ez->cigar, 0); - } if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle. int j; for (j = i - 1; j >= 0; --j) if ((int32_t)a[r->as + j].x < re + ez->max_t) break; dropped = 1; - r->p->score += ez->max; + r->p->dp_score += ez->max; re1 = rs + (ez->max_t + 1); qe1 = qs + (ez->max_q + 1); if (r->cnt - (j + 1) >= opt->min_cnt) { @@ -239,9 +245,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int r2->id = r->id, r->id = -1; } break; - } else { // FIXME: in rare cases, r->p can be NULL, which leads to a segfault - r->p->score += ez->score; - } + } else r->p->dp_score += ez->score; rs = re, qs = qe; } } @@ -252,8 +256,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int ksw_extz2_sse(km, qe0 - qe, qseq, re0 - re, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); - mm_update_extra(r->p, qseq, tseq, ez->n_cigar, ez->cigar, 0); - r->p->score += ez->max; + r->p->dp_score += ez->max; } re1 = re + (ez->max_t + 1); qe1 = qe + (ez->max_q + 1); @@ -264,6 +267,12 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (rev) r->qs = qlen - qe1, r->qe = qlen - qs1; else r->qs = qs1, r->qe = qe1; + assert(re1 - rs1 <= re0 - rs0); + if (r->p) { + mm_idx_getseq(mi, rid, rs1, re1, tseq); + mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e); + } + kfree(km, tseq); } @@ -304,7 +313,7 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m int flt = 0; if (reg->cnt < opt->min_cnt) flt = 1; else if (reg->p->blen - reg->p->n_ambi - reg->p->n_diff < opt->min_chain_score) flt = 1; - else if (reg->p->score < opt->min_dp_score && (reg->qe - reg->qs) * 2 < qlen) flt = 1; + else if (reg->p->dp_max < opt->min_dp_max) flt = 1; if (flt) free(reg->p); else if (i < r) regs[i++] = regs[r]; // NB: this also move the regs[r].p pointer else ++i; diff --git a/format.c b/format.c index c81aac1..092aaf4 100644 --- a/format.c +++ b/format.c @@ -60,7 +60,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) if (r->p) mm_sprintf_lite(s, "\ts1:i:%d", r->score); if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc); if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split); - if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->score, r->p->n_ambi); + if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); } void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const bseq1_t *t, const mm_reg1_t *r) diff --git a/main.c b/main.c index edfc78d..24b9e34 100644 --- a/main.c +++ b/main.c @@ -10,7 +10,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r119-pre" +#define MM_VERSION "2.0-r120-pre" void liftrlimit() { @@ -89,7 +89,7 @@ int main(int argc, char *argv[]) else if (c == 'O') opt.q = atoi(optarg); else if (c == 'E') opt.e = atoi(optarg); else if (c == 'z') opt.zdrop = atoi(optarg); - else if (c == 's') opt.min_dp_score = atoi(optarg); + else if (c == 's') opt.min_dp_max = atoi(optarg); else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // bucket-bits else if (c == 0 && long_idx == 2) keep_name = 0; // int-rname else if (c == 'V') { @@ -147,7 +147,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -O INT gap open penalty [%d]\n", opt.q); fprintf(stderr, " -E INT gap extension penalty; a k-long gap costs {-O}+k*{-E} [%d]\n", opt.e); fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop); - fprintf(stderr, " -s INT minimal DP alignment score [%d]\n", opt.min_dp_score); + fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(stderr, " Input/Output:\n"); fprintf(stderr, " -F STR output format: sam or paf [paf]\n"); fprintf(stderr, " -c output CIGAR in PAF\n"); diff --git a/map.c b/map.c index f6c0ef9..c725ba8 100644 --- a/map.c +++ b/map.c @@ -30,7 +30,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->a = 1, opt->b = 2, opt->q = 2, opt->e = 1; opt->zdrop = 200; - opt->min_dp_score = 0; + opt->min_dp_max = opt->min_chain_score; opt->min_ksw_len = 100; } diff --git a/minimap.h b/minimap.h index f8635c3..1844ba9 100644 --- a/minimap.h +++ b/minimap.h @@ -45,7 +45,7 @@ typedef struct { typedef struct { uint32_t capacity; - int32_t score; + int32_t dp_score, dp_max; uint32_t blen; uint32_t n_diff, n_ambi; uint32_t n_cigar; @@ -84,7 +84,7 @@ typedef struct { int a, b, q, e; // matching score, mismatch, gap-open and gap-ext penalties int zdrop; - int min_dp_score; + int min_dp_max; int min_ksw_len; int max_occ;