r120: use max-scoring seg to control output

much better now
This commit is contained in:
Heng Li 2017-06-30 14:21:44 -04:00
parent 08a61c3cfc
commit d11049eb32
5 changed files with 38 additions and 29 deletions

53
align.c
View File

@ -61,18 +61,21 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
return 0;
}
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, uint32_t n_cigar, uint32_t *cigar, int rev_cigar)
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
{
uint32_t l, toff = 0, qoff = 0;
int32_t k, step, st, en;
if (rev_cigar) step = -1, st = n_cigar - 1, en = -1;
else step = 1, st = 0, en = n_cigar;
for (k = st; k != en; k += step) {
uint32_t op = cigar[k]&0xf, len = cigar[k]>>4;
uint32_t k, l, toff = 0, qoff = 0;
int32_t s = 0, max = 0;
if (p == 0) return;
for (k = 0; k < p->n_cigar; ++k) {
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
if (op == 0) {
for (l = 0; l < len; ++l) {
if (tseq[toff + l] > 3 || qseq[qoff + l] > 3) ++p->n_ambi;
else if (tseq[toff + l] != qseq[qoff + l]) ++p->n_diff;
int cq = qseq[qoff + l], ct = tseq[toff + l];
if (ct > 3 || cq > 3) ++p->n_ambi;
else if (ct != cq) ++p->n_diff;
s += mat[ct * 5 + cq];
if (s < 0) s = 0;
else max = max > s? max : s;
}
toff += len, qoff += len, p->blen += len;
} else if (op == 1) {
@ -81,14 +84,19 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
if (qseq[qoff + l] > 3) ++n_ambi;
qoff += len, p->blen += len;
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
s -= q + e * len;
if (s < 0) s = 0;
} else if (op == 2) {
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (tseq[toff + l] > 3) ++n_ambi;
toff += len, p->blen += len;
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
s -= q + e * len;
if (s < 0) s = 0;
}
}
p->dp_max = max;
}
static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc()
@ -183,7 +191,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
re0 = re + l;
} else re0 = re, qe0 = qe;
tseq = (uint8_t*)kmalloc(km, re0 - rs0); // TODO: we can allocate a smaller size
assert(re0 > rs0);
tseq = (uint8_t*)kmalloc(km, re0 - rs0);
if (qs > 0 && rs > 0) { // left extension
qseq = &qseq0[rev][qs0];
@ -193,8 +202,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
ksw_extz2_sse(km, qs - qs0, qseq, rs - rs0, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar);
mm_update_extra(r->p, qseq, tseq, ez->n_cigar, ez->cigar, 1);
r->p->score += ez->max;
r->p->dp_score += ez->max;
}
rs1 = rs - (ez->max_t + 1);
qs1 = qs - (ez->max_q + 1);
@ -220,17 +228,15 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
ksw_extz2_sse(km, qe - qs, qseq, re - rs, tseq, 5, mat, opt->q, opt->e, bw1, opt->zdrop, KSW_EZ_APPROX_MAX, ez);
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
ksw_extz2_sse(km, qe - qs, qseq, re - rs, tseq, 5, mat, opt->q, opt->e, bw1, opt->zdrop, 0, ez);
if (ez->n_cigar > 0) {
if (ez->n_cigar > 0)
mm_append_cigar(r, ez->n_cigar, ez->cigar);
mm_update_extra(r->p, qseq, tseq, ez->n_cigar, ez->cigar, 0);
}
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
int j;
for (j = i - 1; j >= 0; --j)
if ((int32_t)a[r->as + j].x < re + ez->max_t)
break;
dropped = 1;
r->p->score += ez->max;
r->p->dp_score += ez->max;
re1 = rs + (ez->max_t + 1);
qe1 = qs + (ez->max_q + 1);
if (r->cnt - (j + 1) >= opt->min_cnt) {
@ -239,9 +245,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
r2->id = r->id, r->id = -1;
}
break;
} else { // FIXME: in rare cases, r->p can be NULL, which leads to a segfault
r->p->score += ez->score;
}
} else r->p->dp_score += ez->score;
rs = re, qs = qe;
}
}
@ -252,8 +256,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
ksw_extz2_sse(km, qe0 - qe, qseq, re0 - re, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez);
if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar);
mm_update_extra(r->p, qseq, tseq, ez->n_cigar, ez->cigar, 0);
r->p->score += ez->max;
r->p->dp_score += ez->max;
}
re1 = re + (ez->max_t + 1);
qe1 = qe + (ez->max_q + 1);
@ -264,6 +267,12 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (rev) r->qs = qlen - qe1, r->qe = qlen - qs1;
else r->qs = qs1, r->qe = qe1;
assert(re1 - rs1 <= re0 - rs0);
if (r->p) {
mm_idx_getseq(mi, rid, rs1, re1, tseq);
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
}
kfree(km, tseq);
}
@ -304,7 +313,7 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
int flt = 0;
if (reg->cnt < opt->min_cnt) flt = 1;
else if (reg->p->blen - reg->p->n_ambi - reg->p->n_diff < opt->min_chain_score) flt = 1;
else if (reg->p->score < opt->min_dp_score && (reg->qe - reg->qs) * 2 < qlen) flt = 1;
else if (reg->p->dp_max < opt->min_dp_max) flt = 1;
if (flt) free(reg->p);
else if (i < r) regs[i++] = regs[r]; // NB: this also move the regs[r].p pointer
else ++i;

View File

@ -60,7 +60,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
if (r->p) mm_sprintf_lite(s, "\ts1:i:%d", r->score);
if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc);
if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->score, r->p->n_ambi);
if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
}
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const bseq1_t *t, const mm_reg1_t *r)

6
main.c
View File

@ -10,7 +10,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r119-pre"
#define MM_VERSION "2.0-r120-pre"
void liftrlimit()
{
@ -89,7 +89,7 @@ int main(int argc, char *argv[])
else if (c == 'O') opt.q = atoi(optarg);
else if (c == 'E') opt.e = atoi(optarg);
else if (c == 'z') opt.zdrop = atoi(optarg);
else if (c == 's') opt.min_dp_score = atoi(optarg);
else if (c == 's') opt.min_dp_max = atoi(optarg);
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // bucket-bits
else if (c == 0 && long_idx == 2) keep_name = 0; // int-rname
else if (c == 'V') {
@ -147,7 +147,7 @@ int main(int argc, char *argv[])
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt.q);
fprintf(stderr, " -E INT gap extension penalty; a k-long gap costs {-O}+k*{-E} [%d]\n", opt.e);
fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop);
fprintf(stderr, " -s INT minimal DP alignment score [%d]\n", opt.min_dp_score);
fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
fprintf(stderr, " Input/Output:\n");
fprintf(stderr, " -F STR output format: sam or paf [paf]\n");
fprintf(stderr, " -c output CIGAR in PAF\n");

2
map.c
View File

@ -30,7 +30,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->a = 1, opt->b = 2, opt->q = 2, opt->e = 1;
opt->zdrop = 200;
opt->min_dp_score = 0;
opt->min_dp_max = opt->min_chain_score;
opt->min_ksw_len = 100;
}

View File

@ -45,7 +45,7 @@ typedef struct {
typedef struct {
uint32_t capacity;
int32_t score;
int32_t dp_score, dp_max;
uint32_t blen;
uint32_t n_diff, n_ambi;
uint32_t n_cigar;
@ -84,7 +84,7 @@ typedef struct {
int a, b, q, e; // matching score, mismatch, gap-open and gap-ext penalties
int zdrop;
int min_dp_score;
int min_dp_max;
int min_ksw_len;
int max_occ;