r93: fixed various small issues

This commit is contained in:
Heng Li 2017-06-28 10:35:21 -04:00
parent cdc2a1e29f
commit bcd9b1c621
4 changed files with 64 additions and 23 deletions

47
align.c
View File

@ -30,6 +30,7 @@ static void mm_reg_split(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *
{
if (n <= 0 || n >= r->cnt) return;
*r2 = *r;
r2->id = -1;
r2->p = 0;
r2->cnt = r->cnt - n;
r2->score = (int32_t)(r->score * ((float)r2->cnt / r->cnt) + .499);
@ -243,7 +244,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
r->p->score += ez->max;
re1 = rs + (ez->max_t + 1);
qe1 = qs + (ez->max_q + 1);
mm_reg_split(r, r2, j + 1, qlen, a);
if (r->cnt - (j + 1) >= opt->min_cnt) {
mm_reg_split(r, r2, j + 1, qlen, a);
if (j + 1 < opt->min_cnt)
r2->id = r->id, r->id = -1;
}
break;
} else { // FIXME: in rare cases, r->p can be NULL, which leads to a segfault
r->p->score += ez->score;
@ -252,7 +257,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
}
}
if (r2->cnt == 0 && qe < qe0 && re < re0) { // right extension
if (i == r->cnt && qe < qe0 && re < re0) { // right extension
qseq = &qseq0[rev][qe];
mm_idx_getseq(mi, rid, re, re0, tseq);
ksw_extz2_sse(km, qe0 - qe, qseq, re0 - re, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez);
@ -276,11 +281,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
{
extern unsigned char seq_nt4_table[256];
int i, r, n_regs = *n_regs_;
int32_t i, r, n_regs = *n_regs_;
uint8_t *qseq0[2];
ksw_extz_t ez;
memset(&ez, 0, sizeof(ksw_extz_t));
// encode the query sequence
qseq0[0] = (uint8_t*)kmalloc(km, qlen);
qseq0[1] = (uint8_t*)kmalloc(km, qlen);
for (i = 0; i < qlen; ++i) {
@ -288,6 +293,8 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
qseq0[1][qlen - 1 - i] = qseq0[0][i] < 4? 3 - qseq0[0][i] : 4;
}
// align through seed hits
memset(&ez, 0, sizeof(ksw_extz_t));
for (r = 0; r < n_regs; ++r) {
mm_reg1_t r2;
mm_align1(km, opt, mi, qlen, qseq0, &regs[r], &r2, a, &ez);
@ -299,8 +306,36 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
++n_regs;
}
}
*n_regs_ = n_regs;
kfree(km, qseq0[0]); kfree(km, qseq0[1]);
kfree(km, ez.cigar);
kfree(km, qseq0[0]); kfree(km, qseq0[1]); kfree(km, ez.cigar);
// filter
for (r = i = 0; r < n_regs; ++r) {
mm_reg1_t *reg = &regs[r];
int flt = 0;
if (reg->p->blen - reg->p->n_ambi - reg->p->n_diff < opt->min_chain_score) flt = 1;
else if (reg->cnt < opt->min_cnt) flt = 1;
else if (reg->p->score < opt->min_dp_score) flt = 1;
if (flt) free(reg->p);
else if (i < r) regs[i++] = regs[r]; // NB: this also move the regs[r].p pointer
else ++i;
}
// remap mm_reg1_t::{id,parent}
if (i < n_regs || n_regs != *n_regs_) { // there are region splits or drops
int32_t *id;
n_regs = i;
id = (int32_t*)kmalloc(km, *n_regs_ * 4);
for (r = 0; r < *n_regs_; ++r) id[r] = -1;
for (r = 0; r < n_regs; ++r) {
if (regs[r].id >= 0) id[regs[r].id] = r;
regs[r].id = r;
}
for (r = 0; r < n_regs; ++r)
if (regs[r].parent >= 0)
regs[r].parent = id[regs[r].parent];
kfree(km, id);
}
*n_regs_ = n_regs;
return regs;
}

17
main.c
View File

@ -10,7 +10,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r92-pre"
#define MM_VERSION "2.0-r93-pre"
void liftrlimit()
{
@ -47,8 +47,9 @@ static struct option long_options[] = {
{ "int-rname", no_argument, 0, 0 },
{ "version", no_argument, 0, 'V' },
{ "min-count", required_argument, 0, 'n' },
{ "min-chain-score",required_argument, 0, 's' },
{ "min-chain-score",required_argument, 0, 'm' },
{ "mask-level", required_argument, 0, 'M' },
{ "min-dp-score", required_argument, 0, 's' },
{ 0, 0, 0, 0}
};
@ -66,7 +67,7 @@ int main(int argc, char *argv[])
mm_realtime0 = realtime();
mm_mapopt_init(&opt);
while ((c = getopt_long(argc, argv, "w:k:t:r:f:Vv:g:I:d:ST:s:x:Hcp:M:n:z:F:A:B:O:E:", long_options, &long_idx)) >= 0) {
while ((c = getopt_long(argc, argv, "w:k:t:r:f:Vv:g:I:d:ST:s:x:Hcp:M:n:z:F:A:B:O:E:m:", long_options, &long_idx)) >= 0) {
if (c == 'w') w = atoi(optarg);
else if (c == 'k') k = atoi(optarg);
else if (c == 'H') is_hpc = 1;
@ -82,12 +83,13 @@ int main(int argc, char *argv[])
else if (c == 'S') opt.flag |= MM_F_AVA | MM_F_NO_SELF;
else if (c == 'T') opt.sdust_thres = atoi(optarg);
else if (c == 'n') opt.min_cnt = atoi(optarg);
else if (c == 's') opt.min_score = atoi(optarg);
else if (c == 'm') opt.min_chain_score = atoi(optarg);
else if (c == 'A') opt.a = atoi(optarg);
else if (c == 'B') opt.b = atoi(optarg);
else if (c == 'O') opt.q = atoi(optarg);
else if (c == 'E') opt.e = atoi(optarg);
else if (c == 'z') opt.zdrop = atoi(optarg);
else if (c == 's') opt.min_dp_score = atoi(optarg);
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // bucket-bits
else if (c == 0 && long_idx == 2) keep_name = 0; // int-rname
else if (c == 'V') {
@ -112,7 +114,7 @@ int main(int argc, char *argv[])
} else if (c == 'x') {
if (strcmp(optarg, "ava10k") == 0) {
opt.flag |= MM_F_AVA | MM_F_NO_SELF;
opt.min_score = 100, opt.pri_ratio = 0.0f;
opt.min_chain_score = 100, opt.pri_ratio = 0.0f;
is_hpc = 1, k = 19, w = 5;
}
}
@ -132,19 +134,20 @@ int main(int argc, char *argv[])
fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", opt.mid_occ_frac);
fprintf(stderr, " -r INT bandwidth [%d]\n", opt.bw);
fprintf(stderr, " -n INT minimal number of minimizers [%d]\n", opt.min_cnt);
fprintf(stderr, " -s INT minimal chaining score (this is not SW score) [%d]\n", opt.min_score);
fprintf(stderr, " -m INT minimal chaining score [%d]\n", opt.min_chain_score);
fprintf(stderr, " -g INT split a mapping if there is a gap longer than INT [%d]\n", opt.max_gap);
fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres);
fprintf(stderr, " -S skip self and dual mappings\n");
fprintf(stderr, " -p FLOAT threshold to output a mapping [%g]\n", opt.pri_ratio);
fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n");
fprintf(stderr, " ava10k: -Hk19 -Sw5 -p0 -s100 (PacBio/ONT all-vs-all read mapping)\n");
fprintf(stderr, " ava10k: -Hk19 -Sw5 -p0 -m100 (PacBio/ONT all-vs-all read mapping)\n");
fprintf(stderr, " Alignment:\n");
fprintf(stderr, " -A INT matching score [%d]\n", opt.a);
fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b);
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt.q);
fprintf(stderr, " -E INT gap extension penalty; a k-long gap costs {-O}+k*{-E} [%d]\n", opt.e);
fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop);
fprintf(stderr, " -s INT minimal DP alignment score [%d]\n", opt.min_dp_score);
fprintf(stderr, " Input/Output:\n");
fprintf(stderr, " -F STR output format: sam or paf [paf]\n");
fprintf(stderr, " -c output CIGAR in PAF\n");

14
map.c
View File

@ -15,16 +15,17 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->sdust_thres = 0;
opt->min_cnt = 3;
opt->min_score = 40;
opt->min_chain_score = 40;
opt->bw = 1000;
opt->max_gap = 10000;
opt->max_skip = 15;
opt->max_chain_skip = 15;
opt->pri_ratio = 2.0f;
opt->mask_level = 0.5f;
opt->a = 1, opt->b = 1, opt->q = 1, opt->e = 1;
opt->zdrop = 100;
opt->min_dp_score = 0;
opt->min_ksw_len = 100;
}
@ -165,6 +166,7 @@ mm_reg1_t *mm_gen_reg(int qlen, int n_u, uint64_t *u, mm128_t *a)
r = (mm_reg1_t*)calloc(n_u, sizeof(mm_reg1_t));
for (i = k = 0; i < n_u; ++i) {
mm_reg1_t *ri = &r[i];
ri->id = i;
ri->parent = -1, ri->subsc = 0;
ri->score = u[i]>>32;
ri->cnt = (int32_t)u[i];
@ -210,11 +212,12 @@ void mm_select_sub(float mask_level, float pri_ratio, int *n_, mm_reg1_t *r, voi
if (r[i].parent == i || r[i].score >= r[r[i].parent].score * pri_ratio)
tmp[i] = k, r[k++] = r[i];
n = k;
for (i = 0; i < n; ++i)
for (i = 0; i < n; ++i) // remap mm_reg1_t::parent, as some mappings may have been dropped
if (tmp[r[i].parent] >= 0)
r[i].parent = tmp[r[i].parent];
kfree(km, tmp);
*n_ = n;
for (i = 0; i < n; ++i) r[i].id = i; // reset mm_reg1_t::id
}
}
@ -286,9 +289,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
for (i = 0; i < n; ++i)
if (m[i].is_alloc) kfree(b->km, m[i].x.r);
kfree(b->km, m);
//for (i = 0; i < n_a; ++i) printf("%c\t%s\t%d\t%d\n", "+-"[a[i].x>>63], mi->seq[a[i].x<<1>>33].name, (uint32_t)a[i].x, (uint32_t)a[i].y);
n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_skip, opt->min_cnt, opt->min_score, n_a, a, &u, b->km);
n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km);
regs = mm_gen_reg(qlen, n_u, u, a);
*n_regs = n_u;
mm_select_sub(opt->mask_level, opt->pri_ratio, n_regs, regs, b->km);
@ -371,8 +373,6 @@ static void *worker_pipeline(void *shared, int step, void *in)
bseq1_t *t = &s->seq[i];
for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j];
if (r->p && r->p->blen - r->p->n_ambi - r->p->n_diff < p->opt->min_score)
continue;
if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, j, r);
else mm_write_paf(&p->str, mi, t, j, r);
puts(p->str.s);

View File

@ -53,13 +53,14 @@ typedef struct {
} mm_extra_t;
typedef struct {
int32_t id;
uint32_t cnt:30, rev:1, split:1;
uint32_t rid:31, rep:1;
int32_t score;
int32_t qs, qe, rs, re;
int32_t parent, subsc;
int32_t as;
int32_t mapq, n_sub; // TODO: n_sub is not used for now
uint32_t mapq:8, n_sub:24; // TODO: n_sub is not used for now
mm_extra_t *p;
} mm_reg1_t;
@ -70,8 +71,10 @@ typedef struct {
int flag; // see MM_F_* macros
int bw; // bandwidth
int max_gap; // break a chain if there are no minimizers in a max_gap window
int max_skip;
int min_cnt, min_score;
int max_chain_skip;
int min_cnt;
int min_chain_score;
int min_dp_score;
float pri_ratio;
float mask_level;
int a, b, q, e; // matching score, mismatch, gap-open and gap-ext penalties