early exploration

This commit is contained in:
Heng Li 2017-09-19 16:18:28 -04:00
parent 11081c6c27
commit fb1bcc0084
4 changed files with 73 additions and 41 deletions

View File

@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v)
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
}
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km)
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, uint64_t **_u, void *km)
{ // TODO: make sure this works when n has more than 32 bits
int32_t st = 0, k, *f, *p, *t, *v, n_u, n_v;
int64_t i, j;
@ -42,13 +42,16 @@ int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cn
uint64_t ri = a[i].x;
int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!!
int32_t max_f = q_span, max_j = -1, n_skip = 0, min_d, max_f_past = -INT32_MAX;
int32_t sidi = (a[i].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
while (st < i && ri - a[st].x > max_dist_x) ++st;
for (j = i - 1; j >= st; --j) {
int64_t dr = ri - a[j].x;
int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd;
int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
if (dr == 0 || dq <= 0 || dq > max_dist_y) continue;
dd = dr > dq? dr - dq : dq - dr;
if (dd > bw) continue;
if (sidi == sidj && dd > bw) continue;
if (n_segs > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) continue;
max_f_past = max_f_past > f[j]? max_f_past : f[j];
min_d = dq < dr? dq : dr;
sc = min_d > q_span? q_span : dq < dr? dq : dr;

100
map.c
View File

@ -100,6 +100,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
typedef struct {
uint32_t n;
uint32_t qpos;
uint32_t seg_id;
union {
const uint64_t *cr;
uint64_t *r;
@ -130,14 +131,14 @@ void mm_tbuf_destroy(mm_tbuf_t *b)
free(b);
}
static void mm_dust_minier(mm128_v *mini, int l_seq, const char *seq, int sdust_thres, sdust_buf_t *sdb)
static int mm_dust_minier(int n, mm128_t *a, int l_seq, const char *seq, int sdust_thres, sdust_buf_t *sdb)
{
int n_dreg, j, k, u = 0;
const uint64_t *dreg;
if (sdust_thres <= 0 || sdb == 0) return;
if (sdust_thres <= 0 || sdb == 0) return n;
dreg = sdust_core((const uint8_t*)seq, l_seq, sdust_thres, 64, &n_dreg, sdb);
for (j = k = 0; j < mini->n; ++j) { // squeeze out minimizers that significantly overlap with LCRs
int32_t qpos = (uint32_t)mini->a[j].y>>1, span = mini->a[j].x&0xff;
for (j = k = 0; j < n; ++j) { // squeeze out minimizers that significantly overlap with LCRs
int32_t qpos = (uint32_t)a[j].y>>1, span = a[j].x&0xff;
int32_t s = qpos - (span - 1), e = s + span;
while (u < n_dreg && (uint32_t)dreg[u] <= s) ++u;
if (u < n_dreg && dreg[u]>>32 < e) {
@ -147,44 +148,47 @@ static void mm_dust_minier(mm128_v *mini, int l_seq, const char *seq, int sdust_
int ee = e < (uint32_t)dreg[v]? e : (uint32_t)dreg[v];
l += ee - ss;
}
if (l <= span>>1) mini->a[k++] = mini->a[j]; // keep the minimizer if less than half of it falls in masked region
if (l <= span>>1) a[k++] = a[j]; // keep the minimizer if less than half of it falls in masked region
}
}
mini->n = k;
return k; // the new size
}
mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
static int collect_minimizers(const mm_mapopt_t *opt, const mm_idx_t *mi, int n_segs, int *qlens, const char *seqs, mm_tbuf_t *b)
{
int i, n, j, n_u, max_gap_ref, rep_st = 0, rep_en = 0, rep_len = 0;
int64_t n_a;
uint64_t *u;
int i, j, n;
const char *seq;
b->mini.n = 0, seq = seqs;
for (i = n = 0; i < n_segs; ++i) {
mm_sketch(b->km, seq, qlens[i], mi->w, mi->k, i, mi->is_hpc, &b->mini);
for (j = n; j < b->mini.n; ++j)
b->mini.a[j].x += (seq - seqs) << 1;
if (opt->sdust_thres > 0) // mask low-complexity minimizers
b->mini.n = n + mm_dust_minier(b->mini.n - n, b->mini.a + n, qlens[i], seq, opt->sdust_thres, b->sdb);
seq += qlens[i], n = b->mini.n;
}
return seq - seqs;
}
static mm128_t *collect_seed_hits(const mm_mapopt_t *opt, const mm_idx_t *mi, const char *qname, int qlen, int64_t *n_a, int *rep_len, mm_tbuf_t *b)
{
int rep_st, rep_en, i;
mm_match_t *m;
mm128_t *a;
mm_reg1_t *regs;
// collect minimizers
b->mini.n = 0;
mm_sketch(b->km, seq, qlen, mi->w, mi->k, 0, mi->is_hpc, &b->mini);
n = b->mini.n;
if (opt->sdust_thres > 0)
mm_dust_minier(&b->mini, qlen, seq, opt->sdust_thres, b->sdb);
// convert to local representation
m = (mm_match_t*)kmalloc(b->km, n * sizeof(mm_match_t));
for (i = 0; i < n; ++i) {
m = (mm_match_t*)kmalloc(b->km, b->mini.n * sizeof(mm_match_t));
for (i = 0; i < b->mini.n; ++i) {
int t;
mm128_t *p = &b->mini.a[i];
m[i].qpos = (uint32_t)p->y;
m[i].x.cr = mm_idx_get(mi, p->x>>8, &t);
m[i].n = t;
m[i].seg_id = p->y >> 32;
}
// fill the _a_ array
for (i = 0, n_a = 0; i < n; ++i) // find the length of a[]
if (m[i].n < opt->mid_occ) n_a += m[i].n;
a = (mm128_t*)kmalloc(b->km, n_a * sizeof(mm128_t));
for (i = j = 0; i < n; ++i) {
for (i = 0, *n_a = 0; i < b->mini.n; ++i) // find the length of a[]
if (m[i].n < opt->mid_occ) *n_a += m[i].n;
a = (mm128_t*)kmalloc(b->km, *n_a * sizeof(mm128_t));
for (i = 0, *n_a = 0; i < b->mini.n; ++i) {
mm128_t *p = &b->mini.a[i];
mm_match_t *q = &m[i];
const uint64_t *r = q->x.cr;
@ -192,13 +196,13 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
if (q->n >= opt->mid_occ) {
int en = (q->qpos>>1) + 1, st = en - q_span;
if (st > rep_en) {
rep_len += rep_en - rep_st;
*rep_len += rep_en - rep_st;
rep_st = st, rep_en = en;
} else rep_en = en;
continue;
}
if (i > 0 && p->x>>8 == b->mini.a[i - 1].x>>8) is_tandem = 1;
if (i < n - 1 && p->x>>8 == b->mini.a[i + 1].x>>8) is_tandem = 1;
if (i < b->mini.n - 1 && p->x>>8 == b->mini.a[i + 1].x>>8) is_tandem = 1;
for (k = 0; k < q->n; ++k) {
int32_t rpos = (uint32_t)r[k] >> 1;
mm128_t *p;
@ -209,7 +213,7 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
if ((opt->flag&MM_F_AVA) && strcmp(qname, tname) > 0) // all-vs-all mode: map once
continue;
}
p = &a[j++];
p = &a[(*n_a)++];
if ((r[k]&1) == (q->qpos&1)) { // forward strand
p->x = (r[k]&0xffffffff00000000ULL) | rpos;
p->y = (uint64_t)q_span << 32 | q->qpos >> 1;
@ -217,13 +221,28 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | rpos;
p->y = (uint64_t)q_span << 32 | (qlen - ((q->qpos>>1) + 1 - q_span) - 1);
}
p->y |= (uint64_t)q->seg_id << MM_SEED_SEG_SHIFT;
if (is_tandem) p->y |= MM_SEED_TANDEM;
}
}
rep_len += rep_en - rep_st;
n_a = j;
radix_sort_128x(a, a + n_a);
*rep_len += rep_en - rep_st;
kfree(b->km, m);
return a;
}
mm_reg1_t *mm_map_seg(const mm_idx_t *mi, int n_segs, int *qlens, const char *seqs, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
int i, j, n_u, max_gap_ref, rep_len, qlen_sum;
int64_t n_a;
uint64_t *u;
mm128_t *a;
mm_reg1_t *regs;
if (n_segs > MM_MAX_SEG || n_segs <= 0) return 0;
qlen_sum = collect_minimizers(opt, mi, n_segs, qlens, seqs, b);
a = collect_seed_hits(opt, mi, qname, qlen_sum, &n_a, &rep_len, b);
radix_sort_128x(a, a + n_a);
if (mm_dbg_flag & MM_DBG_PRINT_SEED) {
fprintf(stderr, "RS\t%d\n", rep_len);
@ -233,8 +252,8 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
}
max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap;
n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_SPLICE), n_a, a, &u, b->km);
regs = mm_gen_regs(b->km, qlen, n_u, u, a);
n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_SPLICE), n_segs, n_a, a, &u, b->km);
regs = mm_gen_regs(b->km, qlen_sum, n_u, u, a);
*n_regs = n_u;
if (mm_dbg_flag & MM_DBG_PRINT_SEED)
@ -247,10 +266,10 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
mm_set_parent(b->km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b);
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
if (!(opt->flag & MM_F_SPLICE))
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
mm_join_long(b->km, opt, qlen_sum, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
}
if (opt->flag & MM_F_CIGAR) {
regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
regs = mm_align_skeleton(b->km, opt, mi, qlen_sum, seqs, n_regs, regs, a); // this calls mm_filter_regs()
if (!(opt->flag & MM_F_AVA)) {
mm_set_parent(b->km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b);
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
@ -265,6 +284,11 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
return regs;
}
mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
return mm_map_seg(mi, 1, &qlen, seq, n_regs, b, opt, qname);
}
/**************************
* Multi-threaded mapping *
**************************/

View File

@ -21,6 +21,8 @@
#define MM_IDX_MAGIC "MMI\2"
#define MM_MAX_SEG 255
#ifdef __cplusplus
extern "C" {
#endif

View File

@ -17,6 +17,9 @@
#define MM_SEED_IGNORE (1ULL<<41)
#define MM_SEED_TANDEM (1ULL<<42)
#define MM_SEED_SEG_SHIFT 48
#define MM_SEED_SEG_MASK (0xffULL<<(MM_SEED_SEG_SHIFT))
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
@ -54,7 +57,7 @@ void mm_idxopt_init(mm_idxopt_t *opt);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a);