r273: cdna mapping mode for testing

Differences from the typical mapping mode:

* banded alignment disabled
* log gap cost during chaining
* zero long-gap extension during alignment
* up to 100kb (by default) reference gap
* bad seeding not filtered (to tune later)
This commit is contained in:
Heng Li 2017-08-08 11:31:49 -04:00
parent 4badf2fcbf
commit 1a7d782131
6 changed files with 33 additions and 15 deletions

13
align.c
View File

@ -126,16 +126,17 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) //
static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int flag, ksw_extz_t *ez)
{
int bw = (opt->flag & MM_F_CDNA)? -1 : w;
if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
int i;
fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, opt->zdrop);
fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, bw, flag, opt->zdrop);
for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr);
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr);
}
if (opt->q == opt->q2 && opt->e == opt->e2)
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez);
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, flag, ez);
else
ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, opt->zdrop, flag, ez);
ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, bw, opt->zdrop, flag, ez);
}
static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x)
@ -254,8 +255,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
bw = (int)(opt->bw * 1.5 + 1.);
r2->cnt = 0;
mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
if (!(opt->flag & MM_F_CDNA)) {
mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
} else as1 = r->as, cnt1 = r->cnt;
mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);

13
chain.c
View File

@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v)
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
}
int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km)
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km)
{ // TODO: make sure this works when n has more than 32 bits
int32_t st = 0, k, *f, *p, *t, *v, n_u, n_v;
int64_t i, j;
@ -42,17 +42,22 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int
uint64_t ri = a[i].x;
int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!!
int32_t max_f = q_span, max_j = -1, n_skip = 0, min_d, max_f_past = -INT32_MAX;
while (st < i && ri - a[st].x > max_dist) ++st;
while (st < i && ri - a[st].x > max_dist_x) ++st;
for (j = i - 1; j >= st; --j) {
int64_t dr = ri - a[j].x;
int32_t dq = qi - (int32_t)a[j].y, dd, sc;
if (dr == 0 || dq <= 0 || dq > max_dist) continue;
if (dr == 0 || dq <= 0 || dq > max_dist_y) continue;
dd = dr > dq? dr - dq : dq - dr;
if (dd > bw) continue;
max_f_past = max_f_past > f[j]? max_f_past : f[j];
min_d = dq < dr? dq : dr;
sc = min_d > q_span? q_span : dq < dr? dq : dr;
sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1);
if (is_cdna) {
int c_log, c_lin;
c_lin = (int)(dd * .01 * avg_qspan);
c_log = ilog2_32(dd);
sc -= c_lin < c_log? c_lin : c_log;
} else sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1);
sc += f[j];
if (sc > max_f) {
max_f = sc, max_j = j;

8
main.c
View File

@ -8,7 +8,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0rc1-r238-dirty"
#define MM_VERSION "2.0rc1-r273-dirty"
void liftrlimit()
{
@ -129,6 +129,12 @@ int main(int argc, char *argv[])
k = 19, w = 19;
opt.a = 1, opt.b = 9, opt.q = 16, opt.q2 = 41, opt.e = 2, opt.e2 = 1, opt.zdrop = 200;
opt.min_dp_max = 200;
} else if (strcmp(optarg, "cdna") == 0) {
k = 15, w = 5;
opt.flag |= MM_F_CDNA;
opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 100000;
opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 70, opt.e2 = 0;
opt.zdrop = 200;
} else {
fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg);
return 1;

9
map.c
View File

@ -19,6 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->min_chain_score = 40;
opt->bw = 500;
opt->max_gap = 5000;
opt->max_gap_ref = -1;
opt->max_chain_skip = 25;
opt->mask_level = 0.5f;
@ -167,7 +168,7 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2)
#endif
mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen, const char *seq, int *n_regs)
{
int i, n = m_en - m_st, j, n_u;
int i, n = m_en - m_st, j, n_u, max_gap_ref;
int64_t n_a;
uint64_t *u;
mm_match_t *m;
@ -243,7 +244,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km);
max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap;
n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_CDNA), n_a, a, &u, b->km);
regs = mm_gen_regs(b->km, qlen, n_u, u, a);
*n_regs = n_u;
@ -256,7 +258,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
if (!(opt->flag & MM_F_CDNA))
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
}
if (opt->flag & MM_F_CIGAR) {
regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()

View File

@ -14,6 +14,7 @@
#define MM_F_NO_QUAL 0x10
#define MM_F_OUT_CG 0x20
#define MM_F_OUT_CS 0x40
#define MM_F_CDNA 0x80
#define MM_IDX_MAGIC "MMI\2"
@ -80,7 +81,7 @@ typedef struct {
int flag; // see MM_F_* macros
int bw; // bandwidth
int max_gap; // break a chain if there are no minimizers in a max_gap window
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
int max_chain_skip;
int min_cnt;
int min_chain_score;

View File

@ -42,7 +42,7 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km);
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a);