From 1a7d782131864819f6e66a2b7eea91e9d6cd737b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Aug 2017 11:31:49 -0400 Subject: [PATCH] r273: cdna mapping mode for testing Differences from the typical mapping mode: * banded alignment disabled * log gap cost during chaining * zero long-gap extension during alignment * up to 100kb (by default) reference gap * bad seeding not filtered (to tune later) --- align.c | 13 ++++++++----- chain.c | 13 +++++++++---- main.c | 8 +++++++- map.c | 9 ++++++--- minimap.h | 3 ++- mmpriv.h | 2 +- 6 files changed, 33 insertions(+), 15 deletions(-) diff --git a/align.c b/align.c index 595f0b1..b57edcb 100644 --- a/align.c +++ b/align.c @@ -126,16 +126,17 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int flag, ksw_extz_t *ez) { + int bw = (opt->flag & MM_F_CDNA)? -1 : w; if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) { int i; - fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, opt->zdrop); + fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, bw, flag, opt->zdrop); for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr); for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } if (opt->q == opt->q2 && opt->e == opt->e2) - ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); + ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, flag, ez); else - ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, opt->zdrop, flag, ez); + ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, bw, opt->zdrop, flag, ez); } static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x) @@ -254,8 +255,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int bw = (int)(opt->bw * 1.5 + 1.); r2->cnt = 0; - mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1); - mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); + if (!(opt->flag & MM_F_CDNA)) { + mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1); + mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); + } else as1 = r->as, cnt1 = r->cnt; mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs); mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe); diff --git a/chain.c b/chain.c index 1c72631..8894d44 100644 --- a/chain.c +++ b/chain.c @@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v) return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v]; } -int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km) +int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km) { // TODO: make sure this works when n has more than 32 bits int32_t st = 0, k, *f, *p, *t, *v, n_u, n_v; int64_t i, j; @@ -42,17 +42,22 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int uint64_t ri = a[i].x; int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!! int32_t max_f = q_span, max_j = -1, n_skip = 0, min_d, max_f_past = -INT32_MAX; - while (st < i && ri - a[st].x > max_dist) ++st; + while (st < i && ri - a[st].x > max_dist_x) ++st; for (j = i - 1; j >= st; --j) { int64_t dr = ri - a[j].x; int32_t dq = qi - (int32_t)a[j].y, dd, sc; - if (dr == 0 || dq <= 0 || dq > max_dist) continue; + if (dr == 0 || dq <= 0 || dq > max_dist_y) continue; dd = dr > dq? dr - dq : dq - dr; if (dd > bw) continue; max_f_past = max_f_past > f[j]? max_f_past : f[j]; min_d = dq < dr? dq : dr; sc = min_d > q_span? q_span : dq < dr? dq : dr; - sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1); + if (is_cdna) { + int c_log, c_lin; + c_lin = (int)(dd * .01 * avg_qspan); + c_log = ilog2_32(dd); + sc -= c_lin < c_log? c_lin : c_log; + } else sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1); sc += f[j]; if (sc > max_f) { max_f = sc, max_j = j; diff --git a/main.c b/main.c index 7244265..680d35f 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0rc1-r238-dirty" +#define MM_VERSION "2.0rc1-r273-dirty" void liftrlimit() { @@ -129,6 +129,12 @@ int main(int argc, char *argv[]) k = 19, w = 19; opt.a = 1, opt.b = 9, opt.q = 16, opt.q2 = 41, opt.e = 2, opt.e2 = 1, opt.zdrop = 200; opt.min_dp_max = 200; + } else if (strcmp(optarg, "cdna") == 0) { + k = 15, w = 5; + opt.flag |= MM_F_CDNA; + opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 100000; + opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 70, opt.e2 = 0; + opt.zdrop = 200; } else { fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg); return 1; diff --git a/map.c b/map.c index 4ea111e..6e7b869 100644 --- a/map.c +++ b/map.c @@ -19,6 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_chain_score = 40; opt->bw = 500; opt->max_gap = 5000; + opt->max_gap_ref = -1; opt->max_chain_skip = 25; opt->mask_level = 0.5f; @@ -167,7 +168,7 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2) #endif mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen, const char *seq, int *n_regs) { - int i, n = m_en - m_st, j, n_u; + int i, n = m_en - m_st, j, n_u, max_gap_ref; int64_t n_a; uint64_t *u; mm_match_t *m; @@ -243,7 +244,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff), i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); - n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km); + max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap; + n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_CDNA), n_a, a, &u, b->km); regs = mm_gen_regs(b->km, qlen, n_u, u, a); *n_regs = n_u; @@ -256,7 +258,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap mm_set_parent(b->km, opt->mask_level, *n_regs, regs); mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); - mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle + if (!(opt->flag & MM_F_CDNA)) + mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle } if (opt->flag & MM_F_CIGAR) { regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() diff --git a/minimap.h b/minimap.h index 076017f..dbd2e3e 100644 --- a/minimap.h +++ b/minimap.h @@ -14,6 +14,7 @@ #define MM_F_NO_QUAL 0x10 #define MM_F_OUT_CG 0x20 #define MM_F_OUT_CS 0x40 +#define MM_F_CDNA 0x80 #define MM_IDX_MAGIC "MMI\2" @@ -80,7 +81,7 @@ typedef struct { int flag; // see MM_F_* macros int bw; // bandwidth - int max_gap; // break a chain if there are no minimizers in a max_gap window + int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window int max_chain_skip; int min_cnt; int min_chain_score; diff --git a/mmpriv.h b/mmpriv.h index 32b3fd1..1c03ae7 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -42,7 +42,7 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); -int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km); +int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a);