From eb3ed6993d712102811af89ea6941010a99cbe4f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 20 Jan 2020 19:32:31 -0500 Subject: [PATCH] support ALT mapping --- align.c | 2 +- hit.c | 41 +++++++++++++++++++++++++++++------------ index.c | 27 +++++++++++++++++++++++++++ main.c | 7 ++++++- map.c | 14 +++++++++----- minimap.h | 7 ++++++- minimap2.1 | 3 +++ mmpriv.h | 5 +++-- options.c | 2 ++ python/cmappy.pxd | 1 + 10 files changed, 87 insertions(+), 22 deletions(-) diff --git a/align.c b/align.c index 5fe73ff..eac7056 100644 --- a/align.c +++ b/align.c @@ -908,6 +908,6 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m kfree(km, qseq0[0]); kfree(km, ez.cigar); mm_filter_regs(opt, qlen, n_regs_, regs); - mm_hit_sort(km, n_regs_, regs); + mm_hit_sort(km, n_regs_, regs, opt->alt_diff_frac); return regs; } diff --git a/hit.c b/hit.c index f43b0d6..506464b 100644 --- a/hit.c +++ b/hit.c @@ -87,6 +87,22 @@ mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, return r; } +void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r) +{ + int i; + if (mi->n_alt == 0) return; + for (i = 0; i < n; ++i) + if (mi->seq[r[i].rid].is_alt) + r[i].is_alt = 1; +} + +static inline int mm_alt_score(int score, float alt_diff_frac) +{ + if (score < 0) return score; + score = (int)(score * (1.0 - alt_diff_frac) + .499); + return score > 0? score : 1; +} + void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a) { if (n <= 0 || n >= r->cnt) return; @@ -106,7 +122,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a) r->split |= 1, r2->split |= 2; } -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level) // and compute mm_reg1_t::subsc +void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc { int i, j, k, *w; uint64_t *cov; @@ -147,12 +163,15 @@ skip_uncov: max = ej - sj > ei - si? ej - sj : ei - si; ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified if ((float)ol / min - (float)uncov_len / max > mask_level) { - int cnt_sub = 0; + int cnt_sub = 0, sci = ri->score; ri->parent = rp->parent; - rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score; + if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac); + rp->subsc = rp->subsc > sci? rp->subsc : sci; if (ri->cnt >= rp->cnt) cnt_sub = 1; if (rp->p && ri->p && (rp->rid != ri->rid || rp->rs != ri->rs || rp->re != ri->re || ol != min)) { // the last condition excludes identical hits after DP - rp->p->dp_max2 = rp->p->dp_max2 > ri->p->dp_max? rp->p->dp_max2 : ri->p->dp_max; + sci = ri->p->dp_max; + if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac); + rp->p->dp_max2 = rp->p->dp_max2 > sci? rp->p->dp_max2 : sci; if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1; } if (cnt_sub) ++rp->n_sub; @@ -166,7 +185,7 @@ set_parent_test: kfree(km, w); } -void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r) +void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac) { int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0; mm128_t *aux; @@ -177,13 +196,11 @@ void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r) t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t)); for (i = n_aux = 0; i < n; ++i) { if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted) - if (r[i].p) { - aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash; - has_cigar = 1; - } else { - aux[n_aux].x = (uint64_t)r[i].score << 32 | r[i].hash; - no_cigar = 1; - } + int score; + if (r[i].p) score = r[i].p->dp_max, has_cigar = 1; + else score = r[i].score, no_cigar = 1; + if (r[i].is_alt) score = mm_alt_score(score, alt_diff_frac); + aux[n_aux].x = (uint64_t)score << 32 | r[i].hash; aux[n_aux++].y = i; } else if (r[i].p) { free(r[i].p); diff --git a/index.c b/index.c index 6a49bb1..fd93dd3 100644 --- a/index.c +++ b/index.c @@ -316,6 +316,7 @@ static void *worker_pipeline(void *shared, int step, void *in) } else seq->name = 0; seq->len = s->seq[i].l_seq; seq->offset = p->sum_len; + seq->is_alt = 0; // copy the sequence if (!(p->mi->flag & MM_I_NO_SEQ)) { for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here @@ -414,6 +415,7 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha } p->offset = sum_len; p->len = strlen(s); + p->is_alt = 0; for (j = 0; j < p->len; ++j) { int c = seq_nt4_table[(uint8_t)s[j]]; uint64_t o = sum_len + j; @@ -500,6 +502,7 @@ mm_idx_t *mm_idx_load(FILE *fp) } fread(&s->len, 4, 1, fp); s->offset = sum_len; + s->is_alt = 0; sum_len += s->len; } for (i = 0; i < 1<b; ++i) { @@ -607,6 +610,30 @@ int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases #include "kseq.h" KSTREAM_DECLARE(gzFile, gzread) +int mm_idx_alt_read(mm_idx_t *mi, const char *fn) +{ + int n_alt = 0; + gzFile fp; + kstream_t *ks; + kstring_t str = {0,0,0}; + fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) return -1; + ks = ks_init(fp); + if (mi->h == 0) mm_idx_index_name(mi); + while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) { + char *p; + int id; + for (p = str.s; *p && !isspace(*p); ++p) { } + *p = 0; + id = mm_idx_name2id(mi, str.s); + if (id >= 0) mi->seq[id].is_alt = 1, ++n_alt; + } + mi->n_alt = n_alt; + if (mm_verbose >= 3) + fprintf(stderr, "[M::%s] found %d ALT contigs\n", __func__, n_alt); + return n_alt; +} + #define sort_key_bed(a) ((a).st) KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4) diff --git a/main.c b/main.c index 1eb677d..35ebdf2 100644 --- a/main.c +++ b/main.c @@ -68,6 +68,8 @@ static ko_longopt_t long_options[] = { { "junc-bonus", ko_required_argument, 341 }, { "sam-hit-only", ko_no_argument, 342 }, { "chain-gap-scale",ko_required_argument, 343 }, + { "alt", ko_required_argument, 344 }, + { "alt-diff", ko_required_argument, 345 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -110,7 +112,7 @@ int main(int argc, char *argv[]) mm_mapopt_t opt; mm_idxopt_t ipt; int i, c, n_threads = 3, n_parts, old_best_n = -1; - char *fnw = 0, *rg = 0, *junc_bed = 0, *s; + char *fnw = 0, *rg = 0, *junc_bed = 0, *s, *alt_list = 0; FILE *fp_help = stderr; mm_idx_reader_t *idx_rdr; mm_idx_t *mi; @@ -213,6 +215,8 @@ int main(int argc, char *argv[]) else if (c == 341) opt.junc_bonus = atoi(o.arg); // --junc-bonus else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale + else if (c == 344) alt_list = o.arg; // --alt + else if (c == 345) opt.alt_diff_frac = atof(o.arg); // --alt-diff else if (c == 314) { // --frag yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1); } else if (c == 315) { // --secondary @@ -382,6 +386,7 @@ int main(int argc, char *argv[]) if (argc != o.ind + 1) mm_mapopt_update(&opt, mi); if (mm_verbose >= 3) mm_idx_stat(mi); if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1); + if (alt_list) mm_idx_alt_read(mi, alt_list); ret = 0; if (!(opt.flag & MM_F_FRAG_MODE)) { for (i = o.ind + 1; i < argc; ++i) { diff --git a/map.c b/map.c index d924b7d..80a2a3a 100644 --- a/map.c +++ b/map.c @@ -249,7 +249,7 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a) { if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) - mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL); + mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_diff_frac); if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs); if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains @@ -262,7 +262,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k if (!(opt->flag & MM_F_CIGAR)) return regs; regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) - mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL); + mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_diff_frac); mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); mm_set_sam_pri(*n_regs, regs); } @@ -342,6 +342,10 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** b->rep_len = rep_len; regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a); + if (mi->n_alt) { + mm_mark_alt(mi, n_regs0, regs0); + mm_hit_sort(b->km, &n_regs0, regs0, opt->alt_diff_frac); // this step can be merged into mm_gen_regs(); will do if this shows up in profile + } if (mm_dbg_flag & MM_DBG_PRINT_SEED) for (j = 0; j < n_regs0; ++j) @@ -361,7 +365,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains free(regs0); for (i = 0; i < n_segs; ++i) { - mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL); // update mm_reg1_t::parent + mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_diff_frac); // update mm_reg1_t::parent regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a); mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr); } @@ -504,8 +508,8 @@ static void merge_hits(step_t *s) } } } - mm_hit_sort(km, &s->n_reg[k], s->reg[k]); - mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL); + mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_diff_frac); + mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_diff_frac); if (!(opt->flag & MM_F_ALL_CHAINS)) { mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]); mm_set_sam_pri(s->n_reg[k], s->reg[k]); diff --git a/minimap.h b/minimap.h index 5da9c8b..eb236ae 100644 --- a/minimap.h +++ b/minimap.h @@ -58,12 +58,14 @@ typedef struct { char *name; // name of the db sequence uint64_t offset; // offset in mm_idx_t::S uint32_t len; // length + uint32_t is_alt; } mm_idx_seq_t; typedef struct { int32_t b, w, k, flag; uint32_t n_seq; // number of reference sequences int32_t index; + int32_t n_alt; mm_idx_seq_t *seq; // sequence name, length and offset uint32_t *S; // 4-bit packed sequence struct mm_idx_bucket_s *B; // index (hidden) @@ -91,7 +93,7 @@ typedef struct { int32_t mlen, blen; // seeded exact match length; seeded alignment block length int32_t n_sub; // number of suboptimal mappings int32_t score0; // initial chaining score (before chain merging/spliting) - uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, dummy:7; + uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, dummy:6; uint32_t hash; float div; mm_extra_t *p; @@ -127,6 +129,8 @@ typedef struct { int min_join_flank_sc; float min_join_flank_ratio; + float alt_diff_frac; + int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties int sc_ambi; // score when one or both bases are "N" int noncan; // cost of non-canonical splicing sites @@ -369,6 +373,7 @@ int mm_idx_index_name(mm_idx_t *mi); int mm_idx_name2id(const mm_idx_t *mi, const char *name); int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); +int mm_idx_alt_read(mm_idx_t *mi, const char *fn); int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc); int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s); diff --git a/minimap2.1 b/minimap2.1 index faa42f3..a45cbed 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -121,6 +121,9 @@ provided as the target sequences, options .BR -w , .B -I will be effectively overridden by the options stored in the index file. +.TP +.BI --alt \ FILE +List of ALT contigs [null] .SS Mapping options .TP 10 .BI -f \ FLOAT | INT1 [, INT2 ] diff --git a/mmpriv.h b/mmpriv.h index e42f0df..65c7cd9 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -73,16 +73,17 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a); +void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r); void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a); void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs); int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a); int mm_set_sam_pri(int n, mm_reg1_t *r); -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level); +void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac); void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r); void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r); void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs); void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a); -void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r); +void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac); void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr); void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos); diff --git a/options.c b/options.c index 171e9bc..4210bec 100644 --- a/options.c +++ b/options.c @@ -35,6 +35,8 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_join_flank_sc = 1000; opt->min_join_flank_ratio = 0.5f; + opt->alt_diff_frac = 0.15f; + opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; opt->sc_ambi = 1; opt->zdrop = 400, opt->zdrop_inv = 200; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index a9953bd..c8efb07 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -27,6 +27,7 @@ cdef extern from "minimap.h": int max_join_long, max_join_short int min_join_flank_sc float min_join_flank_ratio + float alt_diff_frac int a, b, q, e, q2, e2 int sc_ambi int noncan