support ALT mapping
This commit is contained in:
parent
d2e14705e7
commit
eb3ed6993d
2
align.c
2
align.c
|
|
@ -908,6 +908,6 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
|
|||
kfree(km, qseq0[0]);
|
||||
kfree(km, ez.cigar);
|
||||
mm_filter_regs(opt, qlen, n_regs_, regs);
|
||||
mm_hit_sort(km, n_regs_, regs);
|
||||
mm_hit_sort(km, n_regs_, regs, opt->alt_diff_frac);
|
||||
return regs;
|
||||
}
|
||||
|
|
|
|||
41
hit.c
41
hit.c
|
|
@ -87,6 +87,22 @@ mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u,
|
|||
return r;
|
||||
}
|
||||
|
||||
/*
 * Propagate the ALT flag from the index to a list of hits.
 *
 * For each of the n entries in r, looks up the reference sequence the hit
 * points at (r[k].rid) and, when that sequence is flagged is_alt in the
 * index, sets the hit's own is_alt bit. Hits on non-ALT sequences are left
 * untouched (the bit is never cleared here).
 */
void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r)
{
	int k = 0;
	if (mi->n_alt == 0) return; // index carries no ALT contigs, so nothing can match
	while (k < n) {
		mm_reg1_t *hit = &r[k++];
		if (!mi->seq[hit->rid].is_alt) continue;
		hit->is_alt = 1;
	}
}
|
||||
|
||||
/*
 * Discount a score by the fraction alt_diff_frac.
 *
 * Negative scores are returned unchanged. Otherwise the score is multiplied
 * by (1 - alt_diff_frac), with .499 added before truncation to int, and the
 * result is floored at 1 so a non-negative input never yields a value
 * below 1 (an input of 0 therefore comes back as 1).
 */
static inline int mm_alt_score(int score, float alt_diff_frac)
{
	if (score >= 0) {
		int scaled = (int)(score * (1.0 - alt_diff_frac) + .499);
		if (scaled > 0)
			return scaled;
		return 1; // keep the discounted score strictly positive
	}
	return score;
}
|
||||
|
||||
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a)
|
||||
{
|
||||
if (n <= 0 || n >= r->cnt) return;
|
||||
|
|
@ -106,7 +122,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a)
|
|||
r->split |= 1, r2->split |= 2;
|
||||
}
|
||||
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level) // and compute mm_reg1_t::subsc
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
|
||||
{
|
||||
int i, j, k, *w;
|
||||
uint64_t *cov;
|
||||
|
|
@ -147,12 +163,15 @@ skip_uncov:
|
|||
max = ej - sj > ei - si? ej - sj : ei - si;
|
||||
ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
|
||||
if ((float)ol / min - (float)uncov_len / max > mask_level) {
|
||||
int cnt_sub = 0;
|
||||
int cnt_sub = 0, sci = ri->score;
|
||||
ri->parent = rp->parent;
|
||||
rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score;
|
||||
if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
|
||||
rp->subsc = rp->subsc > sci? rp->subsc : sci;
|
||||
if (ri->cnt >= rp->cnt) cnt_sub = 1;
|
||||
if (rp->p && ri->p && (rp->rid != ri->rid || rp->rs != ri->rs || rp->re != ri->re || ol != min)) { // the last condition excludes identical hits after DP
|
||||
rp->p->dp_max2 = rp->p->dp_max2 > ri->p->dp_max? rp->p->dp_max2 : ri->p->dp_max;
|
||||
sci = ri->p->dp_max;
|
||||
if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
|
||||
rp->p->dp_max2 = rp->p->dp_max2 > sci? rp->p->dp_max2 : sci;
|
||||
if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1;
|
||||
}
|
||||
if (cnt_sub) ++rp->n_sub;
|
||||
|
|
@ -166,7 +185,7 @@ set_parent_test:
|
|||
kfree(km, w);
|
||||
}
|
||||
|
||||
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r)
|
||||
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac)
|
||||
{
|
||||
int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0;
|
||||
mm128_t *aux;
|
||||
|
|
@ -177,13 +196,11 @@ void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r)
|
|||
t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
|
||||
for (i = n_aux = 0; i < n; ++i) {
|
||||
if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
|
||||
if (r[i].p) {
|
||||
aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash;
|
||||
has_cigar = 1;
|
||||
} else {
|
||||
aux[n_aux].x = (uint64_t)r[i].score << 32 | r[i].hash;
|
||||
no_cigar = 1;
|
||||
}
|
||||
int score;
|
||||
if (r[i].p) score = r[i].p->dp_max, has_cigar = 1;
|
||||
else score = r[i].score, no_cigar = 1;
|
||||
if (r[i].is_alt) score = mm_alt_score(score, alt_diff_frac);
|
||||
aux[n_aux].x = (uint64_t)score << 32 | r[i].hash;
|
||||
aux[n_aux++].y = i;
|
||||
} else if (r[i].p) {
|
||||
free(r[i].p);
|
||||
|
|
|
|||
27
index.c
27
index.c
|
|
@ -316,6 +316,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
} else seq->name = 0;
|
||||
seq->len = s->seq[i].l_seq;
|
||||
seq->offset = p->sum_len;
|
||||
seq->is_alt = 0;
|
||||
// copy the sequence
|
||||
if (!(p->mi->flag & MM_I_NO_SEQ)) {
|
||||
for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
|
||||
|
|
@ -414,6 +415,7 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha
|
|||
}
|
||||
p->offset = sum_len;
|
||||
p->len = strlen(s);
|
||||
p->is_alt = 0;
|
||||
for (j = 0; j < p->len; ++j) {
|
||||
int c = seq_nt4_table[(uint8_t)s[j]];
|
||||
uint64_t o = sum_len + j;
|
||||
|
|
@ -500,6 +502,7 @@ mm_idx_t *mm_idx_load(FILE *fp)
|
|||
}
|
||||
fread(&s->len, 4, 1, fp);
|
||||
s->offset = sum_len;
|
||||
s->is_alt = 0;
|
||||
sum_len += s->len;
|
||||
}
|
||||
for (i = 0; i < 1<<mi->b; ++i) {
|
||||
|
|
@ -607,6 +610,30 @@ int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases
|
|||
#include "kseq.h"
|
||||
KSTREAM_DECLARE(gzFile, gzread)
|
||||
|
||||
/**
 * Flag reference sequences in an index as ALT contigs, using a list file.
 *
 * The file is opened with gzopen() (or gzdopen() on stdin when fn is NULL or
 * "-") and read line by line. The first whitespace-delimited token of each
 * line is looked up with mm_idx_name2id(); when the lookup succeeds the
 * matching entry in mi->seq gets is_alt = 1. Lines whose name is not found
 * are ignored.
 *
 * @param mi  index to update; mi->n_alt is overwritten with the match count
 * @param fn  path of the list file, or NULL / "-" for stdin
 * @return    number of lines whose name matched a sequence (a name listed
 *            twice is counted twice), or -1 if the file could not be opened
 */
int mm_idx_alt_read(mm_idx_t *mi, const char *fn)
{
	int n_alt = 0;
	gzFile fp;
	kstream_t *ks;
	kstring_t str = {0,0,0};
	fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
	if (fp == 0) return -1;
	ks = ks_init(fp);
	// NOTE(review): presumably builds the name->id table mm_idx_name2id() relies on -- confirm
	if (mi->h == 0) mm_idx_index_name(mi);
	while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
		char *p;
		int id;
		// cut the line at the first whitespace; the cast keeps isspace() defined for bytes >= 0x80
		for (p = str.s; *p && !isspace((unsigned char)*p); ++p) { }
		*p = 0;
		id = mm_idx_name2id(mi, str.s);
		if (id >= 0) mi->seq[id].is_alt = 1, ++n_alt;
	}
	mi->n_alt = n_alt;
	if (mm_verbose >= 3)
		fprintf(stderr, "[M::%s] found %d ALT contigs\n", __func__, n_alt);
	// release the line buffer, the stream wrapper and the file handle (previously leaked)
	free(str.s);
	ks_destroy(ks);
	gzclose(fp); // this also closes the underlying descriptor, including stdin when reading "-"
	return n_alt;
}
|
||||
|
||||
#define sort_key_bed(a) ((a).st)
|
||||
KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
|
||||
|
||||
|
|
|
|||
7
main.c
7
main.c
|
|
@ -68,6 +68,8 @@ static ko_longopt_t long_options[] = {
|
|||
{ "junc-bonus", ko_required_argument, 341 },
|
||||
{ "sam-hit-only", ko_no_argument, 342 },
|
||||
{ "chain-gap-scale",ko_required_argument, 343 },
|
||||
{ "alt", ko_required_argument, 344 },
|
||||
{ "alt-diff", ko_required_argument, 345 },
|
||||
{ "help", ko_no_argument, 'h' },
|
||||
{ "max-intron-len", ko_required_argument, 'G' },
|
||||
{ "version", ko_no_argument, 'V' },
|
||||
|
|
@ -110,7 +112,7 @@ int main(int argc, char *argv[])
|
|||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
int i, c, n_threads = 3, n_parts, old_best_n = -1;
|
||||
char *fnw = 0, *rg = 0, *junc_bed = 0, *s;
|
||||
char *fnw = 0, *rg = 0, *junc_bed = 0, *s, *alt_list = 0;
|
||||
FILE *fp_help = stderr;
|
||||
mm_idx_reader_t *idx_rdr;
|
||||
mm_idx_t *mi;
|
||||
|
|
@ -213,6 +215,8 @@ int main(int argc, char *argv[])
|
|||
else if (c == 341) opt.junc_bonus = atoi(o.arg); // --junc-bonus
|
||||
else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only
|
||||
else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale
|
||||
else if (c == 344) alt_list = o.arg; // --alt
|
||||
else if (c == 345) opt.alt_diff_frac = atof(o.arg); // --alt-diff
|
||||
else if (c == 314) { // --frag
|
||||
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
|
||||
} else if (c == 315) { // --secondary
|
||||
|
|
@ -382,6 +386,7 @@ int main(int argc, char *argv[])
|
|||
if (argc != o.ind + 1) mm_mapopt_update(&opt, mi);
|
||||
if (mm_verbose >= 3) mm_idx_stat(mi);
|
||||
if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1);
|
||||
if (alt_list) mm_idx_alt_read(mi, alt_list);
|
||||
ret = 0;
|
||||
if (!(opt.flag & MM_F_FRAG_MODE)) {
|
||||
for (i = o.ind + 1; i < argc; ++i) {
|
||||
|
|
|
|||
14
map.c
14
map.c
|
|
@ -249,7 +249,7 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
|
|||
static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a)
|
||||
{
|
||||
if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
|
||||
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL);
|
||||
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_diff_frac);
|
||||
if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
|
||||
else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs);
|
||||
if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains
|
||||
|
|
@ -262,7 +262,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k
|
|||
if (!(opt->flag & MM_F_CIGAR)) return regs;
|
||||
regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
|
||||
if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
|
||||
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL);
|
||||
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_diff_frac);
|
||||
mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
|
||||
mm_set_sam_pri(*n_regs, regs);
|
||||
}
|
||||
|
|
@ -342,6 +342,10 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
b->rep_len = rep_len;
|
||||
|
||||
regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a);
|
||||
if (mi->n_alt) {
|
||||
mm_mark_alt(mi, n_regs0, regs0);
|
||||
mm_hit_sort(b->km, &n_regs0, regs0, opt->alt_diff_frac); // this step can be merged into mm_gen_regs(); will do if this shows up in profile
|
||||
}
|
||||
|
||||
if (mm_dbg_flag & MM_DBG_PRINT_SEED)
|
||||
for (j = 0; j < n_regs0; ++j)
|
||||
|
|
@ -361,7 +365,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains
|
||||
free(regs0);
|
||||
for (i = 0; i < n_segs; ++i) {
|
||||
mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL); // update mm_reg1_t::parent
|
||||
mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_diff_frac); // update mm_reg1_t::parent
|
||||
regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a);
|
||||
mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr);
|
||||
}
|
||||
|
|
@ -504,8 +508,8 @@ static void merge_hits(step_t *s)
|
|||
}
|
||||
}
|
||||
}
|
||||
mm_hit_sort(km, &s->n_reg[k], s->reg[k]);
|
||||
mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL);
|
||||
mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_diff_frac);
|
||||
mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_diff_frac);
|
||||
if (!(opt->flag & MM_F_ALL_CHAINS)) {
|
||||
mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]);
|
||||
mm_set_sam_pri(s->n_reg[k], s->reg[k]);
|
||||
|
|
|
|||
|
|
@ -58,12 +58,14 @@ typedef struct {
|
|||
char *name; // name of the db sequence
|
||||
uint64_t offset; // offset in mm_idx_t::S
|
||||
uint32_t len; // length
|
||||
uint32_t is_alt;
|
||||
} mm_idx_seq_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t b, w, k, flag;
|
||||
uint32_t n_seq; // number of reference sequences
|
||||
int32_t index;
|
||||
int32_t n_alt;
|
||||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
uint32_t *S; // 4-bit packed sequence
|
||||
struct mm_idx_bucket_s *B; // index (hidden)
|
||||
|
|
@ -91,7 +93,7 @@ typedef struct {
|
|||
int32_t mlen, blen; // seeded exact match length; seeded alignment block length
|
||||
int32_t n_sub; // number of suboptimal mappings
|
||||
int32_t score0; // initial chaining score (before chain merging/spliting)
|
||||
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, dummy:7;
|
||||
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, dummy:6;
|
||||
uint32_t hash;
|
||||
float div;
|
||||
mm_extra_t *p;
|
||||
|
|
@ -127,6 +129,8 @@ typedef struct {
|
|||
int min_join_flank_sc;
|
||||
float min_join_flank_ratio;
|
||||
|
||||
float alt_diff_frac;
|
||||
|
||||
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
||||
int sc_ambi; // score when one or both bases are "N"
|
||||
int noncan; // cost of non-canonical splicing sites
|
||||
|
|
@ -369,6 +373,7 @@ int mm_idx_index_name(mm_idx_t *mi);
|
|||
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
|
||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
||||
|
||||
int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
|
||||
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
|
||||
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
|
||||
|
||||
|
|
|
|||
|
|
@ -121,6 +121,9 @@ provided as the target sequences, options
|
|||
.BR -w ,
|
||||
.B -I
|
||||
will be effectively overridden by the options stored in the index file.
|
||||
.TP
|
||||
.BI --alt \ FILE
|
||||
List of ALT contigs [null]
|
||||
.SS Mapping options
|
||||
.TP 10
|
||||
.BI -f \ FLOAT | INT1 [, INT2 ]
|
||||
|
|
|
|||
5
mmpriv.h
5
mmpriv.h
|
|
@ -73,16 +73,17 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m
|
|||
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
||||
|
||||
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a);
|
||||
void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r);
|
||||
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a);
|
||||
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
|
||||
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a);
|
||||
int mm_set_sam_pri(int n, mm_reg1_t *r);
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level);
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac);
|
||||
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r);
|
||||
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
|
||||
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
|
||||
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
|
||||
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r);
|
||||
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
|
||||
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
|
||||
|
||||
void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
|
||||
|
|
|
|||
|
|
@ -35,6 +35,8 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->min_join_flank_sc = 1000;
|
||||
opt->min_join_flank_ratio = 0.5f;
|
||||
|
||||
opt->alt_diff_frac = 0.15f;
|
||||
|
||||
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
|
||||
opt->sc_ambi = 1;
|
||||
opt->zdrop = 400, opt->zdrop_inv = 200;
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ cdef extern from "minimap.h":
|
|||
int max_join_long, max_join_short
|
||||
int min_join_flank_sc
|
||||
float min_join_flank_ratio
|
||||
float alt_diff_frac
|
||||
int a, b, q, e, q2, e2
|
||||
int sc_ambi
|
||||
int noncan
|
||||
|
|
|
|||
Loading…
Reference in New Issue