parent
8a6edab847
commit
161ae7ff73
83
align.c
83
align.c
|
|
@ -237,7 +237,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
|
|||
r->p = p;
|
||||
}
|
||||
|
||||
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int b2)
|
||||
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap)
|
||||
{
|
||||
uint32_t k, l;
|
||||
int32_t qshift, tshift, toff = 0, qoff = 0;
|
||||
|
|
@ -255,7 +255,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
int cq = qseq[qoff + l], ct = tseq[toff + l];
|
||||
if (ct > 3 || cq > 3) ++n_ambi;
|
||||
else if (ct != cq) ++n_diff;
|
||||
s += b2 <= 0? mat[ct * 5 + cq] : (ct > 3 || cq > 3)? 0 : ct == cq? 1 : -b2;
|
||||
s += mat[ct * 5 + cq];
|
||||
if (s < 0) s = 0;
|
||||
else max = max > s? max : s;
|
||||
}
|
||||
|
|
@ -266,8 +266,8 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
for (l = 0; l < len; ++l)
|
||||
if (qseq[qoff + l] > 3) ++n_ambi;
|
||||
r->blen += len - n_ambi, p->n_ambi += n_ambi;
|
||||
if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len);
|
||||
else s -= q + e * len;
|
||||
if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
|
||||
else s -= q + e;
|
||||
if (s < 0) s = 0;
|
||||
qoff += len;
|
||||
} else if (op == MM_CIGAR_DEL) {
|
||||
|
|
@ -275,8 +275,8 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
for (l = 0; l < len; ++l)
|
||||
if (tseq[toff + l] > 3) ++n_ambi;
|
||||
r->blen += len - n_ambi, p->n_ambi += n_ambi;
|
||||
if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len);
|
||||
else s -= q + e * len;
|
||||
if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
|
||||
else s -= q + e;
|
||||
if (s < 0) s = 0;
|
||||
toff += len;
|
||||
} else if (op == MM_CIGAR_N_SKIP) {
|
||||
|
|
@ -284,7 +284,6 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
}
|
||||
}
|
||||
p->dp_max = (int32_t)(max + .499);
|
||||
if (b2 > 0) p->dp_max *= mat[0]; // for compatibility with mm_set_mapq()
|
||||
assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
|
||||
if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
|
||||
}
|
||||
|
|
@ -817,7 +816,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
mm_idx_getseq(mi, rid, rs1, re1, tseq);
|
||||
qseq = &qseq0[r->rev][qs1];
|
||||
}
|
||||
mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2);
|
||||
mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
|
||||
if (rev && r->p->trans_strand)
|
||||
r->p->trans_strand ^= 3; // flip to the read strand
|
||||
}
|
||||
|
|
@ -876,7 +875,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
|
|||
}
|
||||
r_inv->rs = r1->re + t_off;
|
||||
r_inv->re = r_inv->rs + ez->max_t + 1;
|
||||
mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2);
|
||||
mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
|
||||
ret = 1;
|
||||
end_align1_inv:
|
||||
kfree(km, tseq);
|
||||
|
|
@ -893,6 +892,70 @@ static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, m
|
|||
return regs;
|
||||
}
|
||||
|
||||
static inline void mm_count_gaps(const mm_reg1_t *r, int32_t *n_gap_, int32_t *n_gapo_)
|
||||
{
|
||||
uint32_t i;
|
||||
int32_t n_gapo = 0, n_gap = 0;
|
||||
*n_gap_ = *n_gapo_ = -1;
|
||||
if (r->p == 0) return;
|
||||
for (i = 0; i < r->p->n_cigar; ++i) {
|
||||
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
|
||||
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
|
||||
++n_gapo, n_gap += len;
|
||||
}
|
||||
*n_gap_ = n_gap, *n_gapo_ = n_gapo;
|
||||
}
|
||||
|
||||
double mm_event_identity(const mm_reg1_t *r)
|
||||
{
|
||||
int32_t n_gap, n_gapo;
|
||||
if (r->p == 0) return -1.0f;
|
||||
mm_count_gaps(r, &n_gap, &n_gapo);
|
||||
return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
|
||||
}
|
||||
|
||||
static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc)
|
||||
{
|
||||
uint32_t i;
|
||||
int32_t n_gap = 0, n_gapo = 0, n_mis;
|
||||
double gap_cost = 0.0;
|
||||
if (r->p == 0) return -1;
|
||||
for (i = 0; i < r->p->n_cigar; ++i) {
|
||||
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
|
||||
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
|
||||
gap_cost += b2 + (double)mg_log2(1.0 + len);
|
||||
++n_gapo, n_gap += len;
|
||||
}
|
||||
}
|
||||
n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap;
|
||||
return (int32_t)(match_sc * (r->mlen - b2 * n_mis - gap_cost) + .499);
|
||||
}
|
||||
|
||||
static void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b)
|
||||
{
|
||||
int32_t max = -1, max2 = -1, i, max_i = -1;
|
||||
double div, b2;
|
||||
if (n_regs < 2) return;
|
||||
for (i = 0; i < n_regs; ++i) {
|
||||
mm_reg1_t *r = ®s[i];
|
||||
if (r->p == 0) continue;
|
||||
if (r->p->dp_max > max) max2 = max, max = r->p->dp_max, max_i = i;
|
||||
else if (r->p->dp_max > max2) max2 = r->p->dp_max;
|
||||
}
|
||||
if (max_i < 0 || max < 0 || max2 < 0) return;
|
||||
if (regs[max_i].qe - regs[max_i].qs < (double)qlen * frac) return;
|
||||
if (max2 < (double)max * frac) return;
|
||||
div = 1. - mm_event_identity(®s[max_i]);
|
||||
if (div < 0.02) div = 0.02;
|
||||
b2 = 0.5 / div; // max value: 25
|
||||
if (b2 * a < b) b2 = (double)a / b;
|
||||
for (i = 0; i < n_regs; ++i) {
|
||||
mm_reg1_t *r = ®s[i];
|
||||
if (r->p == 0) continue;
|
||||
r->p->dp_max = mm_recal_max_dp(r, b2, a);
|
||||
}
|
||||
}
|
||||
|
||||
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
|
||||
{
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
|
|
@ -947,6 +1010,8 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
|
|||
kfree(km, qseq0[0]);
|
||||
kfree(km, ez.cigar);
|
||||
mm_filter_regs(opt, qlen, n_regs_, regs);
|
||||
if (!(opt->flag&MM_F_SR) && qlen >= opt->rank_min_len)
|
||||
mm_update_dp_max(qlen, *n_regs_, regs, opt->rank_frac, opt->a, opt->b);
|
||||
mm_hit_sort(km, n_regs_, regs, opt->alt_drop);
|
||||
return regs;
|
||||
}
|
||||
|
|
|
|||
12
format.c
12
format.c
|
|
@ -271,18 +271,6 @@ int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_r
|
|||
return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 1, 0, 0);
|
||||
}
|
||||
|
||||
double mm_event_identity(const mm_reg1_t *r)
|
||||
{
|
||||
int32_t i, n_gapo = 0, n_gap = 0;
|
||||
if (r->p == 0) return -1.0f;
|
||||
for (i = 0; i < r->p->n_cigar; ++i) {
|
||||
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
|
||||
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
|
||||
++n_gapo, n_gap += len;
|
||||
}
|
||||
return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
|
||||
}
|
||||
|
||||
static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
||||
{
|
||||
int type;
|
||||
|
|
|
|||
6
main.c
6
main.c
|
|
@ -7,7 +7,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "ketopt.h"
|
||||
|
||||
#define MM_VERSION "2.21-dev-r1078-dirty"
|
||||
#define MM_VERSION "2.21-dev-r1079-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -116,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
|
|||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:b:";
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:";
|
||||
ketopt_t o = KETOPT_INIT;
|
||||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
|
|
@ -182,7 +182,6 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'R') rg = o.arg;
|
||||
else if (c == 'h') fp_help = stdout;
|
||||
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
|
||||
else if (c == 'b') opt.b2 = atoi(o.arg);
|
||||
else if (c == 'o') {
|
||||
if (strcmp(o.arg, "-") != 0) {
|
||||
if (freopen(o.arg, "wb", stdout) == NULL) {
|
||||
|
|
@ -332,7 +331,6 @@ int main(int argc, char *argv[])
|
|||
fprintf(fp_help, " -B INT mismatch penalty (larger value for lower divergence) [%d]\n", opt.b);
|
||||
fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
|
||||
fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
|
||||
fprintf(fp_help, " -b INT ranking penalty (independent of A/B/O/E; 0 to disable) [%d]\n", opt.b2);
|
||||
fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv);
|
||||
fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
|
||||
fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
|
||||
|
|
|
|||
|
|
@ -147,7 +147,6 @@ typedef struct {
|
|||
float alt_drop;
|
||||
|
||||
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
||||
int b2; // used for re-ranking hits
|
||||
int sc_ambi; // score when one or both bases are "N"
|
||||
int noncan; // cost of non-canonical splicing sites
|
||||
int junc_bonus;
|
||||
|
|
@ -158,6 +157,9 @@ typedef struct {
|
|||
int anchor_ext_len, anchor_ext_shift;
|
||||
float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio
|
||||
|
||||
int rank_min_len;
|
||||
float rank_frac;
|
||||
|
||||
int pe_ori, pe_bonus;
|
||||
|
||||
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
||||
|
|
|
|||
1
mmpriv.h
1
mmpriv.h
|
|
@ -62,6 +62,7 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i
|
|||
|
||||
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
|
||||
|
||||
double mm_event_identity(const mm_reg1_t *r);
|
||||
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len);
|
||||
|
|
|
|||
13
options.c
13
options.c
|
|
@ -43,7 +43,6 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->alt_drop = 0.15f;
|
||||
|
||||
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
|
||||
opt->b2 = 5;
|
||||
opt->sc_ambi = 1;
|
||||
opt->zdrop = 400, opt->zdrop_inv = 200;
|
||||
opt->end_bonus = -1;
|
||||
|
|
@ -54,6 +53,9 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->mini_batch_size = 500000000;
|
||||
opt->max_sw_mat = 100000000;
|
||||
|
||||
opt->rank_min_len = 500;
|
||||
opt->rank_frac = 0.9f;
|
||||
|
||||
opt->pe_ori = 0; // FF
|
||||
opt->pe_bonus = 33;
|
||||
}
|
||||
|
|
@ -69,11 +71,6 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
|
|||
if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
|
||||
opt->mid_occ = opt->max_mid_occ;
|
||||
}
|
||||
if (opt->b2 > 0 && opt->b2 * opt->a < opt->b) {
|
||||
opt->b2 = (int)opt->b / opt->a;
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] addjusted the ranking penalty to %d\n", __func__, opt->b2);
|
||||
}
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
|
||||
}
|
||||
|
|
@ -131,7 +128,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
io->flag = 0, io->k = 21, io->w = 11;
|
||||
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT;
|
||||
mo->pe_ori = 0<<1|1; // FR
|
||||
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1, mo->b2 = 0;
|
||||
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1;
|
||||
mo->zdrop = mo->zdrop_inv = 100;
|
||||
mo->end_bonus = 10;
|
||||
mo->max_frag_len = 800;
|
||||
|
|
@ -150,7 +147,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
|
||||
mo->max_sw_mat = 0;
|
||||
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000;
|
||||
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0, mo->b2 = 0;
|
||||
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
|
||||
mo->noncan = 9;
|
||||
mo->junc_bonus = 9;
|
||||
mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved
|
||||
|
|
|
|||
Loading…
Reference in New Issue