dev-r1079: per-read error rate

more tuning needed
This commit is contained in:
Heng Li 2021-07-18 20:38:53 -04:00
parent 8a6edab847
commit 161ae7ff73
6 changed files with 85 additions and 34 deletions

83
align.c
View File

@ -237,7 +237,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
r->p = p;
}
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int b2)
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap)
{
uint32_t k, l;
int32_t qshift, tshift, toff = 0, qoff = 0;
@ -255,7 +255,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
int cq = qseq[qoff + l], ct = tseq[toff + l];
if (ct > 3 || cq > 3) ++n_ambi;
else if (ct != cq) ++n_diff;
s += b2 <= 0? mat[ct * 5 + cq] : (ct > 3 || cq > 3)? 0 : ct == cq? 1 : -b2;
s += mat[ct * 5 + cq];
if (s < 0) s = 0;
else max = max > s? max : s;
}
@ -266,8 +266,8 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
for (l = 0; l < len; ++l)
if (qseq[qoff + l] > 3) ++n_ambi;
r->blen += len - n_ambi, p->n_ambi += n_ambi;
if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len);
else s -= q + e * len;
if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
else s -= q + e;
if (s < 0) s = 0;
qoff += len;
} else if (op == MM_CIGAR_DEL) {
@ -275,8 +275,8 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
for (l = 0; l < len; ++l)
if (tseq[toff + l] > 3) ++n_ambi;
r->blen += len - n_ambi, p->n_ambi += n_ambi;
if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len);
else s -= q + e * len;
if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
else s -= q + e;
if (s < 0) s = 0;
toff += len;
} else if (op == MM_CIGAR_N_SKIP) {
@ -284,7 +284,6 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
}
}
p->dp_max = (int32_t)(max + .499);
if (b2 > 0) p->dp_max *= mat[0]; // for compatibility with mm_set_mapq()
assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
}
@ -817,7 +816,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
mm_idx_getseq(mi, rid, rs1, re1, tseq);
qseq = &qseq0[r->rev][qs1];
}
mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2);
mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
if (rev && r->p->trans_strand)
r->p->trans_strand ^= 3; // flip to the read strand
}
@ -876,7 +875,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
}
r_inv->rs = r1->re + t_off;
r_inv->re = r_inv->rs + ez->max_t + 1;
mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2);
mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
ret = 1;
end_align1_inv:
kfree(km, tseq);
@ -893,6 +892,70 @@ static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, m
return regs;
}
static inline void mm_count_gaps(const mm_reg1_t *r, int32_t *n_gap_, int32_t *n_gapo_)
{
uint32_t i;
int32_t n_gapo = 0, n_gap = 0;
*n_gap_ = *n_gapo_ = -1;
if (r->p == 0) return;
for (i = 0; i < r->p->n_cigar; ++i) {
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
++n_gapo, n_gap += len;
}
*n_gap_ = n_gap, *n_gapo_ = n_gapo;
}
double mm_event_identity(const mm_reg1_t *r)
{
int32_t n_gap, n_gapo;
if (r->p == 0) return -1.0f;
mm_count_gaps(r, &n_gap, &n_gapo);
return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
}
static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc)
{
uint32_t i;
int32_t n_gap = 0, n_gapo = 0, n_mis;
double gap_cost = 0.0;
if (r->p == 0) return -1;
for (i = 0; i < r->p->n_cigar; ++i) {
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
gap_cost += b2 + (double)mg_log2(1.0 + len);
++n_gapo, n_gap += len;
}
}
n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap;
return (int32_t)(match_sc * (r->mlen - b2 * n_mis - gap_cost) + .499);
}
static void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b)
{
int32_t max = -1, max2 = -1, i, max_i = -1;
double div, b2;
if (n_regs < 2) return;
for (i = 0; i < n_regs; ++i) {
mm_reg1_t *r = &regs[i];
if (r->p == 0) continue;
if (r->p->dp_max > max) max2 = max, max = r->p->dp_max, max_i = i;
else if (r->p->dp_max > max2) max2 = r->p->dp_max;
}
if (max_i < 0 || max < 0 || max2 < 0) return;
if (regs[max_i].qe - regs[max_i].qs < (double)qlen * frac) return;
if (max2 < (double)max * frac) return;
div = 1. - mm_event_identity(&regs[max_i]);
if (div < 0.02) div = 0.02;
b2 = 0.5 / div; // max value: 25
if (b2 * a < b) b2 = (double)a / b;
for (i = 0; i < n_regs; ++i) {
mm_reg1_t *r = &regs[i];
if (r->p == 0) continue;
r->p->dp_max = mm_recal_max_dp(r, b2, a);
}
}
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
{
extern unsigned char seq_nt4_table[256];
@ -947,6 +1010,8 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
kfree(km, qseq0[0]);
kfree(km, ez.cigar);
mm_filter_regs(opt, qlen, n_regs_, regs);
if (!(opt->flag&MM_F_SR) && qlen >= opt->rank_min_len)
mm_update_dp_max(qlen, *n_regs_, regs, opt->rank_frac, opt->a, opt->b);
mm_hit_sort(km, n_regs_, regs, opt->alt_drop);
return regs;
}

View File

@ -271,18 +271,6 @@ int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_r
return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 1, 0, 0);
}
double mm_event_identity(const mm_reg1_t *r)
{
int32_t i, n_gapo = 0, n_gap = 0;
if (r->p == 0) return -1.0f;
for (i = 0; i < r->p->n_cigar; ++i) {
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
++n_gapo, n_gap += len;
}
return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
}
static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
{
int type;

6
main.c
View File

@ -7,7 +7,7 @@
#include "mmpriv.h"
#include "ketopt.h"
#define MM_VERSION "2.21-dev-r1078-dirty"
#define MM_VERSION "2.21-dev-r1079-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -116,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
int main(int argc, char *argv[])
{
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:b:";
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:";
ketopt_t o = KETOPT_INIT;
mm_mapopt_t opt;
mm_idxopt_t ipt;
@ -182,7 +182,6 @@ int main(int argc, char *argv[])
else if (c == 'R') rg = o.arg;
else if (c == 'h') fp_help = stdout;
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
else if (c == 'b') opt.b2 = atoi(o.arg);
else if (c == 'o') {
if (strcmp(o.arg, "-") != 0) {
if (freopen(o.arg, "wb", stdout) == NULL) {
@ -332,7 +331,6 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -B INT mismatch penalty (larger value for lower divergence) [%d]\n", opt.b);
fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
fprintf(fp_help, " -b INT ranking penalty (independent of A/B/O/E; 0 to disable) [%d]\n", opt.b2);
fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv);
fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");

View File

@ -147,7 +147,6 @@ typedef struct {
float alt_drop;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int b2; // used for re-ranking hits
int sc_ambi; // score when one or both bases are "N"
int noncan; // cost of non-canonical splicing sites
int junc_bonus;
@ -158,6 +157,9 @@ typedef struct {
int anchor_ext_len, anchor_ext_shift;
float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio
int rank_min_len;
float rank_frac;
int pe_ori, pe_bonus;
float mid_occ_frac; // only used by mm_mapopt_update(); see below

View File

@ -62,6 +62,7 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
double mm_event_identity(const mm_reg1_t *r);
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len);

View File

@ -43,7 +43,6 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->alt_drop = 0.15f;
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
opt->b2 = 5;
opt->sc_ambi = 1;
opt->zdrop = 400, opt->zdrop_inv = 200;
opt->end_bonus = -1;
@ -54,6 +53,9 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->mini_batch_size = 500000000;
opt->max_sw_mat = 100000000;
opt->rank_min_len = 500;
opt->rank_frac = 0.9f;
opt->pe_ori = 0; // FF
opt->pe_bonus = 33;
}
@ -69,11 +71,6 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
opt->mid_occ = opt->max_mid_occ;
}
if (opt->b2 > 0 && opt->b2 * opt->a < opt->b) {
opt->b2 = (int)opt->b / opt->a;
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s] addjusted the ranking penalty to %d\n", __func__, opt->b2);
}
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
}
@ -131,7 +128,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
io->flag = 0, io->k = 21, io->w = 11;
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT;
mo->pe_ori = 0<<1|1; // FR
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1, mo->b2 = 0;
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1;
mo->zdrop = mo->zdrop_inv = 100;
mo->end_bonus = 10;
mo->max_frag_len = 800;
@ -150,7 +147,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
mo->max_sw_mat = 0;
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000;
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0, mo->b2 = 0;
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
mo->noncan = 9;
mo->junc_bonus = 9;
mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved