dev-r1078: decoupling ranking penalty
This commit is contained in:
parent
15118dd521
commit
8a6edab847
13
align.c
13
align.c
|
|
@ -237,7 +237,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
|
|||
r->p = p;
|
||||
}
|
||||
|
||||
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap)
|
||||
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int b2)
|
||||
{
|
||||
uint32_t k, l;
|
||||
int32_t qshift, tshift, toff = 0, qoff = 0;
|
||||
|
|
@ -255,7 +255,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
int cq = qseq[qoff + l], ct = tseq[toff + l];
|
||||
if (ct > 3 || cq > 3) ++n_ambi;
|
||||
else if (ct != cq) ++n_diff;
|
||||
s += mat[ct * 5 + cq];
|
||||
s += b2 <= 0? mat[ct * 5 + cq] : (ct > 3 || cq > 3)? 0 : ct == cq? 1 : -b2;
|
||||
if (s < 0) s = 0;
|
||||
else max = max > s? max : s;
|
||||
}
|
||||
|
|
@ -266,7 +266,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
for (l = 0; l < len; ++l)
|
||||
if (qseq[qoff + l] > 3) ++n_ambi;
|
||||
r->blen += len - n_ambi, p->n_ambi += n_ambi;
|
||||
if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
|
||||
if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len);
|
||||
else s -= q + e * len;
|
||||
if (s < 0) s = 0;
|
||||
qoff += len;
|
||||
|
|
@ -275,7 +275,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
for (l = 0; l < len; ++l)
|
||||
if (tseq[toff + l] > 3) ++n_ambi;
|
||||
r->blen += len - n_ambi, p->n_ambi += n_ambi;
|
||||
if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
|
||||
if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len);
|
||||
else s -= q + e * len;
|
||||
if (s < 0) s = 0;
|
||||
toff += len;
|
||||
|
|
@ -284,6 +284,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
}
|
||||
}
|
||||
p->dp_max = (int32_t)(max + .499);
|
||||
if (b2 > 0) p->dp_max *= mat[0]; // for compatibility with mm_set_mapq()
|
||||
assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
|
||||
if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
|
||||
}
|
||||
|
|
@ -816,7 +817,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
mm_idx_getseq(mi, rid, rs1, re1, tseq);
|
||||
qseq = &qseq0[r->rev][qs1];
|
||||
}
|
||||
mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag&MM_F_NO_LOG_GAP));
|
||||
mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2);
|
||||
if (rev && r->p->trans_strand)
|
||||
r->p->trans_strand ^= 3; // flip to the read strand
|
||||
}
|
||||
|
|
@ -875,7 +876,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
|
|||
}
|
||||
r_inv->rs = r1->re + t_off;
|
||||
r_inv->re = r_inv->rs + ez->max_t + 1;
|
||||
mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag&MM_F_NO_LOG_GAP));
|
||||
mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2);
|
||||
ret = 1;
|
||||
end_align1_inv:
|
||||
kfree(km, tseq);
|
||||
|
|
|
|||
11
main.c
11
main.c
|
|
@ -7,7 +7,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "ketopt.h"
|
||||
|
||||
#define MM_VERSION "2.21-dev-r1076-dirty"
|
||||
#define MM_VERSION "2.21-dev-r1078-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -73,7 +73,6 @@ static ko_longopt_t long_options[] = {
|
|||
{ "mask-len", ko_required_argument, 346 },
|
||||
{ "rmq", ko_optional_argument, 347 },
|
||||
{ "qstrand", ko_no_argument, 348 },
|
||||
{ "log-gap", ko_required_argument, 349 },
|
||||
{ "help", ko_no_argument, 'h' },
|
||||
{ "max-intron-len", ko_required_argument, 'G' },
|
||||
{ "version", ko_no_argument, 'V' },
|
||||
|
|
@ -117,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
|
|||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:";
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:b:";
|
||||
ketopt_t o = KETOPT_INIT;
|
||||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
|
|
@ -183,6 +182,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'R') rg = o.arg;
|
||||
else if (c == 'h') fp_help = stdout;
|
||||
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
|
||||
else if (c == 'b') opt.b2 = atoi(o.arg);
|
||||
else if (c == 'o') {
|
||||
if (strcmp(o.arg, "-") != 0) {
|
||||
if (freopen(o.arg, "wb", stdout) == NULL) {
|
||||
|
|
@ -253,8 +253,6 @@ int main(int argc, char *argv[])
|
|||
yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0);
|
||||
} else if (c == 347) { // --rmq
|
||||
yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1);
|
||||
} else if (c == 349) { // --log-gap
|
||||
yes_or_no(&opt, MM_F_NO_LOG_GAP, o.longidx, o.arg, 0);
|
||||
} else if (c == 'S') {
|
||||
opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG;
|
||||
if (mm_verbose >= 2)
|
||||
|
|
@ -331,9 +329,10 @@ int main(int argc, char *argv[])
|
|||
fprintf(fp_help, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n);
|
||||
fprintf(fp_help, " Alignment:\n");
|
||||
fprintf(fp_help, " -A INT matching score [%d]\n", opt.a);
|
||||
fprintf(fp_help, " -B INT mismatch penalty [%d]\n", opt.b);
|
||||
fprintf(fp_help, " -B INT mismatch penalty (larger value for lower divergence) [%d]\n", opt.b);
|
||||
fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
|
||||
fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
|
||||
fprintf(fp_help, " -b INT ranking penalty (independent of A/B/O/E; 0 to disable) [%d]\n", opt.b2);
|
||||
fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv);
|
||||
fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
|
||||
fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
|
||||
|
|
|
|||
|
|
@ -39,7 +39,6 @@
|
|||
#define MM_F_RMQ (0x80000000LL)
|
||||
#define MM_F_QSTRAND (0x100000000LL)
|
||||
#define MM_F_NO_INV (0x200000000LL)
|
||||
#define MM_F_NO_LOG_GAP (0x400000000LL)
|
||||
|
||||
#define MM_I_HPC 0x1
|
||||
#define MM_I_NO_SEQ 0x2
|
||||
|
|
@ -148,6 +147,7 @@ typedef struct {
|
|||
float alt_drop;
|
||||
|
||||
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
||||
int b2; // used for re-ranking hits
|
||||
int sc_ambi; // score when one or both bases are "N"
|
||||
int noncan; // cost of non-canonical splicing sites
|
||||
int junc_bonus;
|
||||
|
|
|
|||
|
|
@ -573,7 +573,7 @@ Up to 20% sequence divergence.
|
|||
.B splice
|
||||
Long-read spliced alignment
|
||||
.RB ( -k15
|
||||
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
|
||||
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -b0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
|
||||
.BR --splice-flank=yes ).
|
||||
In the splice mode, 1) long deletions are taken as introns and represented as
|
||||
the
|
||||
|
|
@ -592,7 +592,7 @@ Long-read splice alignment for PacBio CCS reads
|
|||
.B sr
|
||||
Short single-end reads without splicing
|
||||
.RB ( -k21
|
||||
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m20
|
||||
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20
|
||||
.B -s40 -g100 -2K50m --heap-sort=yes
|
||||
.BR --secondary=no ).
|
||||
.TP
|
||||
|
|
|
|||
10
options.c
10
options.c
|
|
@ -43,6 +43,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->alt_drop = 0.15f;
|
||||
|
||||
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
|
||||
opt->b2 = 5;
|
||||
opt->sc_ambi = 1;
|
||||
opt->zdrop = 400, opt->zdrop_inv = 200;
|
||||
opt->end_bonus = -1;
|
||||
|
|
@ -68,6 +69,11 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
|
|||
if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
|
||||
opt->mid_occ = opt->max_mid_occ;
|
||||
}
|
||||
if (opt->b2 > 0 && opt->b2 * opt->a < opt->b) {
|
||||
opt->b2 = (int)opt->b / opt->a;
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] addjusted the ranking penalty to %d\n", __func__, opt->b2);
|
||||
}
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
|
||||
}
|
||||
|
|
@ -125,7 +131,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
io->flag = 0, io->k = 21, io->w = 11;
|
||||
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT;
|
||||
mo->pe_ori = 0<<1|1; // FR
|
||||
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1;
|
||||
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1, mo->b2 = 0;
|
||||
mo->zdrop = mo->zdrop_inv = 100;
|
||||
mo->end_bonus = 10;
|
||||
mo->max_frag_len = 800;
|
||||
|
|
@ -144,7 +150,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
|
||||
mo->max_sw_mat = 0;
|
||||
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000;
|
||||
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
|
||||
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0, mo->b2 = 0;
|
||||
mo->noncan = 9;
|
||||
mo->junc_bonus = 9;
|
||||
mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved
|
||||
|
|
|
|||
Loading…
Reference in New Issue