dev-r1078: decoupling ranking penalty

This commit is contained in:
Heng Li 2021-07-18 16:22:48 -04:00
parent 15118dd521
commit 8a6edab847
5 changed files with 23 additions and 17 deletions

13
align.c
View File

@ -237,7 +237,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
r->p = p; r->p = p;
} }
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap) static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int b2)
{ {
uint32_t k, l; uint32_t k, l;
int32_t qshift, tshift, toff = 0, qoff = 0; int32_t qshift, tshift, toff = 0, qoff = 0;
@ -255,7 +255,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
int cq = qseq[qoff + l], ct = tseq[toff + l]; int cq = qseq[qoff + l], ct = tseq[toff + l];
if (ct > 3 || cq > 3) ++n_ambi; if (ct > 3 || cq > 3) ++n_ambi;
else if (ct != cq) ++n_diff; else if (ct != cq) ++n_diff;
s += mat[ct * 5 + cq]; s += b2 <= 0? mat[ct * 5 + cq] : (ct > 3 || cq > 3)? 0 : ct == cq? 1 : -b2;
if (s < 0) s = 0; if (s < 0) s = 0;
else max = max > s? max : s; else max = max > s? max : s;
} }
@ -266,7 +266,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
for (l = 0; l < len; ++l) for (l = 0; l < len; ++l)
if (qseq[qoff + l] > 3) ++n_ambi; if (qseq[qoff + l] > 3) ++n_ambi;
r->blen += len - n_ambi, p->n_ambi += n_ambi; r->blen += len - n_ambi, p->n_ambi += n_ambi;
if (log_gap) s -= q + (double)e * mg_log2(1.0 + len); if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len);
else s -= q + e * len; else s -= q + e * len;
if (s < 0) s = 0; if (s < 0) s = 0;
qoff += len; qoff += len;
@ -275,7 +275,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
for (l = 0; l < len; ++l) for (l = 0; l < len; ++l)
if (tseq[toff + l] > 3) ++n_ambi; if (tseq[toff + l] > 3) ++n_ambi;
r->blen += len - n_ambi, p->n_ambi += n_ambi; r->blen += len - n_ambi, p->n_ambi += n_ambi;
if (log_gap) s -= q + (double)e * mg_log2(1.0 + len); if (b2 > 0) s -= b2 + (double)mg_log2(1.0 + len);
else s -= q + e * len; else s -= q + e * len;
if (s < 0) s = 0; if (s < 0) s = 0;
toff += len; toff += len;
@ -284,6 +284,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
} }
} }
p->dp_max = (int32_t)(max + .499); p->dp_max = (int32_t)(max + .499);
if (b2 > 0) p->dp_max *= mat[0]; // for compatibility with mm_set_mapq()
assert(qoff == r->qe - r->qs && toff == r->re - r->rs); assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
} }
@ -816,7 +817,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
mm_idx_getseq(mi, rid, rs1, re1, tseq); mm_idx_getseq(mi, rid, rs1, re1, tseq);
qseq = &qseq0[r->rev][qs1]; qseq = &qseq0[r->rev][qs1];
} }
mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag&MM_F_NO_LOG_GAP)); mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2);
if (rev && r->p->trans_strand) if (rev && r->p->trans_strand)
r->p->trans_strand ^= 3; // flip to the read strand r->p->trans_strand ^= 3; // flip to the read strand
} }
@ -875,7 +876,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
} }
r_inv->rs = r1->re + t_off; r_inv->rs = r1->re + t_off;
r_inv->re = r_inv->rs + ez->max_t + 1; r_inv->re = r_inv->rs + ez->max_t + 1;
mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag&MM_F_NO_LOG_GAP)); mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, opt->b2);
ret = 1; ret = 1;
end_align1_inv: end_align1_inv:
kfree(km, tseq); kfree(km, tseq);

11
main.c
View File

@ -7,7 +7,7 @@
#include "mmpriv.h" #include "mmpriv.h"
#include "ketopt.h" #include "ketopt.h"
#define MM_VERSION "2.21-dev-r1076-dirty" #define MM_VERSION "2.21-dev-r1078-dirty"
#ifdef __linux__ #ifdef __linux__
#include <sys/resource.h> #include <sys/resource.h>
@ -73,7 +73,6 @@ static ko_longopt_t long_options[] = {
{ "mask-len", ko_required_argument, 346 }, { "mask-len", ko_required_argument, 346 },
{ "rmq", ko_optional_argument, 347 }, { "rmq", ko_optional_argument, 347 },
{ "qstrand", ko_no_argument, 348 }, { "qstrand", ko_no_argument, 348 },
{ "log-gap", ko_required_argument, 349 },
{ "help", ko_no_argument, 'h' }, { "help", ko_no_argument, 'h' },
{ "max-intron-len", ko_required_argument, 'G' }, { "max-intron-len", ko_required_argument, 'G' },
{ "version", ko_no_argument, 'V' }, { "version", ko_no_argument, 'V' },
@ -117,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:"; const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:b:";
ketopt_t o = KETOPT_INIT; ketopt_t o = KETOPT_INIT;
mm_mapopt_t opt; mm_mapopt_t opt;
mm_idxopt_t ipt; mm_idxopt_t ipt;
@ -183,6 +182,7 @@ int main(int argc, char *argv[])
else if (c == 'R') rg = o.arg; else if (c == 'R') rg = o.arg;
else if (c == 'h') fp_help = stdout; else if (c == 'h') fp_help = stdout;
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS; else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
else if (c == 'b') opt.b2 = atoi(o.arg);
else if (c == 'o') { else if (c == 'o') {
if (strcmp(o.arg, "-") != 0) { if (strcmp(o.arg, "-") != 0) {
if (freopen(o.arg, "wb", stdout) == NULL) { if (freopen(o.arg, "wb", stdout) == NULL) {
@ -253,8 +253,6 @@ int main(int argc, char *argv[])
yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0); yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0);
} else if (c == 347) { // --rmq } else if (c == 347) { // --rmq
yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1); yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1);
} else if (c == 349) { // --log-gap
yes_or_no(&opt, MM_F_NO_LOG_GAP, o.longidx, o.arg, 0);
} else if (c == 'S') { } else if (c == 'S') {
opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG;
if (mm_verbose >= 2) if (mm_verbose >= 2)
@ -331,9 +329,10 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); fprintf(fp_help, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n);
fprintf(fp_help, " Alignment:\n"); fprintf(fp_help, " Alignment:\n");
fprintf(fp_help, " -A INT matching score [%d]\n", opt.a); fprintf(fp_help, " -A INT matching score [%d]\n", opt.a);
fprintf(fp_help, " -B INT mismatch penalty [%d]\n", opt.b); fprintf(fp_help, " -B INT mismatch penalty (larger value for lower divergence) [%d]\n", opt.b);
fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2); fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
fprintf(fp_help, " -b INT ranking penalty (independent of A/B/O/E; 0 to disable) [%d]\n", opt.b2);
fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv); fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv);
fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");

View File

@ -39,7 +39,6 @@
#define MM_F_RMQ (0x80000000LL) #define MM_F_RMQ (0x80000000LL)
#define MM_F_QSTRAND (0x100000000LL) #define MM_F_QSTRAND (0x100000000LL)
#define MM_F_NO_INV (0x200000000LL) #define MM_F_NO_INV (0x200000000LL)
#define MM_F_NO_LOG_GAP (0x400000000LL)
#define MM_I_HPC 0x1 #define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2 #define MM_I_NO_SEQ 0x2
@ -148,6 +147,7 @@ typedef struct {
float alt_drop; float alt_drop;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int b2; // used for re-ranking hits
int sc_ambi; // score when one or both bases are "N" int sc_ambi; // score when one or both bases are "N"
int noncan; // cost of non-canonical splicing sites int noncan; // cost of non-canonical splicing sites
int junc_bonus; int junc_bonus;

View File

@ -573,7 +573,7 @@ Up to 20% sequence divergence.
.B splice .B splice
Long-read spliced alignment Long-read spliced alignment
.RB ( -k15 .RB ( -k15
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 .B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -b0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
.BR --splice-flank=yes ). .BR --splice-flank=yes ).
In the splice mode, 1) long deletions are taken as introns and represented as In the splice mode, 1) long deletions are taken as introns and represented as
the the
@ -592,7 +592,7 @@ Long-read splice alignment for PacBio CCS reads
.B sr .B sr
Short single-end reads without splicing Short single-end reads without splicing
.RB ( -k21 .RB ( -k21
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20
.B -s40 -g100 -2K50m --heap-sort=yes .B -s40 -g100 -2K50m --heap-sort=yes
.BR --secondary=no ). .BR --secondary=no ).
.TP .TP

View File

@ -43,6 +43,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->alt_drop = 0.15f; opt->alt_drop = 0.15f;
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
opt->b2 = 5;
opt->sc_ambi = 1; opt->sc_ambi = 1;
opt->zdrop = 400, opt->zdrop_inv = 200; opt->zdrop = 400, opt->zdrop_inv = 200;
opt->end_bonus = -1; opt->end_bonus = -1;
@ -68,6 +69,11 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ) if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
opt->mid_occ = opt->max_mid_occ; opt->mid_occ = opt->max_mid_occ;
} }
if (opt->b2 > 0 && opt->b2 * opt->a < opt->b) {
opt->b2 = (int)opt->b / opt->a;
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s] addjusted the ranking penalty to %d\n", __func__, opt->b2);
}
if (mm_verbose >= 3) if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
} }
@ -125,7 +131,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
io->flag = 0, io->k = 21, io->w = 11; io->flag = 0, io->k = 21, io->w = 11;
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT; mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT;
mo->pe_ori = 0<<1|1; // FR mo->pe_ori = 0<<1|1; // FR
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1; mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1, mo->b2 = 0;
mo->zdrop = mo->zdrop_inv = 100; mo->zdrop = mo->zdrop_inv = 100;
mo->end_bonus = 10; mo->end_bonus = 10;
mo->max_frag_len = 800; mo->max_frag_len = 800;
@ -144,7 +150,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
mo->max_sw_mat = 0; mo->max_sw_mat = 0;
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000; mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000;
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0, mo->b2 = 0;
mo->noncan = 9; mo->noncan = 9;
mo->junc_bonus = 9; mo->junc_bonus = 9;
mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved