r499: end bonus working; DP region needs improve!

This commit is contained in:
Heng Li 2017-10-11 00:14:25 -04:00
parent ca632f907b
commit 7345621759
7 changed files with 45 additions and 34 deletions

36
align.c
View File

@ -186,7 +186,7 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) //
}
}
static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int flag, ksw_extz_t *ez)
static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int end_bonus, int flag, ksw_extz_t *ez)
{
int zdrop = opt->zdrop;
if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
@ -202,7 +202,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
else if (opt->q == opt->q2 && opt->e == opt->e2)
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, flag, ez);
else
ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, flag, ez);
ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez);
}
static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x)
@ -378,12 +378,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
rs0 = (int32_t)a[r->as].x + 1 - (int32_t)(a[r->as].y>>32&0xff);
qs0 = (int32_t)a[r->as].y + 1 - (int32_t)(a[r->as].y>>32&0xff);
if (r->as > 0 && a[r->as - 1].x>>32 == a[r->as].x>>32) {
rs1 = (int32_t)a[r->as - 1].x + 1;
qs1 = (int32_t)a[r->as - 1].y + 1;
} else rs1 = qs1 = 0; // no adjacent previous chain on the same chr
rs1 = (int32_t)a[r->as - 1].x + 1 - (int32_t)(a[r->as - 1].y>>32&0xff);
} else rs1 = 0; // no adjacent previous chain on the same chr
if (qs > 0 && rs > 0) { // actually this is always true
l = qs < max_end_ext? qs : max_end_ext;
qs1 = qs1 > qs - l? qs1 : qs - l; // choose between the max_end_ext bound and the previous seed bound
qs1 = qs - l;
qs0 = qs0 < qs1? qs0 : qs1; // at least include qs0
l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
l = l < max_end_ext? l : max_end_ext;
@ -396,12 +395,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
re0 = (int32_t)a[r->as + r->cnt - 1].x + 1;
qe0 = (int32_t)a[r->as + r->cnt - 1].y + 1;
if (r->as + r->cnt < n_a && a[r->as + r->cnt].x>>32 == a[r->as + r->cnt - 1].x>>32) {
re1 = (int32_t)a[r->as + r->cnt].x + 1 - (int32_t)(a[r->as + r->cnt].y>>32&0xff);
qe1 = (int32_t)a[r->as + r->cnt].y + 1 - (int32_t)(a[r->as + r->cnt].y>>32&0xff);
} else re1 = mi->seq[rid].len, qe1 = qlen; // no adjacent next chain on the same chr
re1 = (int32_t)a[r->as + r->cnt].x + 1;
} else re1 = mi->seq[rid].len; // no adjacent next chain on the same chr
if (qe < qlen && re < mi->seq[rid].len) {
l = qlen - qe < max_end_ext? qlen - qe : max_end_ext;
qe1 = qe1 < qe + l? qe1 : qe + l; // choose between the max_end_ext bound and the next seed bound
qe1 = qe + l;
qe0 = qe0 > qe1? qe0 : qe1; // at least include qe0
l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
l = l < max_end_ext? l : max_end_ext;
@ -418,13 +416,13 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
mm_idx_getseq(mi, rid, rs0, rs, tseq);
mm_seq_rev(qs - qs0, qseq);
mm_seq_rev(rs - rs0, tseq);
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, opt->end_bonus, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar);
r->p->dp_score += ez->max;
}
rs1 = rs - (ez->max_t + 1);
qs1 = qs - (ez->max_q + 1);
rs1 = rs - (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1);
qs1 = qs - (ez->reach_end? qs - qs0 : ez->max_q + 1);
mm_seq_rev(qs - qs0, qseq);
} else rs1 = rs, qs1 = qs;
re1 = rs, qe1 = qs;
@ -452,10 +450,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
}
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs);
} else { // perform normal gapped alignment
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
}
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez); // second pass: lift approximate
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, extra_flag, ez); // second pass: lift approximate
if (ez->n_cigar > 0)
mm_append_cigar(r, ez->n_cigar, ez->cigar);
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
@ -477,13 +475,13 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (!dropped && qe < qe0 && re < re0) { // right extension
qseq = &qseq0[rev][qe];
mm_idx_getseq(mi, rid, re, re0, tseq);
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, opt->end_bonus, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar);
r->p->dp_score += ez->max;
}
re1 = re + (ez->max_t + 1);
qe1 = qe + (ez->max_q + 1);
re1 = re + (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1);
qe1 = qe + (ez->reach_end? qe0 - qe : ez->max_q + 1);
}
assert(qe1 <= qlen);
@ -533,7 +531,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
mm_seq_rev(tl, tseq);
if (score < opt->min_dp_max) goto end_align1_inv;
q_off = ql - (q_off + 1), t_off = tl - (t_off + 1);
mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, mat, (int)(opt->bw * 1.5), KSW_EZ_EXTZ_ONLY, ez);
mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, mat, (int)(opt->bw * 1.5), -1, KSW_EZ_EXTZ_ONLY, ez);
if (ez->n_cigar == 0) goto end_align1_inv; // should never be here
mm_append_cigar(r_inv, ez->n_cigar, ez->cigar);
r_inv->p->dp_score = ez->max;

12
ksw2.h
View File

@ -26,6 +26,7 @@ typedef struct {
int mte, mte_q; // max score when reaching the end of target
int score; // max score reaching both ends; may be KSW_NEG_INF
int m_cigar, n_cigar;
int reach_end;
uint32_t *cigar;
} ksw_extz_t;
@ -46,14 +47,17 @@ typedef struct {
* @param flag flag (see KSW_EZ_* macros)
* @param ez (out) scores and cigar
*/
void ksw_extz(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez);
void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez);
void ksw_extz(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez);
void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez);
void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez);
@ -149,7 +153,7 @@ static inline void ksw_reset_extz(ksw_extz_t *ez)
{
ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1;
ez->max = 0, ez->score = ez->mqe = ez->mte = KSW_NEG_INF;
ez->n_cigar = 0, ez->zdropped = 0;
ez->n_cigar = 0, ez->zdropped = 0, ez->reach_end = 0;
}
static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, int b, int zdrop, int8_t e)

View File

@ -64,18 +64,18 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
}
void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez)
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
{
extern void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez);
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
extern void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez);
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
unsigned simd;
simd = x86_simd();
if (simd & SIMD_SSE4_1)
ksw_extd2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, flag, ez);
ksw_extd2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, end_bonus, flag, ez);
else if (simd & SIMD_SSE2)
ksw_extd2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, flag, ez);
ksw_extd2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, end_bonus, flag, ez);
else abort();
}

View File

@ -17,14 +17,14 @@
#ifdef KSW_CPU_DISPATCH
#ifdef __SSE4_1__
void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez)
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
#else
void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez)
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
#endif
#else
void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez)
int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
#endif // ~KSW_CPU_DISPATCH
{
#define __dp_code_block1 \
@ -380,10 +380,14 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (!approx_max) kfree(km, H);
if (with_cigar) { // backtrack
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) {
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
else if (ez->max_t >= 0 && ez->max_q >= 0)
} else if (!ez->zdropped && (flag&KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > ez->max) {
ez->reach_end = 1;
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->mqe_t, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
} else if (ez->max_t >= 0 && ez->max_q >= 0) {
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
}
kfree(km, mem2); kfree(km, off);
}
}

4
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.2-r498-dirty"
#define MM_VERSION "2.2-r499-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -40,6 +40,7 @@ static struct option long_options[] = {
{ "frag", optional_argument, 0, 0 },
{ "print-2nd", optional_argument, 0, 0 },
{ "cs", optional_argument, 0, 0 },
{ "end-bonus", required_argument, 0, 0 },
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
@ -130,6 +131,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr
else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus
else if (c == 0 && long_idx == 14) { // --frag
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
opt.flag |= MM_F_FRAG_MODE;

2
map.c
View File

@ -33,6 +33,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
opt->zdrop = 400;
opt->end_bonus = -1;
opt->min_dp_max = opt->min_chain_score * opt->a;
opt->min_ksw_len = 200;
opt->mini_batch_size = 200000000;
@ -86,6 +87,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->pe_ori = 0<<1|1; // FR
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 32, mo->e2 = 1;
mo->zdrop = 100;
mo->end_bonus = 11;
mo->max_frag_len = 800;
mo->max_gap = 100;
mo->bw = 100;

View File

@ -104,6 +104,7 @@ typedef struct {
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int noncan; // cost of non-canonical splicing sites
int zdrop; // break alignment if alignment score drops too fast along the diagonal
int end_bonus;
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
int min_ksw_len;