diff --git a/align.c b/align.c index 1bd316b..264b23d 100644 --- a/align.c +++ b/align.c @@ -28,39 +28,63 @@ static inline void mm_seq_rev(uint32_t len, uint8_t *seq) t = seq[i], seq[i] = seq[len - 1 - i], seq[len - 1 - i] = t; } -static inline int test_zdrop_aux(int32_t score, int i, int j, int32_t *max, int *max_i, int *max_j, int e, int zdrop) +static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, int *max_i, int *max_j, int e, int *max_zdrop, int pos[2][2]) { if (score < *max) { int li = i - *max_i; int lj = j - *max_j; int diff = li > lj? li - lj : lj - li; - if (*max - score > zdrop + diff * e) - return 1; + int z = *max - score - diff * e; + if (z > *max_zdrop) { + *max_zdrop = z; + pos[0][0] = *max_i, pos[0][1] = i + 1; + pos[1][0] = *max_j, pos[1][1] = j + 1; + } } else *max = score, *max_i = i, *max_j = j; - return 0; } -static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_cigar, uint32_t *cigar, const int8_t *mat, int8_t q, int8_t e, int zdrop) +static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq, const uint8_t *tseq, uint32_t n_cigar, uint32_t *cigar, const int8_t *mat) { uint32_t k; - int32_t score = 0, max = 0, max_i = -1, max_j = -1, i = 0, j = 0; - for (k = 0; k < n_cigar; ++k) { + int32_t score = 0, max = 0, max_i = -1, max_j = -1, i = 0, j = 0, max_zdrop = 0; + int pos[2][2] = {{-1, -1}, {-1, -1}}, q_len, t_len; + + // find the score and the region where score drops most along diagonal + for (k = 0, score = 0; k < n_cigar; ++k) { uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4; if (op == 0) { for (l = 0; l < len; ++l) { score += mat[tseq[i + l] * 5 + qseq[j + l]]; - if (test_zdrop_aux(score, i+l, j+l, &max, &max_i, &max_j, e, zdrop)) return 1; + update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); } i += len, j += len; - } else if (op == 1) { - score -= q + e * len, j += len; - if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1; - } else if (op == 2 || op == 3) { - score -= q + e * len, i += len; - if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1; + } else if (op == 1 || op == 2 || op == 3) { + score -= opt->q + opt->e * len; + if (op == 1) j += len; // insertion + else i += len; // deletion + update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); } } - return 0; + + // test if there is an inversion in the most dropped region + q_len = pos[1][1] - pos[1][0], t_len = pos[0][1] - pos[0][0]; + if (!(opt->flag&(MM_F_SPLICE|MM_F_SR|MM_F_FOR_ONLY|MM_F_REV_ONLY)) && max_zdrop > opt->zdrop_inv && q_len < opt->max_gap && t_len < opt->max_gap) { + uint8_t *qseq2; + void *qp; + int q_off, t_off; + qseq2 = (uint8_t*)kmalloc(km, q_len); + for (i = 0; i < q_len; ++i) { + int c = qseq[pos[1][1] - i - 1]; + qseq2[i] = c >= 4? 4 : 3 - c; + } + qp = ksw_ll_qinit(km, 2, q_len, qseq2, 5, mat); + score = ksw_ll_i16(qp, t_len, tseq + pos[0][0], opt->q, opt->e, &q_off, &t_off); + kfree(km, qseq2); + kfree(km, qp); + if (score >= opt->min_chain_score * opt->a && score >= opt->min_dp_max) + return 2; // there is a potential inversion + } + return max_zdrop > opt->zdrop? 1 : 0; } static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, int *qshift, int *tshift) @@ -193,9 +217,8 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // } } -static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int end_bonus, int flag, ksw_extz_t *ez) +static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int end_bonus, int zdrop, int flag, ksw_extz_t *ez) { - int zdrop = opt->zdrop; if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) { int i; fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, opt->zdrop); @@ -522,7 +545,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_idx_getseq(mi, rid, rs0, rs, tseq); mm_seq_rev(qs - qs0, qseq); mm_seq_rev(rs - rs0, tseq); - mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, opt->end_bonus, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); + mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, opt->end_bonus, r->split_inv? opt->zdrop_inv : opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; @@ -542,9 +565,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int } else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe); re1 = re, qe1 = qe; if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) { - int j, bw1 = bw; + int j, bw1 = bw, zdrop_code; if (a[as1+i].y & MM_SEED_LONG_JOIN) bw1 = qe - qs > re - rs? qe - qs : re - rs; + // perform alignment qseq = &qseq0[rev][qs]; mm_idx_getseq(mi, rid, rs, re, tseq); if (is_sr) { // perform ungapped alignment @@ -556,10 +580,12 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int } ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs); } else { // perform normal gapped alignment - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop } - if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop)) - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, extra_flag, ez); // second pass: lift approximate + // test Z-drop and inversion Z-drop + if ((zdrop_code = mm_test_zdrop(km, opt, qseq, tseq, ez->n_cigar, ez->cigar, mat)) != 0) + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, -1, zdrop_code == 2? opt->zdrop_inv : opt->zdrop, extra_flag, ez); // second pass: lift approximate + // update CIGAR if (ez->n_cigar > 0) mm_append_cigar(r, ez->n_cigar, ez->cigar); if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle. @@ -571,8 +597,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int r->p->dp_score += ez->max; re1 = rs + (ez->max_t + 1); qe1 = qs + (ez->max_q + 1); - if (cnt1 - (j + 1) >= opt->min_cnt) + if (cnt1 - (j + 1) >= opt->min_cnt) { mm_split_reg(r, r2, as1 + j + 1 - r->as, qlen, a); + if (zdrop_code == 2) r2->split_inv = 1; + } break; } else r->p->dp_score += ez->score; rs = re, qs = qe; @@ -582,7 +610,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (!dropped && qe < qe0 && re < re0) { // right extension qseq = &qseq0[rev][qe]; mm_idx_getseq(mi, rid, re, re0, tseq); - mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, opt->end_bonus, extra_flag|KSW_EZ_EXTZ_ONLY, ez); + mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, opt->end_bonus, opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; @@ -638,7 +666,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i mm_seq_rev(tl, tseq); if (score < opt->min_dp_max) goto end_align1_inv; q_off = ql - (q_off + 1), t_off = tl - (t_off + 1); - mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, mat, (int)(opt->bw * 1.5), -1, KSW_EZ_EXTZ_ONLY, ez); + mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, mat, (int)(opt->bw * 1.5), -1, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez); if (ez->n_cigar == 0) goto end_align1_inv; // should never be here mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); r_inv->p->dp_score = ez->max; @@ -648,8 +676,10 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i r_inv->rev = !r1->rev; r_inv->rid = r1->rid; r_inv->div = -1.0f; - r_inv->qs = r1->qe + q_off, r_inv->qe = r_inv->qs + ez->max_q + 1; - r_inv->rs = r1->re + t_off, r_inv->re = r_inv->rs + ez->max_t + 1; + r_inv->qs = r1->rev? r2->qe + q_off : r1->qe + q_off; + r_inv->qe = r_inv->qs + ez->max_q + 1; + r_inv->rs = r1->re + t_off; + r_inv->re = r_inv->rs + ez->max_t + 1; mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e); ret = 1; end_align1_inv: @@ -710,7 +740,7 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 1 : 2; } if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs); - if (!(opt->flag&(MM_F_SPLICE|MM_F_SR)) && !(opt->flag&(MM_F_FOR_ONLY|MM_F_REV_ONLY)) && i > 0) { // don't try inversion alignment for -xsplice or -xsr, or --for-only/rev-only + if (i > 0 && regs[i].split_inv) { if (mm_align1_inv(km, opt, mi, qlen, qseq0, ®s[i-1], ®s[i], &r2, &ez)) { regs = mm_insert_reg(&r2, i, &n_regs, regs); ++i; // skip the inserted INV alignment diff --git a/hit.c b/hit.c index f427654..33f9f3f 100644 --- a/hit.c +++ b/hit.c @@ -94,6 +94,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a) r2->id = -1; r2->sam_pri = 0; r2->p = 0; + r2->split_inv = 0; r2->cnt = r->cnt - n; r2->score = (int32_t)(r->score * ((float)r2->cnt / r->cnt) + .499); r2->as = r->as + n; diff --git a/main.c b/main.c index 9421279..6181e34 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.8-r707-dirty" +#define MM_VERSION "2.8-r708-dirty" #ifdef __linux__ #include @@ -139,7 +139,6 @@ int main(int argc, char *argv[]) else if (c == 'm') opt.min_chain_score = atoi(optarg); else if (c == 'A') opt.a = atoi(optarg); else if (c == 'B') opt.b = atoi(optarg); - else if (c == 'z') opt.zdrop = atoi(optarg); else if (c == 's') opt.min_dp_max = atoi(optarg); else if (c == 'C') opt.noncan = atoi(optarg); else if (c == 'I') ipt.batch_size = mm_parse_num(optarg); @@ -209,6 +208,9 @@ int main(int argc, char *argv[]) fprintf(stderr, "[ERROR]\033[1;31m unrecognized cDNA direction\033[0m\n"); return 1; } + } else if (c == 'z') { + opt.zdrop = opt.zdrop_inv = strtol(optarg, &s, 10); + if (*s == ',') opt.zdrop_inv = strtol(s + 1, &s, 10); } else if (c == 'O') { opt.q = opt.q2 = strtol(optarg, &s, 10); if (*s == ',') opt.q2 = strtol(s + 1, &s, 10); @@ -252,7 +254,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -B INT mismatch penalty [%d]\n", opt.b); fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2); fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); - fprintf(fp_help, " -z INT Z-drop score [%d]\n", opt.zdrop); + fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv); fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); fprintf(fp_help, " Input/Output:\n"); diff --git a/minimap.h b/minimap.h index b6b9a00..f4bde58 100644 --- a/minimap.h +++ b/minimap.h @@ -82,7 +82,7 @@ typedef struct { int32_t mlen, blen; // seeded exact match length; seeded alignment block length int32_t n_sub; // number of suboptimal mappings int32_t score0; // initial chaining score (before chain merging/spliting) - uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, dummy:8; + uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, dummy:7; uint32_t hash; float div; mm_extra_t *p; @@ -116,7 +116,7 @@ typedef struct { int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties int noncan; // cost of non-canonical splicing sites - int zdrop; // break alignment if alignment score drops too fast along the diagonal + int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal int end_bonus; int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold int min_ksw_len; diff --git a/options.c b/options.c index 9e4276c..685fff2 100644 --- a/options.c +++ b/options.c @@ -24,7 +24,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_join_flank_sc = 1000; opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; - opt->zdrop = 400; + opt->zdrop = 400, opt->zdrop_inv = 200; opt->end_bonus = -1; opt->min_dp_max = opt->min_chain_score * opt->a; opt->min_ksw_len = 200; @@ -71,12 +71,12 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag = 0, io->k = 15; } else if (strcmp(preset, "asm5") == 0) { io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = 200; + mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; mo->min_dp_max = 200; mo->best_n = 50; } else if (strcmp(preset, "asm10") == 0) { io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = 200; + mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; mo->min_dp_max = 200; mo->best_n = 50; } else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) { @@ -84,7 +84,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT; mo->pe_ori = 0<<1|1; // FR mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1; - mo->zdrop = 100; + mo->zdrop = mo->zdrop_inv = 100; mo->end_bonus = 10; mo->max_frag_len = 800; mo->max_gap = 100; @@ -103,7 +103,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; mo->noncan = 9; - mo->zdrop = 200; + mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved } else return -1; return 0; } @@ -137,5 +137,10 @@ int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo) fprintf(stderr, "[ERROR]\033[1;31m scoring system violating ({-O}+{-E})+({-O2}+{-E2}) <= 127\033[0m\n"); return -1; } + if (mo->zdrop < mo->zdrop_inv) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m Z-drop should not be less than inversion-Z-drop\033[0m\n"); + return -5; + } return 0; } diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 5295313..4891f98 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -26,7 +26,7 @@ cdef extern from "minimap.h": int min_join_flank_sc int a, b, q, e, q2, e2 int noncan - int zdrop + int zdrop, zdrop_inv int end_bonus int min_dp_max int min_ksw_len