diff --git a/Makefile b/Makefile index e354af5..a415f12 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ CC= gcc CFLAGS= -g -Wall -O2 -Wc++-compat CPPFLAGS= -DHAVE_KALLOC INCLUDES= -I. -OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o misc.o bseq.o \ +OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_ll_sse.o misc.o bseq.o \ sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite diff --git a/align.c b/align.c index ba2cf62..8945af4 100644 --- a/align.c +++ b/align.c @@ -357,9 +357,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez) { - int tl, ql, ret = 0; + int tl, ql, score, ret = 0, q_off, t_off; uint8_t *tseq, *qseq; int8_t mat[25]; + void *qp; + memset(r_inv, 0, sizeof(mm_reg1_t)); if (!(r1->split&1) || !(r2->split&2)) return 0; if (r1->id != r1->parent && r1->parent != MM_PARENT_TMP_PRI) return 0; @@ -369,28 +371,34 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i tl = r2->rs - r1->re; if (ql < 0 || ql > opt->max_gap) return 0; if (tl < 0 || tl > opt->max_gap) return 0; + ksw_gen_simple_mat(5, mat, opt->a, opt->b); tseq = (uint8_t*)kmalloc(km, tl); mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq); qseq = &qseq0[!r1->rev][qlen - r2->qs]; - mm_align_pair(km, opt, ql, qseq, tl, tseq, mat, (int)(opt->bw * 1.5), KSW_EZ_APPROX_MAX, ez); - if (ez->n_cigar > 0 && ez->score > 0) { - mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); - r_inv->p->dp_score = ez->score; - mm_update_extra(r_inv->p, qseq, tseq, mat, opt->q, opt->e); - if (r_inv->p->dp_max < opt->min_dp_max) { // discard this hit - free(r_inv->p); - r_inv->p = 0; - } else { - r_inv->id = -1; - r_inv->parent = MM_PARENT_UNSET; - r_inv->inv = 1; - r_inv->rev = !r1->rev; - r_inv->qs = r1->qe, r_inv->qe = r2->qs; - r_inv->rs = r1->re, r_inv->re = r2->rs; - ret = 1; - } - } + + mm_seq_rev(ql, qseq); + mm_seq_rev(tl, tseq); + qp = ksw_ll_qinit(km, 2, ql, qseq, 5, mat); + score = ksw_ll_i16(qp, tl, tseq, opt->q, opt->e, &q_off, &t_off); + kfree(km, qp); + mm_seq_rev(ql, qseq); + mm_seq_rev(tl, tseq); + if (score < opt->min_dp_max) goto end_align1_inv; + q_off = ql - (q_off + 1), t_off = tl - (t_off + 1); + mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, mat, (int)(opt->bw * 1.5), KSW_EZ_EXTZ_ONLY, ez); + if (ez->n_cigar == 0) goto end_align1_inv; // should never be here + mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); + r_inv->p->dp_score = ez->max; + mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e); + r_inv->id = -1; + r_inv->parent = MM_PARENT_UNSET; + r_inv->inv = 1; + r_inv->rev = !r1->rev; + r_inv->qs = r1->qe + q_off, r_inv->qe = r_inv->qs + ez->max_q + 1; + r_inv->rs = r1->re + t_off, r_inv->re = r_inv->rs + ez->max_t + 1; + ret = 1; +end_align1_inv: kfree(km, tseq); return ret; } diff --git a/ksw2.h b/ksw2.h index a925642..da64afe 100644 --- a/ksw2.h +++ b/ksw2.h @@ -67,6 +67,9 @@ int ksw_gg(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *ta int ksw_gg2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_); int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_); +void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat); +int ksw_ll_i16(void *q, int tlen, const uint8_t *target, int gapo, int gape, int *qe, int *te); + #ifdef __cplusplus } #endif diff --git a/ksw2_ll_sse.c b/ksw2_ll_sse.c new file mode 100644 index 0000000..37311f7 --- /dev/null +++ b/ksw2_ll_sse.c @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include "ksw2.h" + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +typedef struct { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m128i *qp, *H0, *H1, *E, *Hmax; +} kswq_t; + +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +{ + kswq_t *q; + int slen, a, tmp, p; + + size = size > 1? 2 : 1; + p = 8 * (3 - size); // # values per __m128i + slen = (qlen + p - 1) / p; // segmented length + q = (kswq_t*)kmalloc(km, sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; q->qlen = qlen; q->size = size; + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; + } + q->max = q->mdiff; + q->shift = 256 - q->shift; // NB: q->shift is uint8_t + q->mdiff += q->shift; // this is the difference between the min and max scores + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]); + } + } + return q; +} + +int ksw_ll_i16(void *q_, int tlen, const uint8_t *target, int _gapo, int _gape, int *qe, int *te) +{ + kswq_t *q = (kswq_t*)q_; + int slen, i, gmax = 0, qlen8; + __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; + uint16_t *H8; + +#define __max_8(ret, xx) do { \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ + (ret) = _mm_extract_epi16((xx), 0); \ + } while (0) + + // initialization + *qe = *te = -1; + zero = _mm_set1_epi32(0); + gapoe = _mm_set1_epi16(_gapo + _gape); + gape = _mm_set1_epi16(_gape); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen, qlen8 = slen * 8; + memset(E, 0, slen * sizeof(__m128i)); + memset(H0, 0, slen * sizeof(__m128i)); + memset(Hmax, 0, slen * sizeof(__m128i)); + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_adds_epi16(h, *S++); + e = _mm_load_si128(E + j); + h = _mm_max_epi16(h, e); + h = _mm_max_epi16(h, f); + max = _mm_max_epi16(max, h); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + e = _mm_subs_epu16(e, gape); + e = _mm_max_epi16(e, h); + _mm_store_si128(E + j, e); + f = _mm_subs_epu16(f, gape); + f = _mm_max_epi16(f, h); + h = _mm_load_si128(H0 + j); + } + for (k = 0; LIKELY(k < 16); ++k) { + f = _mm_slli_si128(f, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epi16(h, f); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + f = _mm_subs_epu16(f, gape); + if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop_i16; + } + } +end_loop_i16: + __max_8(imax, max); + if (imax >= gmax) { + gmax = imax; *te = i; + memcpy(Hmax, H1, slen * sizeof(__m128i)); + } + S = H1; H1 = H0; H0 = S; + } + for (i = 0, H8 = (uint16_t*)Hmax; i < qlen8; ++i) + if ((int)H8[i] == gmax) *qe = i / 8 + i % 8 * slen; + return gmax; +} diff --git a/main.c b/main.c index aec1adf..f435158 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r226-dirty" +#define MM_VERSION "2.0-r227-dirty" void liftrlimit() {