From c8c79ef0241de9693130201a7cc6832900455253 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 6 Nov 2011 16:20:40 -0500 Subject: [PATCH] mate rescue seems working (not MT) --- Makefile | 2 +- bwtsw2.h | 4 +- bwtsw2_aux.c | 64 ++++---- bwtsw2_core.c | 1 + bwtsw2_pair.c | 127 +++++++++++++++- ksw.c | 399 ++++++++++++++++++++++++++++++++++++++++++++++++++ ksw.h | 54 +++++++ 7 files changed, 618 insertions(+), 33 deletions(-) create mode 100644 ksw.c create mode 100644 ksw.h diff --git a/Makefile b/Makefile index 63acf1c..65ce6dd 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ - is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ + is.o bntseq.o bwtmisc.o bwtindex.o ksw.o stdaln.o simple_dp.o \ bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o bamlite.o fastmap.o bwtsw2_pair.o diff --git a/bwtsw2.h b/bwtsw2.h index 3272929..deae04b 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -6,6 +6,8 @@ #include "bwt_lite.h" #include "bwt.h" +#define BSW2_FLAG_MATESW 0x100 + typedef struct { int a, b, q, r, t, qr, bw; int z, is, t_seeds, hard_clip; @@ -50,7 +52,7 @@ extern "C" { bsw2global_t *bsw2_global_init(); void bsw2_global_destroy(bsw2global_t *_pool); - void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); + void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); #ifdef __cplusplus } diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index e992aaf..471c450 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -66,22 +66,14 @@ void bsw2_destroy(bwtsw2_t *b) free(b); } -bwtsw2_t *bsw2_dup(const bwtsw2_t *b) +bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) { bwtsw2_t *p; - int i; p = calloc(1, sizeof(bwtsw2_t)); p->max = p->n = b->n; kroundup32(p->max); p->hits = calloc(p->max, sizeof(bsw2hit_t)); - p->n_cigar = calloc(p->max, sizeof(int)); - p->cigar = calloc(p->max, sizeof(void*)); memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); - for (i = 0; i < p->n; ++i) { - p->n_cigar[i] = b->n_cigar[i]; - p->cigar[i] = malloc(p->n_cigar[i] * 4); - memcpy(p->cigar[i], b->cigar[i], p->n_cigar[i] * 4); - } return p; } @@ -406,9 +398,22 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n return n_cigar; } +static int est_mapq(const bsw2hit_t *p, const bsw2opt_t *opt) +{ + float c = 1.0; + int qual, subo = p->G2 > opt->t? p->G2 : opt->t; + if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; + if (p->n_seeds < 2) c *= .2; + qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); + if (qual > 250) qual = 250; + if (qual < 0) qual = 0; + if (p->flag&1) qual = 0; // this is a random hit + return qual; +} + /* generate SAM lines for a sequence in ks with alignment stored in * b. ks->name and ks->seq will be freed and set to NULL in the end. */ -static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b) +static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate) { int i, k; kstring_t str; @@ -433,18 +438,15 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks nn = bns_cnt_ambi(bns, p->k, p->len, &seqid); coor = p->k - bns->anns[seqid].offset; } - ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10); + ksprintf(&str, "%s\t%d", ks->name, (p->flag&0xff)|(is_pe?1:0)); ksprintf(&str, "\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1); if (p->l == 0) { { // estimate mapping quality - float c = 1.0; - int subo = p->G2 > opt->t? p->G2 : opt->t; - if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; - if (p->n_seeds < 2) c *= .2; - qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); - if (qual > 250) qual = 250; - if (qual < 0) qual = 0; - if (p->flag&1) qual = 0; + qual = est_mapq(p, opt); + if ((p->flag & BSW2_FLAG_MATESW) && bmate && bmate->n == 1) { // this alignment is from Smith-Waterman rescue + int mate_qual = est_mapq(bmate->hits, opt); + qual = qual < mate_qual? qual : mate_qual; + } } ksprintf(&str, "\t%d\t", qual); for (k = 0; k < b->n_cigar[i]; ++k) @@ -469,6 +471,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks } else ksprintf(&str, "\t*"); ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tXN:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, nn); if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); + if (p->flag&BSW2_FLAG_MATESW) ksprintf(&str, "\tXT:i:1"); kputc('\n', &str); } ks->sam = str.s; @@ -526,7 +529,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const rseq[1][i] = c; } if (l - k < opt.t) { // too few unambiguous bases - print_hits(bns, &opt, p, 0); + buf[x] = 0; free(seq[0]); continue; } // alignment @@ -548,17 +551,28 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bsw2_resolve_query_overlaps(b[0], opt.mask_level); } else b[1] = 0; // generate CIGAR and print SAM - gen_cigar(&opt, l, seq, pac, b[0]); - buf[x] = bsw2_dup(b[0]); + buf[x] = bsw2_dup_no_cigar(b[0]); // free free(seq[0]); bsw2_destroy(b[0]); } - bwtsw2_pair(pac, _seq->n, _seq->seq, buf); + if (is_pe) bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf); for (x = 0; x < _seq->n; ++x) { - print_hits(bns, &opt, &_seq->seq[x], buf[x]); - bsw2_destroy(buf[x]); + bsw2seq1_t *p = _seq->seq + x; + uint8_t *seq[2]; + int i; + seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l; + for (i = 0; i < p->l; ++i) { + int c = nst_nt4_table[(int)p->seq[i]]; + if (c >= 4) c = (int)(drand48() * 4); + seq[0][i] = c; + seq[1][p->l-1-i] = 3 - c; + } + gen_cigar(&opt, p->l, seq, pac, buf[x]); + print_hits(bns, &opt, p, buf[x], is_pe, buf[x^1]); + free(seq[0]); } + for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]); free(buf); bsw2_global_destroy(pool); } diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 398a276..67f126c 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -327,6 +327,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int } if (!compatible) { p->G = 0; + if (q->G2 < p->G2) q->G2 = p->G2; break; } } diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index cf88f8e..e4721d4 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "bwt.h" #include "bntseq.h" #include "bwtsw2.h" @@ -10,13 +11,14 @@ #define MIN_RATIO 0.8 #define OUTLIER_BOUND 2.0 #define MAX_STDDEV 4.0 +#define EXT_STDDEV 4.0 typedef struct { int low, high; double avg, std; } bsw2pestat_t; -bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf) +bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf) { extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, k, x, p25, p50, p75, tmp, max_len = 0; @@ -27,11 +29,11 @@ bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf) for (i = k = 0; i < n; i += 2) { bsw2hit_t *t[2]; int l; - if (buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits + if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0]; if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough - l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + (t[1]->end - t[1]->beg) : t[1]->k - t[0]->k + (t[0]->end - t[0]->beg); + l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len; max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg; max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; isize[k++] = l; @@ -64,8 +66,121 @@ bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf) return r; } -void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits) +typedef struct { + int n_cigar, beg, end, len; + int64_t pos; + uint32_t *cigar; +} pairaux_t; + +extern unsigned char nst_nt4_table[256]; +static int8_t g_mat[25]; + +void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a) { - bsw2pestat_t pes; - pes = bwtsw2_stat(n, hits); + extern void seq_reverse(int len, ubyte_t *seq, int is_comp); + int64_t k, beg, end; + uint8_t *seq, *ref; + int i; + ksw_query_t *q; + ksw_aux_t aux[2]; + // compute the region start and end + a->n_seeds = 1; a->l = 0; a->flag |= BSW2_FLAG_MATESW; + if (h->is_rev == 0) { + beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); + end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); + a->is_rev = 1; a->flag |= 16; + } else { + beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499); + end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499); + a->is_rev = 0; + } + if (beg < 1) beg = 1; + if (end > l_pac) end = l_pac; + // generate the sequence + seq = malloc(l_mseq + (end - beg)); + ref = seq + l_mseq; + for (k = beg; k < end; ++k) + ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3; + if (h->is_rev == 0) { + for (i = 0; i < l_mseq; ++i) { // on the reverse strand + int c = nst_nt4_table[(int)mseq[i]]; + seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c; + } + } else { + for (i = 0; i < l_mseq; ++i) // on the forward strand + seq[i] = nst_nt4_table[(int)mseq[i]]; + } + /* The following code can be made up to 2-fold as fast. I am just lazy... */ + // forward Smith-Waterman + aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0]; + q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); + ksw_sse2(q, end - beg, ref, &aux[0]); + free(q); + if (aux[0].score == 0) { + free(seq); + return; + } + // reverse Smith-Waterman + seq_reverse(l_mseq, seq, 0); + seq_reverse(end - beg, ref, 0); + q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); + ksw_sse2(q, end - beg, ref, &aux[1]); + free(q); + aux[1].te = end - beg - 1 - aux[1].te; // change to the forward-strand coordinate + // write output + a->G = aux[0].score; + a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2; + a->k = beg + aux[1].te; + a->len = aux[0].te + 1 - aux[1].te; + a->beg = l_mseq - 1 - aux[1].qe; + a->end = aux[0].qe + 1; + free(seq); +} + +void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits) +{ + extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); + bsw2pestat_t pes; + int i, j, k, n_rescued = 0; + pes = bsw2_stat(n, hits); + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + g_mat[k++] = i == j? opt->a : -opt->b; + g_mat[k++] = 0; + } + for (i = 0; i < n; i += 2) { + bsw2hit_t a[2]; + memset(&a, 0, sizeof(bsw2hit_t) * 2); + a[0].flag = 1<<6; a[1].flag = 1<<7; + for (j = 0; j < 2; ++j) { // set the read1/2 flag + if (hits[i+j] == 0) continue; + for (k = 0; k < hits[i+j]->n; ++k) { + bsw2hit_t *p = &hits[i+j]->hits[k]; + p->flag |= 1<<(6+j); + } + } + if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N + if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit + if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit + if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1]); + if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0]); + // the following enumerate all possibilities. It is tedious but necessary... + //if (strstr(seq[i].name, "22_49258265_49258755_4")) fprintf(stderr, "%lld\t%lld\t(%d,%d)\n", hits[i+1]->hits[0].k, a[1].k, a[0].G, a[0].G2); + if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not + bwtsw2_t *p[2]; + int which; + if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1; + else p[0] = hits[i+1], p[1] = hits[i], which = 0; + if (a[which].G == 0) continue; + if (p[1]->max == 0) { + p[1]->max = 1; + p[1]->hits = malloc(sizeof(bsw2hit_t)); + } + memcpy(p[1]->hits, &a[which], sizeof(bsw2hit_t)); + p[1]->n = 1; + ++n_rescued; + } else { // then both ends mapped + } + } + fprintf(stderr, "[%s] rescued %d reads\n", __func__, n_rescued); } diff --git a/ksw.c b/ksw.c new file mode 100644 index 0000000..c2b5f9c --- /dev/null +++ b/ksw.c @@ -0,0 +1,399 @@ +/* The MIT License + + Copyright (c) 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include +#include +#include +#include "ksw.h" + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +struct _ksw_query_t { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m128i *qp, *H0, *H1, *E, *Hmax; +}; + +ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +{ + ksw_query_t *q; + int slen, a, tmp, p; + + size = size > 1? 2 : 1; + p = 8 * (3 - size); // # values per __m128i + slen = (qlen + p - 1) / p; // segmented length + q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; q->qlen = qlen; q->size = size; + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; + } + q->max = q->mdiff; + q->shift = 256 - q->shift; // NB: q->shift is uint8_t + q->mdiff += q->shift; // this is the difference between the min and max scores + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]); + } + } + return q; +} + +int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0; + uint64_t *b; + __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; + +#define __max_16(ret, xx) do { \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ + (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ + } while (0) + + // initialization + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + gapoe = _mm_set1_epi8(a->gapo + a->gape); + gape = _mm_set1_epi8(a->gape); + shift = _mm_set1_epi8(q->shift); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, cmp, imax; + __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian + for (j = 0; LIKELY(j < slen); ++j) { + /* SW cells are computed in the following order: + * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} + * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} + */ + // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) + h = _mm_adds_epu8(h, _mm_load_si128(S + j)); + h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) + e = _mm_load_si128(E + j); // e=E'(i,j) + h = _mm_max_epu8(h, e); + h = _mm_max_epu8(h, f); // h=H'(i,j) + max = _mm_max_epu8(max, h); // set max + _mm_store_si128(H1 + j, h); // save to H'(i,j) + // now compute E'(i+1,j) + h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo + e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape + e = _mm_max_epu8(e, h); // e=E'(i+1,j) + _mm_store_si128(E + j, e); // save to E'(i+1,j) + // now compute F'(i,j+1) + f = _mm_subs_epu8(f, gape); + f = _mm_max_epu8(f, h); + // get H'(i-1,j) and prepare for the next j + h = _mm_load_si128(H0 + j); // h=H'(i-1,j) + } + // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion + for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max + f = _mm_slli_si128(f, 1); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epu8(h, f); // h=H'(i,j) + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu8(h, gapoe); + f = _mm_subs_epu8(f, gape); + cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); + if (UNLIKELY(cmp == 0xffff)) goto end_loop16; + } + } +end_loop16: + //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); + __max_16(imax, max); // imax is the maximum number in max + if (imax >= a->T) { // write the b array; this condition adds branching unfornately + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append + if (n_b == m_b) { + m_b = m_b? m_b<<1 : 8; + b = realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; // te is the end position on the target + for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax + q->shift >= 255) break; + } + S = H1; H1 = H0; H0 = S; // swap H0 and H1 + } + a->score = gmax; a->te = te; + { // get a->qe, the end of query match; find the 2nd best score + int max = -1, low, high, qlen = slen * 16; + uint8_t *t = (uint8_t*)Hmax; + for (i = 0, a->qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen; + //printf("%d,%d\n", max, gmax); + i = (a->score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0, a->score2 = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) + a->score2 = b[i]>>32, a->te2 = e; + } + } + free(b); + return a->score + q->shift >= 255? 255 : a->score; +} + +int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0; + uint64_t *b; + __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; + +#define __max_8(ret, xx) do { \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ + (ret) = _mm_extract_epi16((xx), 0); \ + } while (0) + + // initialization + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + gapoe = _mm_set1_epi16(a->gapo + a->gape); + gape = _mm_set1_epi16(a->gape); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_adds_epi16(h, *S++); + e = _mm_load_si128(E + j); + h = _mm_max_epi16(h, e); + h = _mm_max_epi16(h, f); + max = _mm_max_epi16(max, h); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + e = _mm_subs_epu16(e, gape); + e = _mm_max_epi16(e, h); + _mm_store_si128(E + j, e); + f = _mm_subs_epu16(f, gape); + f = _mm_max_epi16(f, h); + h = _mm_load_si128(H0 + j); + } + for (k = 0; LIKELY(k < 16); ++k) { + f = _mm_slli_si128(f, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epi16(h, f); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + f = _mm_subs_epu16(f, gape); + if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; + } + } +end_loop8: + __max_8(imax, max); + if (imax >= a->T) { + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { + if (n_b == m_b) { + m_b = m_b? m_b<<1 : 8; + b = realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; + for (j = 0; LIKELY(j < slen); ++j) + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + } + S = H1; H1 = H0; H0 = S; + } + a->score = gmax; a->te = te; + { + int max = -1, low, high, qlen = slen * 8; + uint16_t *t = (uint16_t*)Hmax; + for (i = 0, a->qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen; + i = (a->score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0, a->score2 = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) + a->score2 = b[i]>>32, a->te2 = e; + } + } + free(b); + return a->score; +} + +int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) +{ + if (q->size == 1) return ksw_sse2_16(q, tlen, target, a); + else return ksw_sse2_8(q, tlen, target, a); +} + +/******************************************* + * Main function (not compiled by default) * + *******************************************/ + +#ifdef _KSW_MAIN + +#include +#include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +unsigned char seq_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +int main(int argc, char *argv[]) +{ + int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2; + int8_t mat[25]; + ksw_aux_t a; + gzFile fpt, fpq; + kseq_t *kst, *ksq; + // parse command line + a.gapo = 5; a.gape = 2; a.T = 10; + while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) { + switch (c) { + case 'a': sa = atoi(optarg); break; + case 'b': sb = atoi(optarg); break; + case 'q': a.gapo = atoi(optarg); break; + case 'r': a.gape = atoi(optarg); break; + case 't': a.T = atoi(optarg); break; + case 'f': forward_only = 1; break; + case 's': size = atoi(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] \n", size, sa, sb, a.gapo, a.gape); + return 1; + } + // initialize scoring matrix + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? sa : -sb; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; + // open file + fpt = gzopen(argv[optind], "r"); kst = kseq_init(fpt); + fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); + // all-pair alignment + while (kseq_read(ksq) > 0) { + ksw_query_t *q[2]; + for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; + q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); + if (!forward_only) { // reverse + for (i = 0; i < ksq->seq.l/2; ++i) { + int t = ksq->seq.s[i]; + ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i]; + ksq->seq.s[ksq->seq.l-1-i] = t; + } + for (i = 0; i < ksq->seq.l; ++i) + ksq->seq.s[i] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i]; + q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); + } else q[1] = 0; + gzrewind(fpt); kseq_rewind(kst); + while (kseq_read(kst) > 0) { + int s; + for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; + s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a); + printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); + if (q[1]) { + s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a); + printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); + } + } + free(q[0]); free(q[1]); + } + kseq_destroy(kst); gzclose(fpt); + kseq_destroy(ksq); gzclose(fpq); + return 0; +} +#endif diff --git a/ksw.h b/ksw.h new file mode 100644 index 0000000..d93d6a9 --- /dev/null +++ b/ksw.h @@ -0,0 +1,54 @@ +#ifndef __AC_KSW_H +#define __AC_KSW_H + +struct _ksw_query_t; +typedef struct _ksw_query_t ksw_query_t; + +typedef struct { + // input + unsigned gapo, gape; // the first gap costs gapo+gape + unsigned T; // threshold + // output + int score, te, qe, score2, te2; +} ksw_aux_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ + ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat); // to free, simply call free() + + /** + * Compute the maximum local score for queries initialized with ksw_qinit(1, ...) + * + * @param q Query data structure returned by ksw_qinit(1, ...) + * @param tlen Length of the target sequence + * @param target Target sequence + * @param a Auxiliary data structure (see ksw.h) + * + * @return The maximum local score; if the returned value equals 255, the SW may not be finished + */ + int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + + /** Compute the maximum local score for queries initialized with ksw_qinit(2, ...) */ + int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + + /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ + int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + +#ifdef __cplusplus +} +#endif + +#endif