From 6641788d38ef874d09f8a88a3eac3d9df5ea2aa3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 11:42:31 -0500 Subject: [PATCH 001/169] preparation for further changes --- bwt.c | 27 ++++++++++++++------------- bwt.h | 2 +- fastmap.c | 16 +++++++++------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/bwt.c b/bwt.c index 966b718..e46c125 100644 --- a/bwt.c +++ b/bwt.c @@ -277,7 +277,7 @@ static void bwt_reverse_intvs(bwtintv_v *p) } } -int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret; bwtintv_t ik, ok[4]; @@ -285,37 +285,38 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem mem->n = 0; if (q[x] > 3) return x + 1; + if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 kv_init(a[0]); kv_init(a[1]); - prev = tmpvec[0]? tmpvec[0] : &a[0]; - curr = tmpvec[1]? tmpvec[1] : &a[1]; - bwt_set_intv(bwt, q[x], ik); + prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided + curr = tmpvec && tmpvec[1]? 
tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base ik.info = x + 1; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] < 4) { - c = 3 - q[i]; + if (q[i] < 4) { // an A/C/G/T base + c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); if (ok[c].x[2] != ik.x[2]) // change of the interval size kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] == 0) break; // cannot be extended + if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further ik = ok[c]; ik.info = i + 1; } else { // an ambiguous base kv_push(bwtintv_t, *curr, ik); - break; // cannot be extended; in this case, ia[0].info; // this will be the returned value swap = curr; curr = prev; prev = swap; for (i = x - 1; i >= -1; --i) { // backward search for MEMs - if (q[i] > 3) break; + if (i >= 0 && q[i] > 3) break; // always stop at an ambiguous base as the FM-index does not have any. c = i < 0? 0 : q[i]; for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; bwt_extend(bwt, p, ok, 1); - if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further + if (ok[c].x[2] < min_intv || i == -1) { // keep the hit if reaching the beginning or not extended further if (curr->n == 0) { // curr->n to make sure there is no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; @@ -333,7 +334,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem } bwt_reverse_intvs(mem); // s.t. 
sorted by the start coordinate - if (tmpvec[0] == 0) free(a[0].a); - if (tmpvec[1] == 0) free(a[1].a); + if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a); + if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } diff --git a/bwt.h b/bwt.h index 5823f82..1eeaceb 100644 --- a/bwt.h +++ b/bwt.h @@ -121,7 +121,7 @@ extern "C" { * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. * Return the end of the longest exact match starting from _x_. */ - int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index 4d7a675..2c0a823 100644 --- a/fastmap.c +++ b/fastmap.c @@ -13,11 +13,11 @@ extern unsigned char nst_nt4_table[256]; typedef struct { const bwt_t *bwt; const uint8_t *query; - int start, len; + int start, len, min_intv; bwtintv_v *tmpvec[2], *matches; } smem_i; -smem_i *smem_iter_init(const bwt_t *bwt) +smem_i *smem_iter_init(const bwt_t *bwt, int min_intv) { smem_i *iter; iter = calloc(1, sizeof(smem_i)); @@ -25,6 +25,7 @@ smem_i *smem_iter_init(const bwt_t *bwt) iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); iter->matches = calloc(1, sizeof(bwtintv_v)); + iter->min_intv = min_intv > 0? 
min_intv : 1; return iter; } @@ -49,13 +50,13 @@ int smem_next(smem_i *iter) if (iter->start >= iter->len || iter->start < 0) return -1; while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases if (iter->start == iter->len) return -1; - iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec); + iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->min_intv, iter->matches, iter->tmpvec); return iter->start; } int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; kseq_t *seq; bwtint_t k; gzFile fp; @@ -63,15 +64,16 @@ int main_fastmap(int argc, char *argv[]) bntseq_t *bns; smem_i *iter; - while ((c = getopt(argc, argv, "w:l:s")) >= 0) { + while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) { switch (c) { case 's': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; + case 'm': min_intv = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] [-m minIntv=%d] \n", min_len, min_iwidth, min_intv); return 1; } @@ -86,7 +88,7 @@ int main_fastmap(int argc, char *argv[]) free(tmp); bns = bns_restore(argv[optind]); } - iter = smem_iter_init(bwt); + iter = smem_iter_init(bwt, min_intv); while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { From 543c719a54dc3cb13ab85a4a604507368c5e0fa1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 11:53:07 -0500 Subject: [PATCH 002/169] fixed a couple of unimportant bugs in SMEM --- bwt.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bwt.c b/bwt.c index e46c125..c4a008b 100644 --- a/bwt.c +++ b/bwt.c @@ 
-311,20 +311,19 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, swap = curr; curr = prev; prev = swap; for (i = x - 1; i >= -1; --i) { // backward search for MEMs - if (i >= 0 && q[i] > 3) break; // always stop at an ambiguous base as the FM-index does not have any. - c = i < 0? 0 : q[i]; + c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; bwt_extend(bwt, p, ok, 1); - if (ok[c].x[2] < min_intv || i == -1) { // keep the hit if reaching the beginning or not extended further - if (curr->n == 0) { // curr->n to make sure there is no longer matches + if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough + if (curr->n == 0) { // test curr->n>0 to make sure there is no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; kv_push(bwtintv_t, *mem, ik); } } // otherwise the match is contained in another longer match } - if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + if (c >= 0 && (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]))) { ok[c].info = p->info; kv_push(bwtintv_t, *curr, ok[c]); } From 6de74888fd7b2d922f05a4dcbd0d1dfd125218cc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 12:12:58 -0500 Subject: [PATCH 003/169] bugfix: min_intv not working in SMEM --- bwt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt.c b/bwt.c index c4a008b..689d8f8 100644 --- a/bwt.c +++ b/bwt.c @@ -323,7 +323,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, } } // otherwise the match is contained in another longer match } - if (c >= 0 && (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]))) { + if (c >= 0 && ok[c].x[2] >= min_intv && (curr->n == 0 || 
ok[c].x[2] != curr->a[curr->n-1].x[2])) { ok[c].info = p->info; kv_push(bwtintv_t, *curr, ok[c]); } From 5a4a0c4173805169f88a7e8d014176579da1adda Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 12:34:05 -0500 Subject: [PATCH 004/169] a bit refactoring for further changes --- bwt.c | 41 +++++++++++++++++++++++++++++++++++++++ bwt.h | 14 ++++++++++++++ fastmap.c | 58 +++++++------------------------------------------------ 3 files changed, 62 insertions(+), 51 deletions(-) diff --git a/bwt.c b/bwt.c index 689d8f8..fe8007f 100644 --- a/bwt.c +++ b/bwt.c @@ -337,3 +337,44 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } + +/*************************** + * SMEM iterator interface * + ***************************/ + +smem_i *smem_itr_init(const bwt_t *bwt) +{ + smem_i *itr; + itr = calloc(1, sizeof(smem_i)); + itr->bwt = bwt; + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + return itr; +} + +void smem_itr_destroy(smem_i *itr) +{ + free(itr->tmpvec[0]->a); + free(itr->tmpvec[1]->a); + free(itr->matches->a); + free(itr); +} + +void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query) +{ + itr->query = query; + itr->start = 0; + itr->len = len; + itr->min_intv = min_intv; +} + +int smem_next(smem_i *itr) +{ + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; + if (itr->start >= itr->len || itr->start < 0) return -1; + while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases + if (itr->start == itr->len) return -1; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec); + return itr->start; +} diff --git a/bwt.h b/bwt.h index 1eeaceb..67a256d 100644 --- a/bwt.h +++ b/bwt.h @@ -60,6 +60,13 @@ typedef struct { typedef struct { size_t n, m; 
bwtintv_t *a; } bwtintv_v; +typedef struct { + const bwt_t *bwt; + const uint8_t *query; + int start, len, min_intv; + bwtintv_v *tmpvec[2], *matches; +} smem_i; + /* For general OCC_INTERVAL, the following is correct: #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) @@ -123,6 +130,13 @@ extern "C" { */ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + // SMEM iterator interface + + smem_i *smem_itr_init(const bwt_t *bwt); + void smem_itr_destroy(smem_i *itr); + void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query); + int smem_next(smem_i *itr); + #ifdef __cplusplus } #endif diff --git a/fastmap.c b/fastmap.c index 2c0a823..17db06d 100644 --- a/fastmap.c +++ b/fastmap.c @@ -10,50 +10,6 @@ KSEQ_INIT(gzFile, gzread) extern unsigned char nst_nt4_table[256]; -typedef struct { - const bwt_t *bwt; - const uint8_t *query; - int start, len, min_intv; - bwtintv_v *tmpvec[2], *matches; -} smem_i; - -smem_i *smem_iter_init(const bwt_t *bwt, int min_intv) -{ - smem_i *iter; - iter = calloc(1, sizeof(smem_i)); - iter->bwt = bwt; - iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - iter->matches = calloc(1, sizeof(bwtintv_v)); - iter->min_intv = min_intv > 0? 
min_intv : 1; - return iter; -} - -void smem_iter_destroy(smem_i *iter) -{ - free(iter->tmpvec[0]->a); - free(iter->tmpvec[1]->a); - free(iter->matches->a); - free(iter); -} - -void smem_set_query(smem_i *iter, int len, const uint8_t *query) -{ - iter->query = query; - iter->start = 0; - iter->len = len; -} - -int smem_next(smem_i *iter) -{ - iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0; - if (iter->start >= iter->len || iter->start < 0) return -1; - while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases - if (iter->start == iter->len) return -1; - iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->min_intv, iter->matches, iter->tmpvec); - return iter->start; -} - int main_fastmap(int argc, char *argv[]) { int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; @@ -62,7 +18,7 @@ int main_fastmap(int argc, char *argv[]) gzFile fp; bwt_t *bwt; bntseq_t *bns; - smem_i *iter; + smem_i *itr; while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) { switch (c) { @@ -88,7 +44,7 @@ int main_fastmap(int argc, char *argv[]) free(tmp); bns = bns_restore(argv[optind]); } - iter = smem_iter_init(bwt, min_intv); + itr = smem_itr_init(bwt); while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { @@ -97,10 +53,10 @@ int main_fastmap(int argc, char *argv[]) } else putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s); - while (smem_next(iter) > 0) { - for (i = 0; i < iter->matches->n; ++i) { - bwtintv_t *p = &iter->matches->a[i]; + smem_set_query(itr, min_intv, seq->seq.l, (uint8_t*)seq->seq.s); + while (smem_next(itr) > 0) { + for (i = 0; i < itr->matches->n; ++i) { + bwtintv_t *p = &itr->matches->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, 
(long)p->x[2]); if (p->x[2] <= min_iwidth) { @@ -120,7 +76,7 @@ int main_fastmap(int argc, char *argv[]) puts("//"); } - smem_iter_destroy(iter); + smem_itr_destroy(itr); bns_destroy(bns); bwt_destroy(bwt); kseq_destroy(seq); From 91debf412b59135bdf5d45c2772e72ce969ca9f1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 13:59:48 -0500 Subject: [PATCH 005/169] move smem iterators to bwamem.{c,h} --- Makefile | 5 ++++- bwamem.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwamem.h | 33 +++++++++++++++++++++++++++++++++ bwt.c | 41 ----------------------------------------- bwt.h | 12 ------------ fastmap.c | 23 +++++++++++++++++++++++ main.c | 2 ++ main.h | 1 + 8 files changed, 117 insertions(+), 54 deletions(-) create mode 100644 bwamem.c create mode 100644 bwamem.h diff --git a/Makefile b/Makefile index 6f388f2..04fd7a0 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \ +LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o stdaln.o \ bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ @@ -45,5 +45,8 @@ bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_main.o:bwtsw2.h +bwamem.o:bwamem.h +fastmap.o:bwt.h bwamem.h + clean: rm -f gmon.out *.o a.out $(PROG) *~ *.a diff --git a/bwamem.c b/bwamem.c new file mode 100644 index 0000000..91931bb --- /dev/null +++ b/bwamem.c @@ -0,0 +1,54 @@ +#include +#include "bwamem.h" + +memopt_t *mem_opt_init() +{ + memopt_t *o; + o = calloc(1, sizeof(memopt_t)); + o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100; + o->min_seed_len = 17; + o->max_occ = 10; + return o; +} + +/*************************** + * SMEM iterator interface * + ***************************/ + +smem_i 
*smem_itr_init(const bwt_t *bwt) +{ + smem_i *itr; + itr = calloc(1, sizeof(smem_i)); + itr->bwt = bwt; + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + return itr; +} + +void smem_itr_destroy(smem_i *itr) +{ + free(itr->tmpvec[0]->a); + free(itr->tmpvec[1]->a); + free(itr->matches->a); + free(itr); +} + +void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query) +{ + itr->query = query; + itr->start = 0; + itr->len = len; + itr->min_intv = min_intv; +} + +int smem_next(smem_i *itr) +{ + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; + if (itr->start >= itr->len || itr->start < 0) return -1; + while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases + if (itr->start == itr->len) return -1; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec); + return itr->start; +} + diff --git a/bwamem.h b/bwamem.h new file mode 100644 index 0000000..da636a0 --- /dev/null +++ b/bwamem.h @@ -0,0 +1,33 @@ +#ifndef BWAMEM_H_ +#define BWAMEM_H_ + +#include "bwt.h" + +typedef struct { + const bwt_t *bwt; + const uint8_t *query; + int start, len, min_intv; + bwtintv_v *tmpvec[2], *matches; +} smem_i; + +typedef struct { + int a, b, q, r, w; + int min_seed_len, max_occ; +} memopt_t; + +#ifdef __cplusplus +extern "C" { +#endif + +smem_i *smem_itr_init(const bwt_t *bwt); +void smem_itr_destroy(smem_i *itr); +void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query); +int smem_next(smem_i *itr); + +memopt_t *mem_opt_init(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwt.c b/bwt.c index fe8007f..689d8f8 100644 --- a/bwt.c +++ b/bwt.c @@ -337,44 +337,3 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } - -/*************************** - 
* SMEM iterator interface * - ***************************/ - -smem_i *smem_itr_init(const bwt_t *bwt) -{ - smem_i *itr; - itr = calloc(1, sizeof(smem_i)); - itr->bwt = bwt; - itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - itr->matches = calloc(1, sizeof(bwtintv_v)); - return itr; -} - -void smem_itr_destroy(smem_i *itr) -{ - free(itr->tmpvec[0]->a); - free(itr->tmpvec[1]->a); - free(itr->matches->a); - free(itr); -} - -void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query) -{ - itr->query = query; - itr->start = 0; - itr->len = len; - itr->min_intv = min_intv; -} - -int smem_next(smem_i *itr) -{ - itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; - if (itr->start >= itr->len || itr->start < 0) return -1; - while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases - if (itr->start == itr->len) return -1; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec); - return itr->start; -} diff --git a/bwt.h b/bwt.h index 67a256d..2aab9d1 100644 --- a/bwt.h +++ b/bwt.h @@ -60,13 +60,6 @@ typedef struct { typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; -typedef struct { - const bwt_t *bwt; - const uint8_t *query; - int start, len, min_intv; - bwtintv_v *tmpvec[2], *matches; -} smem_i; - /* For general OCC_INTERVAL, the following is correct: #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) @@ -132,11 +125,6 @@ extern "C" { // SMEM iterator interface - smem_i *smem_itr_init(const bwt_t *bwt); - void smem_itr_destroy(smem_i *itr); - void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query); - int smem_next(smem_i *itr); - #ifdef __cplusplus } 
#endif diff --git a/fastmap.c b/fastmap.c index 17db06d..6a41aeb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -4,12 +4,35 @@ #include #include "bntseq.h" #include "bwt.h" +#include "bwamem.h" #include "kvec.h" #include "kseq.h" KSEQ_INIT(gzFile, gzread) extern unsigned char nst_nt4_table[256]; +int main_mem(int argc, char *argv[]) +{ + memopt_t *opt; + bwt_t *bwt; + bntseq_t *bns; + int c; + + opt = mem_opt_init(); + while ((c = getopt(argc, argv, "")) >= 0) { + } + if (optind + 1 >= argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n"); + fprintf(stderr, "\n"); + free(opt); + return 1; + } + + free(opt); + return 0; +} + int main_fastmap(int argc, char *argv[]) { int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; diff --git a/main.c b/main.c index 73cbcd9..2718732 100644 --- a/main.c +++ b/main.c @@ -20,6 +20,7 @@ static int usage() fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); @@ -59,6 +60,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); + else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/main.h b/main.h index 026a80b..1a0292a 100644 --- a/main.h +++ b/main.h @@ -22,6 +22,7 @@ extern "C" { int bwa_bwtsw2(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]); + int main_mem(int argc, char *argv[]); #ifdef __cplusplus } From 6c19c9640ce1df655523bbbb1c41676c454a3c20 
Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 15:55:22 -0500 Subject: [PATCH 006/169] code backup --- bwamem.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwamem.h | 17 ++++++++++++- 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 91931bb..52c4a18 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1,4 +1,6 @@ #include +#include +#include #include "bwamem.h" memopt_t *mem_opt_init() @@ -8,6 +10,7 @@ memopt_t *mem_opt_init() o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100; o->min_seed_len = 17; o->max_occ = 10; + o->max_chain_gap = 10000; return o; } @@ -52,3 +55,75 @@ int smem_next(smem_i *itr) return itr->start; } +#include "kbtree.h" + +#define chain_lt(a, b) ((a).pos < (b).pos) +KBTREE_INIT(chn, memchain1_t, chain_lt) + +static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *p) +{ + int64_t qend, rend, x, y; + const memseed_t *last = &c->seeds[c->n-1]; + qend = last->qbeg + last->len; + rend = last->rbeg + last->len; + if (p->qbeg > c->seeds[0].qbeg && p->qbeg + p->len < qend && p->rbeg > c->seeds[0].rbeg && p->rbeg + p->len < rend) + return 1; // contained seed; do nothing + x = p->qbeg - last->qbeg; // always positive + y = p->rbeg - last->rbeg; + if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { + if (c->n == c->m) { + c->m <<= 1; + c->seeds = realloc(c->seeds, c->m * sizeof(memseed_t)); + } + c->seeds[c->n++] = *p; + return 1; + } + return 0; +} + +void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) +{ + while (smem_next(itr) > 0) { + int i; + for (i = 0; i < itr->matches->n; ++i) { + bwtintv_t *p = &itr->matches->a[i]; + int slen = (uint32_t)p->info - (p->info>>32); // seed length + int64_t k; + if (slen >= opt->min_seed_len || p->x[2] > opt->max_occ) continue; + for (k = 0; k < p->x[2]; ++k) { + memchain1_t tmp, *lower, *upper; + memseed_t c1; + int to_add = 0; + 
c1.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); + c1.qbeg = p->info>>32; + c1.len = slen; + if (kb_size(tree)) { + kb_intervalp(chn, tree, &tmp, &lower, &upper); + if (!test_and_merge(opt, lower, &c1)) to_add = 1; + } to_add = 1; + if (to_add) { + tmp.n = 1; tmp.m = 4; + tmp.seeds = calloc(tmp.m, sizeof(memseed_t)); + kb_putp(chn, tree, &tmp); + } + } + } + } +} + +memchain_t mem_collect_seed(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +{ + memchain_t chain; + smem_i *itr; + kbtree_t(chn) *tree; + + memset(&chain, 0, sizeof(memchain_t)); + if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match + tree = kb_init(chn, KB_DEFAULT_SIZE); + itr = smem_itr_init(bwt); + smem_set_query(itr, 1, len, seq); + + smem_itr_destroy(itr); + kb_destroy(chn, tree); + return chain; +} diff --git a/bwamem.h b/bwamem.h index da636a0..cc86ef2 100644 --- a/bwamem.h +++ b/bwamem.h @@ -10,11 +10,26 @@ typedef struct { bwtintv_v *tmpvec[2], *matches; } smem_i; +typedef struct { + int64_t qbeg, rbeg, len; +} memseed_t; + typedef struct { int a, b, q, r, w; - int min_seed_len, max_occ; + int min_seed_len, max_occ, max_chain_gap; } memopt_t; +typedef struct { + int n, m; + int64_t pos; + memseed_t *seeds; +} memchain1_t; + +typedef struct { + int n, m; + memchain1_t *chains; +} memchain_t; + #ifdef __cplusplus extern "C" { #endif From 89777374606f3626e88933251f6ca45d703055ca Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 16:26:05 -0500 Subject: [PATCH 007/169] basic chaining working Definitely suboptimal in a lot of corner cases... 
--- bwamem.c | 27 ++++++++++++++++++--------- bwamem.h | 5 ++++- fastmap.c | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index 52c4a18..2a35fe7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -81,7 +81,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * return 0; } -void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) +static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { while (smem_next(itr) > 0) { int i; @@ -89,21 +89,22 @@ void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) bwtintv_t *p = &itr->matches->a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; - if (slen >= opt->min_seed_len || p->x[2] > opt->max_occ) continue; + if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; for (k = 0; k < p->x[2]; ++k) { memchain1_t tmp, *lower, *upper; - memseed_t c1; + memseed_t s; int to_add = 0; - c1.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); - c1.qbeg = p->info>>32; - c1.len = slen; + s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); + s.qbeg = p->info>>32; + s.len = slen; if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); - if (!test_and_merge(opt, lower, &c1)) to_add = 1; - } to_add = 1; + if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1; + } else to_add = 1; if (to_add) { tmp.n = 1; tmp.m = 4; tmp.seeds = calloc(tmp.m, sizeof(memseed_t)); + tmp.seeds[0] = s; kb_putp(chn, tree, &tmp); } } @@ -111,7 +112,7 @@ void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) } } -memchain_t mem_collect_seed(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) { memchain_t chain; smem_i *itr; @@ -122,6 +123,14 @@ memchain_t mem_collect_seed(const memopt_t *opt, const bwt_t *bwt, int len, cons tree = 
kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); smem_set_query(itr, 1, len, seq); + mem_insert_seed(opt, tree, itr); + + chain.m = kb_size(tree); chain.n = 0; + chain.chains = malloc(chain.m * sizeof(memchain1_t)); + + #define traverse_func(p_) (chain.chains[chain.n++] = *(p_)) + __kb_traverse(memchain1_t, tree, traverse_func); + #undef traverse_func smem_itr_destroy(itr); kb_destroy(chn, tree); diff --git a/bwamem.h b/bwamem.h index cc86ef2..72d9557 100644 --- a/bwamem.h +++ b/bwamem.h @@ -11,7 +11,8 @@ typedef struct { } smem_i; typedef struct { - int64_t qbeg, rbeg, len; + int64_t rbeg; + int32_t qbeg, len; } memseed_t; typedef struct { @@ -41,6 +42,8 @@ int smem_next(smem_i *itr); memopt_t *mem_opt_init(void); +memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); + #ifdef __cplusplus } #endif diff --git a/fastmap.c b/fastmap.c index 6a41aeb..85ccd5c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -16,7 +16,9 @@ int main_mem(int argc, char *argv[]) memopt_t *opt; bwt_t *bwt; bntseq_t *bns; - int c; + int i, j, c; + gzFile *fp; + kseq_t *seq; opt = mem_opt_init(); while ((c = getopt(argc, argv, "")) >= 0) { @@ -28,6 +30,38 @@ int main_mem(int argc, char *argv[]) free(opt); return 1; } + fp = gzopen(argv[optind + 1], "r"); + seq = kseq_init(fp); + { // load the packed sequences, BWT and SA + char *tmp = calloc(strlen(argv[optind]) + 5, 1); + strcat(strcpy(tmp, argv[optind]), ".bwt"); + bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, argv[optind]), ".sa"); + bwt_restore_sa(tmp, bwt); + free(tmp); + bns = bns_restore(argv[optind]); + } + while (kseq_read(seq) >= 0) { + memchain_t chain; + printf(">%s\n", seq->name.s); + for (i = 0; i < seq->seq.l; ++i) + seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; + chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); + for (i = 0; i < chain.n; ++i) { + memchain1_t *p = &chain.chains[i]; + printf("%d\t%d", i, p->n); + for (j = 0; j < p->n; ++j) { + bwtint_t pos; + int is_rev, 
ref_id; + pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); + if (is_rev) pos -= p->seeds[j].len - 1; + bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); + printf("\t%d,%d,%s:%c%ld", p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + } + putchar('\n'); + } + puts("//"); + } free(opt); return 0; From 5d372cef65ca2a3550629b4a1428b1b256133738 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 16:39:24 -0500 Subject: [PATCH 008/169] bugfix: wrong B-tree comparison --- bwamem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 2a35fe7..99b604e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -57,8 +57,8 @@ int smem_next(smem_i *itr) #include "kbtree.h" -#define chain_lt(a, b) ((a).pos < (b).pos) -KBTREE_INIT(chn, memchain1_t, chain_lt) +#define chain_cmp(a, b) ((a).pos - (b).pos) +KBTREE_INIT(chn, memchain1_t, chain_cmp) static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *p) { @@ -66,7 +66,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * const memseed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; - if (p->qbeg > c->seeds[0].qbeg && p->qbeg + p->len < qend && p->rbeg > c->seeds[0].rbeg && p->rbeg + p->len < rend) + if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing x = p->qbeg - last->qbeg; // always positive y = p->rbeg - last->rbeg; From 86f2e134ba40ff7eb04d3cc878bebbaa892788db Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 12:57:48 -0500 Subject: [PATCH 009/169] no effective changes --- bwt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bwt.c b/bwt.c index 689d8f8..32dfc43 100644 --- a/bwt.c +++ b/bwt.c @@ -296,9 +296,10 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int 
min_intv, if (q[i] < 4) { // an A/C/G/T base c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); - if (ok[c].x[2] != ik.x[2]) // change of the interval size + if (ok[c].x[2] != ik.x[2]) { // change of the interval size kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further + if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further + } ik = ok[c]; ik.info = i + 1; } else { // an ambiguous base kv_push(bwtintv_t, *curr, ik); From abc675f2786d9ba2879698ee36727ccdb461ef19 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 13:14:16 -0500 Subject: [PATCH 010/169] typo in comments --- bwt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt.c b/bwt.c index 32dfc43..2903daa 100644 --- a/bwt.c +++ b/bwt.c @@ -317,7 +317,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_t *p = &prev->a[j]; bwt_extend(bwt, p, ok, 1); if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough - if (curr->n == 0) { // test curr->n>0 to make sure there is no longer matches + if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; kv_push(bwtintv_t, *mem, ik); From 620ad6e5b9de135accd4bcd97f42033846293ec5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 14:20:38 -0500 Subject: [PATCH 011/169] reseed long SMEMs --- bwamem.c | 40 ++++++++++++++++++++++++++++++++++------ bwamem.h | 9 +++++---- fastmap.c | 14 +++++++------- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/bwamem.c b/bwamem.c index 99b604e..0ecb190 100644 --- a/bwamem.c +++ b/bwamem.c @@ -2,6 +2,7 @@ #include #include #include "bwamem.h" +#include "kvec.h" memopt_t *mem_opt_init() { @@ -26,6 +27,7 @@ smem_i 
*smem_itr_init(const bwt_t *bwt) itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); itr->matches = calloc(1, sizeof(bwtintv_v)); + itr->sub = calloc(1, sizeof(bwtintv_v)); return itr; } @@ -34,24 +36,50 @@ void smem_itr_destroy(smem_i *itr) free(itr->tmpvec[0]->a); free(itr->tmpvec[1]->a); free(itr->matches->a); + free(itr->sub->a); free(itr); } -void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query) +void smem_set_query(smem_i *itr, int len, const uint8_t *query) { itr->query = query; itr->start = 0; itr->len = len; - itr->min_intv = min_intv; } -int smem_next(smem_i *itr) +int smem_next(smem_i *itr, int split_len) { + int i, max, max_i; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; if (itr->start >= itr->len || itr->start < 0) return -1; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return -1; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec); + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec); + if (itr->matches->n == 0) return itr->start; + for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { + bwtintv_t *p = &itr->matches->a[i]; + int len = (uint32_t)p->info - (p->info>>32); + if (max < len) max = len, max_i = i; + } + if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { + int j; + bwtintv_v *a = itr->tmpvec[0]; + bwtintv_t *p = &itr->matches->a[max_i]; + bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest match + i = j = 0; a->n = 0; + while (i < itr->matches->n && j < itr->sub->n) { // ordered merge + if (itr->matches->a[i].info < itr->sub->a[j].info) { + kv_push(bwtintv_t, *a, itr->matches->a[i]); + ++i; + } else { + kv_push(bwtintv_t, *a, itr->sub->a[j]); + ++j; + } 
+ } + for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]); + for (; j < itr->sub->n; ++j) kv_push(bwtintv_t, *a, itr->sub->a[j]); + kv_copy(bwtintv_t, *itr->matches, *a); + } return itr->start; } @@ -83,7 +111,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { - while (smem_next(itr) > 0) { + while (smem_next(itr, opt->min_seed_len<<1) > 0) { int i; for (i = 0; i < itr->matches->n; ++i) { bwtintv_t *p = &itr->matches->a[i]; @@ -122,7 +150,7 @@ memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8 if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); - smem_set_query(itr, 1, len, seq); + smem_set_query(itr, len, seq); mem_insert_seed(opt, tree, itr); chain.m = kb_size(tree); chain.n = 0; diff --git a/bwamem.h b/bwamem.h index 72d9557..1ad9e77 100644 --- a/bwamem.h +++ b/bwamem.h @@ -6,8 +6,9 @@ typedef struct { const bwt_t *bwt; const uint8_t *query; - int start, len, min_intv; - bwtintv_v *tmpvec[2], *matches; + int start, len; + bwtintv_v *matches; // matches + bwtintv_v *tmpvec[2], *sub; // these are temporary arrays } smem_i; typedef struct { @@ -37,8 +38,8 @@ extern "C" { smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); -void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query); -int smem_next(smem_i *itr); +void smem_set_query(smem_i *itr, int len, const uint8_t *query); +int smem_next(smem_i *itr, int split_len); memopt_t *mem_opt_init(void); diff --git a/fastmap.c b/fastmap.c index 85ccd5c..6e8a662 100644 --- a/fastmap.c +++ b/fastmap.c @@ -69,7 +69,7 @@ int main_mem(int argc, char *argv[]) int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; + int c, i, min_iwidth = 20, 
min_len = 17, print_seq = 0, split_long = 0; kseq_t *seq; bwtint_t k; gzFile fp; @@ -77,16 +77,16 @@ int main_fastmap(int argc, char *argv[]) bntseq_t *bns; smem_i *itr; - while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) { + while ((c = getopt(argc, argv, "w:l:ps")) >= 0) { switch (c) { - case 's': print_seq = 1; break; + case 's': split_long = 1; break; + case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; - case 'm': min_intv = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] [-m minIntv=%d] \n", min_len, min_iwidth, min_intv); + fprintf(stderr, "Usage: bwa fastmap [-ps] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); return 1; } @@ -110,8 +110,8 @@ int main_fastmap(int argc, char *argv[]) } else putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - smem_set_query(itr, min_intv, seq->seq.l, (uint8_t*)seq->seq.s); - while (smem_next(itr) > 0) { + smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); + while (smem_next(itr, split_long? 
min_len<<1 : 0) > 0) { for (i = 0; i < itr->matches->n; ++i) { bwtintv_t *p = &itr->matches->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; From f8f3b7577a7112d96682538c2ca8e2428d1469fc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 14:38:44 -0500 Subject: [PATCH 012/169] code cleanup; added a missing file --- bwamem.c | 50 ++++--- bwamem.h | 11 +- fastmap.c | 7 +- kbtree.h | 384 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 421 insertions(+), 31 deletions(-) create mode 100644 kbtree.h diff --git a/bwamem.c b/bwamem.c index 0ecb190..65807fe 100644 --- a/bwamem.c +++ b/bwamem.c @@ -19,6 +19,15 @@ memopt_t *mem_opt_init() * SMEM iterator interface * ***************************/ +struct __smem_i { + const bwt_t *bwt; + const uint8_t *query; + int start, len; + bwtintv_v *matches; // matches; to be returned by smem_next() + bwtintv_v *sub; // sub-matches inside the longest match; temporary + bwtintv_v *tmpvec[2]; // temporary arrays +}; + smem_i *smem_itr_init(const bwt_t *bwt) { smem_i *itr; @@ -47,25 +56,25 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } -int smem_next(smem_i *itr, int split_len) +const bwtintv_v *smem_next(smem_i *itr, int split_len) { int i, max, max_i; - itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; - if (itr->start >= itr->len || itr->start < 0) return -1; + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; + if (itr->start >= itr->len || itr->start < 0) return 0; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases - if (itr->start == itr->len) return -1; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec); - if (itr->matches->n == 0) return itr->start; - for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { + if (itr->start == itr->len) return 0; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, 
itr->matches, itr->tmpvec); // search for SMEM + if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here + for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match bwtintv_t *p = &itr->matches->a[i]; int len = (uint32_t)p->info - (p->info>>32); if (max < len) max = len, max_i = i; } - if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { + if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { // if the longest SMEM is unique and long int j; - bwtintv_v *a = itr->tmpvec[0]; + bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging bwtintv_t *p = &itr->matches->a[max_i]; - bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest match + bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM i = j = 0; a->n = 0; while (i < itr->matches->n && j < itr->sub->n) { // ordered merge if (itr->matches->a[i].info < itr->sub->a[j].info) { @@ -80,7 +89,7 @@ int smem_next(smem_i *itr, int split_len) for (; j < itr->sub->n; ++j) kv_push(bwtintv_t, *a, itr->sub->a[j]); kv_copy(bwtintv_t, *itr->matches, *a); } - return itr->start; + return itr->matches; } #include "kbtree.h" @@ -98,7 +107,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * return 1; // contained seed; do nothing x = p->qbeg - last->qbeg; // always positive y = p->rbeg - last->rbeg; - if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { + if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; c->seeds = realloc(c->seeds, c->m * sizeof(memseed_t)); @@ -106,30 +115,31 @@ static int 
test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * c->seeds[c->n++] = *p; return 1; } - return 0; + return 0; // request to add a new chain } static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { - while (smem_next(itr, opt->min_seed_len<<1) > 0) { + const bwtintv_v *a; + while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM int i; - for (i = 0; i < itr->matches->n; ++i) { - bwtintv_t *p = &itr->matches->a[i]; + for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start + bwtintv_t *p = &a->a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; - if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; + if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive for (k = 0; k < p->x[2]; ++k) { memchain1_t tmp, *lower, *upper; memseed_t s; int to_add = 0; - s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); + s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; if (kb_size(tree)) { - kb_intervalp(chn, tree, &tmp, &lower, &upper); + kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1; } else to_add = 1; - if (to_add) { + if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; tmp.seeds = calloc(tmp.m, sizeof(memseed_t)); tmp.seeds[0] = s; diff --git a/bwamem.h b/bwamem.h index 1ad9e77..eb79586 100644 --- a/bwamem.h +++ b/bwamem.h @@ -3,13 +3,8 @@ #include "bwt.h" -typedef struct { - const bwt_t *bwt; - const uint8_t *query; - int start, len; - bwtintv_v *matches; // matches - bwtintv_v *tmpvec[2], *sub; // these are temporary arrays -} smem_i; +struct __smem_i; +typedef struct __smem_i smem_i; typedef struct { int64_t rbeg; @@ -39,7 +34,7 @@ extern "C" { smem_i *smem_itr_init(const bwt_t *bwt); 
void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); -int smem_next(smem_i *itr, int split_len); +const bwtintv_v *smem_next(smem_i *itr, int split_len); memopt_t *mem_opt_init(void); diff --git a/fastmap.c b/fastmap.c index 6e8a662..42122b4 100644 --- a/fastmap.c +++ b/fastmap.c @@ -76,6 +76,7 @@ int main_fastmap(int argc, char *argv[]) bwt_t *bwt; bntseq_t *bns; smem_i *itr; + const bwtintv_v *a; while ((c = getopt(argc, argv, "w:l:ps")) >= 0) { switch (c) { @@ -111,9 +112,9 @@ int main_fastmap(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); - while (smem_next(itr, split_long? min_len<<1 : 0) > 0) { - for (i = 0; i < itr->matches->n; ++i) { - bwtintv_t *p = &itr->matches->a[i]; + while ((a = smem_next(itr, split_long? min_len<<1 : 0)) != 0) { + for (i = 0; i < a->n; ++i) { + bwtintv_t *p = &a->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); if (p->x[2] <= min_iwidth) { diff --git a/kbtree.h b/kbtree.h new file mode 100644 index 0000000..5ed5330 --- /dev/null +++ b/kbtree.h @@ -0,0 +1,384 @@ +/*- + * Copyright 1997-1999, 2001, John-Mark Gurney. + * 2008-2009, Attractive Chaos + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef __AC_KBTREE_H +#define __AC_KBTREE_H + +#include +#include +#include + +typedef struct { + int32_t is_internal:1, n:31; +} kbnode_t; + +#define __KB_KEY(type, x) ((type*)((char*)x + 4)) +#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr)) + +#define __KB_TREE_T(name) \ + typedef struct { \ + kbnode_t *root; \ + int off_key, off_ptr, ilen, elen; \ + int n, t; \ + int n_keys, n_nodes; \ + } kbtree_##name##_t; + +#define __KB_INIT(name, key_t) \ + kbtree_##name##_t *kb_init_##name(int size) \ + { \ + kbtree_##name##_t *b; \ + b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ + b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ + if (b->t < 2) { \ + free(b); return 0; \ + } \ + b->n = 2 * b->t - 1; \ + b->off_ptr = 4 + b->n * sizeof(key_t); \ + b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ + b->elen = (b->off_ptr + 3) >> 2 << 2; \ + b->root = (kbnode_t*)calloc(1, b->ilen); \ + ++b->n_nodes; \ + return b; \ + } + +#define __kb_destroy(b) do { \ + int i, max = 8; \ + kbnode_t *x, **top, **stack = 0; \ + if (b) { \ + top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \ + *top++ = 
(b)->root; \ + while (top != stack) { \ + x = *--top; \ + if (x->is_internal == 0) { free(x); continue; } \ + for (i = 0; i <= x->n; ++i) \ + if (__KB_PTR(b, x)[i]) { \ + if (top - stack == max) { \ + max <<= 1; \ + stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \ + top = stack + (max>>1); \ + } \ + *top++ = __KB_PTR(b, x)[i]; \ + } \ + free(x); \ + } \ + } \ + free(b); free(stack); \ + } while (0) + +#define __kb_get_first(key_t, b, ret) do { \ + kbnode_t *__x = (b)->root; \ + while (__KB_PTR(b, __x)[0] != 0) \ + __x = __KB_PTR(b, __x)[0]; \ + (ret) = __KB_KEY(key_t, __x)[0]; \ + } while (0) + +#define __KB_GET_AUX0(name, key_t, __cmp) \ + static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin, end, n = x->n >> 1; \ + if (x->n == 0) return -1; \ + if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \ + begin = 0; end = n; \ + } else { begin = n; end = x->n - 1; } \ + rr = r? r : &tr; \ + n = end; \ + while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \ + return n; \ + } + +#define __KB_GET_AUX1(name, key_t, __cmp) \ + static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin = 0, end = x->n; \ + if (x->n == 0) return -1; \ + rr = r? 
r : &tr; \ + while (begin < end) { \ + int mid = (begin + end) >> 1; \ + if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \ + else end = mid; \ + } \ + if (begin == x->n) { *rr = 1; return x->n - 1; } \ + if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \ + return begin; \ + } + +#define __KB_GET(name, key_t) \ + static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \ + if (x->is_internal == 0) return 0; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + return 0; \ + } \ + static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_getp_##name(b, &k); \ + } + +#define __KB_INTERVAL(name, key_t) \ + static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + *lower = *upper = 0; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) { \ + *lower = *upper = &__KB_KEY(key_t, x)[i]; \ + return; \ + } \ + if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \ + if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \ + if (x->is_internal == 0) return; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + } \ + static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \ + { \ + kb_intervalp_##name(b, &k, lower, upper); \ + } + +#define __KB_PUT(name, key_t, __cmp) \ + /* x must be an internal node */ \ + static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \ + { \ + kbnode_t *z; \ + z = (kbnode_t*)calloc(1, y->is_internal? 
b->ilen : b->elen); \ + ++b->n_nodes; \ + z->is_internal = y->is_internal; \ + z->n = b->t - 1; \ + memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \ + if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \ + y->n = b->t - 1; \ + memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \ + __KB_PTR(b, x)[i + 1] = z; \ + memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \ + ++x->n; \ + } \ + static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \ + { \ + int i = x->n - 1; \ + if (x->is_internal == 0) { \ + i = __kb_getp_aux_##name(x, k, 0); \ + if (i != x->n - 1) \ + memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + __KB_KEY(key_t, x)[i + 1] = *k; \ + ++x->n; \ + } else { \ + i = __kb_getp_aux_##name(x, k, 0) + 1; \ + if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \ + __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \ + if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \ + } \ + __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \ + } \ + } \ + static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *r, *s; \ + ++b->n_keys; \ + r = b->root; \ + if (r->n == 2 * b->t - 1) { \ + ++b->n_nodes; \ + s = (kbnode_t*)calloc(1, b->ilen); \ + b->root = s; s->is_internal = 1; s->n = 0; \ + __KB_PTR(b, s)[0] = r; \ + __kb_split_##name(b, s, 0, r); \ + r = s; \ + } \ + __kb_putp_aux_##name(b, r, k); \ + } \ + static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + kb_putp_##name(b, &k); \ + } + + +#define __KB_DEL(name, key_t) \ + static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \ + { \ + int yn, zn, i, r = 0; \ + kbnode_t *xp, *y, *z; \ + key_t kp; \ + if (x == 0) return *k; \ + if (s) { /* 
s can only be 0, 1 or 2 */ \ + r = x->is_internal == 0? 0 : s == 1? 1 : -1; \ + i = s == 1? x->n - 1 : -1; \ + } else i = __kb_getp_aux_##name(x, k, &r); \ + if (x->is_internal == 0) { \ + if (s == 2) ++i; \ + kp = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + --x->n; \ + return kp; \ + } \ + if (r == 0) { \ + if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \ + return kp; \ + } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i + 1]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \ + return kp; \ + } else if (yn == b->t - 1 && zn == b->t - 1) { \ + y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \ + __KB_KEY(key_t, y)[y->n++] = *k; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \ + y->n += z->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(z); \ + return __kb_delp_aux_##name(b, y, k, s); \ + } \ + } \ + ++i; \ + if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \ + if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \ + memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \ + __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \ + if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \ + --y->n; ++xp->n; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \ + __KB_KEY(key_t, xp)[xp->n++] = 
__KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \ + if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \ + --y->n; \ + memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \ + } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \ + __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + y->n += xp->n; \ + memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \ + --x->n; \ + free(xp); \ + xp = y; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \ + __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \ + xp->n += y->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(y); \ + } \ + } \ + return __kb_delp_aux_##name(b, xp, k, s); \ + } \ + static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *x; \ + key_t ret; \ + ret = __kb_delp_aux_##name(b, b->root, k, 0); \ + --b->n_keys; \ + if (b->root->n == 0 && b->root->is_internal) { \ + --b->n_nodes; \ + x = b->root; \ + b->root = __KB_PTR(b, x)[0]; \ + free(x); \ + } \ + return ret; \ + } \ + static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_delp_##name(b, &k); \ + } + +typedef struct { + kbnode_t 
*x; + int i; +} __kbstack_t; + +#define __kb_traverse(key_t, b, __func) do { \ + int __kmax = 8; \ + __kbstack_t *__kstack, *__kp; \ + __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \ + __kp->x = (b)->root; __kp->i = 0; \ + for (;;) { \ + while (__kp->x && __kp->i <= __kp->x->n) { \ + if (__kp - __kstack == __kmax - 1) { \ + __kmax <<= 1; \ + __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \ + __kp = __kstack + (__kmax>>1) - 1; \ + } \ + (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \ + ++__kp; \ + } \ + --__kp; \ + if (__kp >= __kstack) { \ + if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \ + ++__kp->i; \ + } else break; \ + } \ + free(__kstack); \ + } while (0) + +#define KBTREE_INIT(name, key_t, __cmp) \ + __KB_TREE_T(name) \ + __KB_INIT(name, key_t) \ + __KB_GET_AUX1(name, key_t, __cmp) \ + __KB_GET(name, key_t) \ + __KB_INTERVAL(name, key_t) \ + __KB_PUT(name, key_t, __cmp) \ + __KB_DEL(name, key_t) + +#define KB_DEFAULT_SIZE 512 + +#define kbtree_t(name) kbtree_##name##_t +#define kb_init(name, s) kb_init_##name(s) +#define kb_destroy(name, b) __kb_destroy(b) +#define kb_get(name, b, k) kb_get_##name(b, k) +#define kb_put(name, b, k) kb_put_##name(b, k) +#define kb_del(name, b, k) kb_del_##name(b, k) +#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u) +#define kb_getp(name, b, k) kb_getp_##name(b, k) +#define kb_putp(name, b, k) kb_putp_##name(b, k) +#define kb_delp(name, b, k) kb_delp_##name(b, k) +#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u) + +#define kb_size(b) ((b)->n_keys) + +#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b))) +#define kb_str_cmp(a, b) strcmp(a, b) + +#endif From 7ab4b3321fd9f44a0f6f30d82eb9a3e326801ee2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 15:26:34 -0500 Subject: [PATCH 013/169] bugfix: memory leak --- bwamem.c | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index 65807fe..62df4e5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -42,10 +42,10 @@ smem_i *smem_itr_init(const bwt_t *bwt) void smem_itr_destroy(smem_i *itr) { - free(itr->tmpvec[0]->a); - free(itr->tmpvec[1]->a); - free(itr->matches->a); - free(itr->sub->a); + free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); + free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); + free(itr->matches->a); free(itr->matches); + free(itr->sub->a); free(itr->sub); free(itr); } From 00e5302219a595c34ffee2b05a0c7e78883ac2a7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 16:39:50 -0500 Subject: [PATCH 014/169] routine to get subsequence from 2-bit pac --- bntseq.c | 27 +++++++++++++++++++++++++++ bntseq.h | 1 + bwamem.c | 48 ++++++++++++++++++++++++++++++++---------------- bwamem.h | 23 +++++++++++++++-------- fastmap.c | 10 +++++++--- 5 files changed, 82 insertions(+), 27 deletions(-) diff --git a/bntseq.c b/bntseq.c index adcd2d7..18abb2b 100644 --- a/bntseq.c +++ b/bntseq.c @@ -321,3 +321,30 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) } return nn; } + +static inline void get_seq_core(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, uint8_t *seq) +{ + int64_t k, l = 0; + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + seq[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + seq[l++] = _get_pac(pac, k); + } +} + +uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) +{ + uint8_t *seq; + if (end > l_pac<<1) end = l_pac<<1; + *len = end - beg; + seq = malloc(end - beg); + if (beg < l_pac && end > l_pac) { + get_seq_core(l_pac, pac, beg, l_pac, seq); + get_seq_core(l_pac, pac, l_pac, end, seq + (l_pac - beg)); + } else get_seq_core(l_pac, pac, beg, end, seq); + return seq; +} diff --git a/bntseq.h 
b/bntseq.h index 843db64..d4096b4 100644 --- a/bntseq.h +++ b/bntseq.h @@ -72,6 +72,7 @@ extern "C" { void bns_destroy(bntseq_t *bns); int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); + uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); #ifdef __cplusplus } diff --git a/bwamem.c b/bwamem.c index 62df4e5..ef09fd5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -3,11 +3,12 @@ #include #include "bwamem.h" #include "kvec.h" +#include "bntseq.h" -memopt_t *mem_opt_init() +mem_opt_t *mem_opt_init() { - memopt_t *o; - o = calloc(1, sizeof(memopt_t)); + mem_opt_t *o; + o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100; o->min_seed_len = 17; o->max_occ = 10; @@ -95,12 +96,12 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) #include "kbtree.h" #define chain_cmp(a, b) ((a).pos - (b).pos) -KBTREE_INIT(chn, memchain1_t, chain_cmp) +KBTREE_INIT(chn, mem_chain1_t, chain_cmp) -static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *p) +static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_t *p) { int64_t qend, rend, x, y; - const memseed_t *last = &c->seeds[c->n-1]; + const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) @@ -110,7 +111,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; - c->seeds = realloc(c->seeds, c->m * sizeof(memseed_t)); + c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); } c->seeds[c->n++] = *p; return 1; @@ -118,7 +119,7 @@ static int 
test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * return 0; // request to add a new chain } -static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) +static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM @@ -129,8 +130,8 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it int64_t k; if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive for (k = 0; k < p->x[2]; ++k) { - memchain1_t tmp, *lower, *upper; - memseed_t s; + mem_chain1_t tmp, *lower, *upper; + mem_seed_t s; int to_add = 0; s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; @@ -141,7 +142,7 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it } else to_add = 1; if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; - tmp.seeds = calloc(tmp.m, sizeof(memseed_t)); + tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds[0] = s; kb_putp(chn, tree, &tmp); } @@ -150,13 +151,13 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it } } -memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) { - memchain_t chain; + mem_chain_t chain; smem_i *itr; kbtree_t(chn) *tree; - memset(&chain, 0, sizeof(memchain_t)); + memset(&chain, 0, sizeof(mem_chain_t)); if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); @@ -164,13 +165,28 @@ memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8 mem_insert_seed(opt, tree, itr); chain.m = 
kb_size(tree); chain.n = 0; - chain.chains = malloc(chain.m * sizeof(memchain1_t)); + chain.chains = malloc(chain.m * sizeof(mem_chain1_t)); #define traverse_func(p_) (chain.chains[chain.n++] = *(p_)) - __kb_traverse(memchain1_t, tree, traverse_func); + __kb_traverse(mem_chain1_t, tree, traverse_func); #undef traverse_func smem_itr_destroy(itr); kb_destroy(chn, tree); return chain; } + +mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) +{ + mem_aln_t a; + int i, j; + int64_t len; + for (i = 0; i < c->n; ++i) { + mem_seed_t *s = &c->seeds[i]; + uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len); + for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n'); + for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n'); + free(seq); + } + return a; +} diff --git a/bwamem.h b/bwamem.h index eb79586..0ebd2eb 100644 --- a/bwamem.h +++ b/bwamem.h @@ -9,23 +9,29 @@ typedef struct __smem_i smem_i; typedef struct { int64_t rbeg; int32_t qbeg, len; -} memseed_t; +} mem_seed_t; typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; -} memopt_t; +} mem_opt_t; typedef struct { int n, m; int64_t pos; - memseed_t *seeds; -} memchain1_t; + mem_seed_t *seeds; +} mem_chain1_t; typedef struct { int n, m; - memchain1_t *chains; -} memchain_t; + mem_chain1_t *chains; +} mem_chain_t; + +typedef struct { + int64_t pos; + int n_cigar, len, score; + uint32_t *cigar; +} mem_aln_t; #ifdef __cplusplus extern "C" { @@ -36,9 +42,10 @@ void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); const bwtintv_v *smem_next(smem_i *itr, int split_len); -memopt_t *mem_opt_init(void); +mem_opt_t *mem_opt_init(void); -memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +mem_aln_t 
mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index 42122b4..d8a0eca 100644 --- a/fastmap.c +++ b/fastmap.c @@ -13,12 +13,13 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { - memopt_t *opt; + mem_opt_t *opt; bwt_t *bwt; bntseq_t *bns; int i, j, c; gzFile *fp; kseq_t *seq; + uint8_t *pac = 0; opt = mem_opt_init(); while ((c = getopt(argc, argv, "")) >= 0) { @@ -40,15 +41,18 @@ int main_mem(int argc, char *argv[]) bwt_restore_sa(tmp, bwt); free(tmp); bns = bns_restore(argv[optind]); + pac = calloc(bns->l_pac/4+1, 1); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); } while (kseq_read(seq) >= 0) { - memchain_t chain; + mem_chain_t chain; printf(">%s\n", seq->name.s); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); for (i = 0; i < chain.n; ++i) { - memchain1_t *p = &chain.chains[i]; + mem_chain1_t *p = &chain.chains[i]; + mem_chain2aln(bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p); printf("%d\t%d", i, p->n); for (j = 0; j < p->n; ++j) { bwtint_t pos; From d25a87cc500b4cb654a3299818ce6982f6667ed2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 2 Feb 2013 15:14:24 -0500 Subject: [PATCH 015/169] code backup --- bwamem.c | 6 ++++-- bwamem.h | 2 +- fastmap.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index ef09fd5..02f9591 100644 --- a/bwamem.c +++ b/bwamem.c @@ -176,10 +176,10 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin return chain; } -mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) +mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) { mem_aln_t a; - int i, j; + int i, j, max, 
max_i; int64_t len; for (i = 0; i < c->n; ++i) { mem_seed_t *s = &c->seeds[i]; @@ -188,5 +188,7 @@ mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const ui for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n'); free(seq); } + for (i = max = 0, max_i = -1; i < c->n; ++i) // find the longest seed + if (max < c->seeds[i].len) max = c->seeds[i].len, max_i = i; return a; } diff --git a/bwamem.h b/bwamem.h index 0ebd2eb..214d780 100644 --- a/bwamem.h +++ b/bwamem.h @@ -45,7 +45,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len); mem_opt_t *mem_opt_init(void); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); -mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); +mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index d8a0eca..c92311e 100644 --- a/fastmap.c +++ b/fastmap.c @@ -52,7 +52,7 @@ int main_mem(int argc, char *argv[]) chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); for (i = 0; i < chain.n; ++i) { mem_chain1_t *p = &chain.chains[i]; - mem_chain2aln(bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p); + mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p); printf("%d\t%d", i, p->n); for (j = 0; j < p->n; ++j) { bwtint_t pos; From 92b084e553fd03c617ab60640609b40a9f89eaca Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 2 Feb 2013 16:38:21 -0500 Subject: [PATCH 016/169] reimplemented SW extension; not tested yet --- ksw.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 4 deletions(-) diff --git a/ksw.c b/ksw.c index bd29e96..9ee8453 100644 --- a/ksw.c +++ b/ksw.c @@ -23,12 +23,13 @@ SOFTWARE. 
*/ -#ifndef _NO_SSE2 #include #include -#include #include "ksw.h" +#ifndef _NO_SSE2 +#include + #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x),1) #define UNLIKELY(x) __builtin_expect((x),0) @@ -37,6 +38,10 @@ #define UNLIKELY(x) (x) #endif +/*************** + *** SSE2 SW *** + ***************/ + struct _ksw_query_t { int qlen, slen; uint8_t shift, mdiff, max, size; @@ -300,11 +305,82 @@ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) else return ksw_sse2_8(q, tlen, target, a); } +#endif // _NO_SSE2 + +/******************** + *** SW extension *** + ********************/ + +typedef struct { + int32_t h, e; +} eh_t; + +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos) +{ + eh_t *eh; + int8_t *qp; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j; + // allocate memory + eh = calloc(qlen + 1, 8); + qp = malloc(qlen * m); + // generate the query profile + for (j = i = 0; j < qlen; ++j) { + const int8_t *p = &mat[query[j] * m]; + for (k = 0; k < m; ++j) qp[i++] = p[k]; + } + // DP loop + eh[0].h = h0; max = 0, max_i = max_j = -1; + beg = 0, end = 1; + for (i = 0; LIKELY(i < tlen); ++i) { + int f = 0, h1 = 0, m = 0, mj = -1, t; + // apply the band and the constraint (if provided) + t = (qw && qw[i] < w)? 
qw[i] : w; // this is the band width at $i + if (beg < i - t) beg = i - t; + if (end > i + t + 1) end = i + t + 1; + if (end > qlen) end = qlen; + for (j = beg; LIKELY(j < end); ++j) { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape + eh_t *p = &eh[j]; + int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) + p->h = h1; // set H(i,j-1) for the next row + h += qp[j]; + h = h > e? h : e; + h = h > f? h : f; + h1 = h; // save H(i,j) to h1 for the next column + mj = m > h? mj : j; + m = m > h? m : h; // m is stored at eh[mj+1] + h -= gapoe; + h = h > 0? h : 0; + e -= gape; + e = e > h? e : h; // computed E(i+1,j) + p->e = e; // save E(i+1,j) for the next row + f -= gape; + f = f > h? f : h; // computed F(i,j+1) + } + eh[end].h = h1; eh[end].e = 0; + if (m == 0) break; + if (m > max) max = m, max_i = i, max_j = mj; + // update beg and end for the next round + for (j = mj; j > beg && eh[j].h; --j); + beg = j + 1; + for (j = mj + 2; j <= end && eh[j].h; ++j); + end = j; + } + free(eh); free(qp); + if (_qpos) *_qpos = max_i; + if (_tpos) *_tpos = max_j; + return max; +} + /******************************************* * Main function (not compiled by default) * *******************************************/ -#ifdef _KSW_MAIN +#if defined(_KSW_MAIN) && !defined(_NO_SSE2) #include #include @@ -398,4 +474,3 @@ int main(int argc, char *argv[]) return 0; } #endif // _KSW_MAIN -#endif // _NO_SSE2 From e8a1962efe7620114d505599c2e21709b5a46637 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 3 Feb 2013 17:25:40 -0500 Subject: [PATCH 017/169] code backup; it is wrong --- ksw.c | 27 +++++++++++++++++---------- ksw.h | 4 ++++ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/ksw.c b/ksw.c index 9ee8453..440fa50 
100644 --- a/ksw.c +++ b/ksw.c @@ -314,7 +314,7 @@ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) typedef struct { int32_t h, e; } eh_t; - +#include int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos) { eh_t *eh; @@ -324,20 +324,24 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh = calloc(qlen + 1, 8); qp = malloc(qlen * m); // generate the query profile - for (j = i = 0; j < qlen; ++j) { - const int8_t *p = &mat[query[j] * m]; - for (k = 0; k < m; ++j) qp[i++] = p[k]; + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; } + // fill the first row + eh[0].h = h0; // DP loop - eh[0].h = h0; max = 0, max_i = max_j = -1; - beg = 0, end = 1; + max = 0, max_i = max_j = -1; + beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1 = 0, m = 0, mj = -1, t; + int8_t *q = &qp[target[i] * qlen]; // apply the band and the constraint (if provided) t = (qw && qw[i] < w)? qw[i] : w; // this is the band width at $i if (beg < i - t) beg = i - t; if (end > i + t + 1) end = i + t + 1; if (end > qlen) end = qlen; + printf("[%d]\t%d,%d", i, beg, end); for (j = beg; LIKELY(j < end); ++j) { // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) // Similar to SSE2-SW, cells are computed in the following order: @@ -347,7 +351,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh_t *p = &eh[j]; int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) p->h = h1; // set H(i,j-1) for the next row - h += qp[j]; + h += q[j]; h = h > e? h : e; h = h > f? 
h : f; h1 = h; // save H(i,j) to h1 for the next column @@ -360,19 +364,22 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, p->e = e; // save E(i+1,j) for the next row f -= gape; f = f > h? f : h; // computed F(i,j+1) + printf("\t%d:%d", j, h1); } + putchar('\n'); eh[end].h = h1; eh[end].e = 0; if (m == 0) break; if (m > max) max = m, max_i = i, max_j = mj; // update beg and end for the next round - for (j = mj; j > beg && eh[j].h; --j); + for (j = mj; j >= beg && eh[j].h; --j); beg = j + 1; for (j = mj + 2; j <= end && eh[j].h; ++j); end = j; + beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); - if (_qpos) *_qpos = max_i; - if (_tpos) *_tpos = max_j; + if (_qpos) *_qpos = max_i + 1; + if (_tpos) *_tpos = max_j + 1; return max; } diff --git a/ksw.h b/ksw.h index d93d6a9..b7b9c40 100644 --- a/ksw.h +++ b/ksw.h @@ -1,6 +1,8 @@ #ifndef __AC_KSW_H #define __AC_KSW_H +#include + struct _ksw_query_t; typedef struct _ksw_query_t ksw_query_t; @@ -47,6 +49,8 @@ extern "C" { /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos); + #ifdef __cplusplus } #endif From 20933982318a9fe9cd9740ff283e88a7db714c5b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 3 Feb 2013 17:47:57 -0500 Subject: [PATCH 018/169] bugfix: the first line is wrong --- ksw.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/ksw.c b/ksw.c index 440fa50..6277bf6 100644 --- a/ksw.c +++ b/ksw.c @@ -320,6 +320,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh_t *eh; int8_t *qp; int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j; + if (h0 < 0) h0 = 0; // allocate memory eh = calloc(qlen + 1, 8); qp = 
malloc(qlen * m); @@ -329,19 +330,23 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; } // fill the first row - eh[0].h = h0; + eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0; + for (j = 2; j <= qlen && eh[j-1].h > gape; ++j) + eh[j].h = eh[j-1].h - gape; // DP loop - max = 0, max_i = max_j = -1; + max = h0, max_i = max_j = 0; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { - int f = 0, h1 = 0, m = 0, mj = -1, t; + int f = 0, h1, m = 0, mj = -1, t; int8_t *q = &qp[target[i] * qlen]; + // compute the first column + h1 = h0 - (gapo + gape * (i + 1)); + if (h1 < 0) h1 = 0; // apply the band and the constraint (if provided) t = (qw && qw[i] < w)? qw[i] : w; // this is the band width at $i if (beg < i - t) beg = i - t; if (end > i + t + 1) end = i + t + 1; if (end > qlen) end = qlen; - printf("[%d]\t%d,%d", i, beg, end); for (j = beg; LIKELY(j < end); ++j) { // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) // Similar to SSE2-SW, cells are computed in the following order: @@ -364,9 +369,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, p->e = e; // save E(i+1,j) for the next row f -= gape; f = f > h? 
f : h; // computed F(i,j+1) - printf("\t%d:%d", j, h1); } - putchar('\n'); eh[end].h = h1; eh[end].e = 0; if (m == 0) break; if (m > max) max = m, max_i = i, max_j = mj; @@ -375,7 +378,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, beg = j + 1; for (j = mj + 2; j <= end && eh[j].h; ++j); end = j; - beg = 0; end = qlen; // uncomment this line for debugging + //beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); if (_qpos) *_qpos = max_i + 1; From f83dea36d85fd8c1c8e7d3f84638c77202896bbd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 3 Feb 2013 18:16:43 -0500 Subject: [PATCH 019/169] no effective changes --- ksw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ksw.c b/ksw.c index 6277bf6..437f563 100644 --- a/ksw.c +++ b/ksw.c @@ -314,7 +314,7 @@ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) typedef struct { int32_t h, e; } eh_t; -#include + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos) { eh_t *eh; From ba18db1a9fe753e81dde187a4432a9795e44fdb0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 12:37:38 -0500 Subject: [PATCH 020/169] sw extension works for the simplest case --- bwamem.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++--------- bwamem.h | 2 ++ fastmap.c | 1 + ksw.c | 17 +++++++++----- ksw.h | 2 +- 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index 02f9591..b9e7f68 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1,9 +1,22 @@ #include #include #include +#include #include "bwamem.h" #include "kvec.h" #include "bntseq.h" +#include "ksw.h" + +void mem_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 
a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} mem_opt_t *mem_opt_init() { @@ -13,6 +26,7 @@ mem_opt_t *mem_opt_init() o->min_seed_len = 17; o->max_occ = 10; o->max_chain_gap = 10000; + mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -176,19 +190,52 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin return chain; } +static inline int cal_max_gap(const mem_opt_t *opt, int qlen) +{ + int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); + return l > 1? l : 1; +} + mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) { mem_aln_t a; - int i, j, max, max_i; - int64_t len; - for (i = 0; i < c->n; ++i) { - mem_seed_t *s = &c->seeds[i]; - uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len); - for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n'); - for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n'); - free(seq); + int i, j, qbeg, qend, score; + int64_t k, rlen, rbeg, rend, rmax[2], tmp; + mem_seed_t *s; + uint8_t *rseq = 0; + // get the start and end of the seeded region + rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; + s = &c->seeds[c->n-1]; + rend = s->rbeg + s->len; qend = s->qbeg + s->len; + // get the max possible span + rmax[0] = rbeg - (qbeg + cal_max_gap(opt, qbeg)); + rmax[1] = rend + ((l_query - qend) + cal_max_gap(opt, l_query - qend)); + if (rmax[0] < 0) rmax[0] = 0; + if (rmax[1] > l_pac<<1) rmax[1] = l_pac<<1; + // retrieve the reference sequence + rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); + + if (qbeg) { // left extension of the first seed + uint8_t *rs, *qs; + int qle, tle; + qs = malloc(qbeg); + for (i = 0; i < qbeg; ++i) qs[i] = query[qbeg - 1 - i]; + tmp = rbeg - rmax[0]; + rs = malloc(tmp); + for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; + score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, 
opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + free(qs); free(rs); + } else score = c->seeds[0].len * opt->a; + + if (c->seeds[0].qbeg + c->seeds[0].len != l_query) { // right extension of the first seed + int qle, tle, qe, re; + s = &c->seeds[0]; + qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; + for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); + for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); + score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, score, 0, &qle, &tle); + printf("[%d] score=%d\tqle=%d\trle=%d\n", c->n, score, qle, tle); } - for (i = max = 0, max_i = -1; i < c->n; ++i) // find the longest seed - if (max < c->seeds[i].len) max = c->seeds[i].len, max_i = i; + free(rseq); return a; } diff --git a/bwamem.h b/bwamem.h index 214d780..b026de4 100644 --- a/bwamem.h +++ b/bwamem.h @@ -14,6 +14,7 @@ typedef struct { typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; + int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; typedef struct { @@ -43,6 +44,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query); const bwtintv_v *smem_next(smem_i *itr, int split_len); mem_opt_t *mem_opt_init(void); +void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); diff --git a/fastmap.c b/fastmap.c index c92311e..4a677c3 100644 --- a/fastmap.c +++ b/fastmap.c @@ -31,6 +31,7 @@ int main_mem(int argc, char *argv[]) free(opt); return 1; } + mem_fill_scmat(opt->a, opt->b, opt->mat); fp = gzopen(argv[optind + 1], "r"); seq = kseq_init(fp); { // load the packed sequences, BWT and SA diff --git a/ksw.c b/ksw.c index 437f563..763c774 
100644 --- a/ksw.c +++ b/ksw.c @@ -315,11 +315,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle) { eh_t *eh; int8_t *qp; - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; if (h0 < 0) h0 = 0; // allocate memory eh = calloc(qlen + 1, 8); @@ -333,8 +333,15 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0; for (j = 2; j <= qlen && eh[j-1].h > gape; ++j) eh[j].h = eh[j-1].h - gape; + // adjust $w if it is too large + k = m * m; + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i]? max : mat[i]; + max_gap = (int)((double)(qlen * max - gapo) / gape + 1.); + max_gap = max_gap > 1? max_gap : 1; + w = w < max_gap? 
w : max_gap; // DP loop - max = h0, max_i = max_j = 0; + max = h0, max_i = max_j = -1; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1, m = 0, mj = -1, t; @@ -381,8 +388,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, //beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); - if (_qpos) *_qpos = max_i + 1; - if (_tpos) *_tpos = max_j + 1; + if (_qle) *_qle = max_i + 1; + if (_tle) *_tle = max_j + 1; return max; } diff --git a/ksw.h b/ksw.h index b7b9c40..3c9b959 100644 --- a/ksw.h +++ b/ksw.h @@ -49,7 +49,7 @@ extern "C" { /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle); #ifdef __cplusplus } From 666638a953a8033913f8e8b8a5da7e24c5ec4c45 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 14:51:51 -0500 Subject: [PATCH 021/169] changed the default scoring --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index b9e7f68..54ee3f1 100644 --- a/bwamem.c +++ b/bwamem.c @@ -22,7 +22,7 @@ mem_opt_t *mem_opt_init() { mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); - o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100; + o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; o->min_seed_len = 17; o->max_occ = 10; o->max_chain_gap = 10000; From 5bfa45a69b7c9bf8532ec6086e1e6e4ba4b31710 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 15:02:56 -0500 Subject: [PATCH 022/169] write the mem_aln_t struct --- bwamem.c | 24 +++++++++++++++--------- bwamem.h | 4 ++-- 2 files changed, 17 
insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index 54ee3f1..5fbe937 100644 --- a/bwamem.c +++ b/bwamem.c @@ -199,10 +199,12 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) { mem_aln_t a; - int i, j, qbeg, qend, score; - int64_t k, rlen, rbeg, rend, rmax[2], tmp; + int i, j, qbeg, qend; + int64_t rlen, rbeg, rend, rmax[2], tmp; mem_seed_t *s; uint8_t *rseq = 0; + + memset(&a, 0, sizeof(mem_aln_t)); // get the start and end of the seeded region rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; s = &c->seeds[c->n-1]; @@ -223,19 +225,23 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, tmp = rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + a.score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + a.qb = qbeg - qle; a.rb = rbeg - tle; free(qs); free(rs); - } else score = c->seeds[0].len * opt->a; + } else a.score = c->seeds[0].len * opt->a, a.qb = 0, a.rb = rbeg; - if (c->seeds[0].qbeg + c->seeds[0].len != l_query) { // right extension of the first seed + s = &c->seeds[0]; + if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; - s = &c->seeds[0]; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); - score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, score, 0, &qle, &tle); - printf("[%d] score=%d\tqle=%d\trle=%d\n", c->n, score, qle, tle); - } + a.score = 
ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a.score, 0, &qle, &tle); + a.qe = qe + qle; a.re = rmax[0] + re + tle; + } else a.qe = l_query, a.re = s->rbeg + s->len; + + printf("[%d] score=%d\t[%d,%d)\t[%lld,%lld)\n", c->n, a.score, a.qb, a.qe, a.rb, a.re); + free(rseq); return a; } diff --git a/bwamem.h b/bwamem.h index b026de4..7bea0ad 100644 --- a/bwamem.h +++ b/bwamem.h @@ -29,8 +29,8 @@ typedef struct { } mem_chain_t; typedef struct { - int64_t pos; - int n_cigar, len, score; + int64_t pos, rb, re; + int n_cigar, len, score, qb, qe; uint32_t *cigar; } mem_aln_t; From f27bd18f2025598bd73b1aee4858ca5f4c43e72f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 15:09:47 -0500 Subject: [PATCH 023/169] check if every seed is included; not used for now --- bwamem.c | 8 +++++++- bwamem.h | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 5fbe937..f373ef0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -240,7 +240,13 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, a.qe = qe + qle; a.re = rmax[0] + re + tle; } else a.qe = l_query, a.re = s->rbeg + s->len; - printf("[%d] score=%d\t[%d,%d)\t[%lld,%lld)\n", c->n, a.score, a.qb, a.qe, a.rb, a.re); + a.is_all = 1; + if (c->n > 1) { // check if all the seeds have been included + s = &c->seeds[c->n - 1]; + if (s->qbeg + s->len > a.qe) a.is_all = 0; + } + + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\n", c->n, a.score, a.qb, a.qe, a.rb, a.re, a.is_all); free(rseq); return a; diff --git a/bwamem.h b/bwamem.h index 7bea0ad..fae4529 100644 --- a/bwamem.h +++ b/bwamem.h @@ -30,7 +30,7 @@ typedef struct { typedef struct { int64_t pos, rb, re; - int n_cigar, len, score, qb, qe; + int n_cigar, len, score, qb, qe, is_all; uint32_t *cigar; } mem_aln_t; From 788e9d1e3dad7c5477d075371af81f45f1ff55b9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 15:40:26 -0500 
Subject: [PATCH 024/169] fixed a couple of leaks; buggy atm --- bwamem.c | 21 +++++++++++++++++++-- fastmap.c | 10 ++++++++-- ksw.c | 4 ++-- ksw.h | 2 +- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index f373ef0..032a54e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -10,7 +10,7 @@ void mem_fill_scmat(int a, int b, int8_t mat[25]) { int i, j, k; - for (i = k = 0; i < 5; ++i) { + for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? a : -b; mat[k++] = 0; // ambiguous base @@ -233,11 +233,28 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, s = &c->seeds[0]; if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; + int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); - a.score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a.score, 0, &qle, &tle); + if (c->n > 1) { // generate $qw + int l = rmax[1] - (s->rbeg + s->len); + assert(l >= 0 && l < 1000); + qw = malloc(l * 2); + for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default + for (i = 1; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + for (j = 0; j < t->len; ++j) { + int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); + assert(x < l); + if (qw[x] == -1) qw[x] = x > y? 
x - y : y - x; + else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint + } + } + } + a.score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a.score, qw, &qle, &tle); a.qe = qe + qle; a.re = rmax[0] + re + tle; + free(qw); } else a.qe = l_query, a.re = s->rbeg + s->len; a.is_all = 1; diff --git a/fastmap.c b/fastmap.c index 4a677c3..f3100c7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -17,7 +17,7 @@ int main_mem(int argc, char *argv[]) bwt_t *bwt; bntseq_t *bns; int i, j, c; - gzFile *fp; + gzFile fp; kseq_t *seq; uint8_t *pac = 0; @@ -66,9 +66,15 @@ int main_mem(int argc, char *argv[]) putchar('\n'); } puts("//"); + for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); + free(chain.chains); } - free(opt); + free(pac); free(opt); + bns_destroy(bns); + bwt_destroy(bwt); + kseq_destroy(seq); + gzclose(fp); return 0; } diff --git a/ksw.c b/ksw.c index 763c774..05f597d 100644 --- a/ksw.c +++ b/ksw.c @@ -315,7 +315,7 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle) { eh_t *eh; int8_t *qp; @@ -350,7 +350,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, h1 = h0 - (gapo + gape * (i + 1)); if (h1 < 0) h1 = 0; // apply the band and the constraint (if provided) - t = (qw && qw[i] < w)? qw[i] : w; // this is the band width at $i + t = (qw && qw[i] >= 0 && qw[i] < w)? 
qw[i] : w; // this is the band width at $i if (beg < i - t) beg = i - t; if (end > i + t + 1) end = i + t + 1; if (end > qlen) end = qlen; diff --git a/ksw.h b/ksw.h index 3c9b959..220a8d7 100644 --- a/ksw.h +++ b/ksw.h @@ -49,7 +49,7 @@ extern "C" { /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle); #ifdef __cplusplus } From 29c8546679ac986141c70b4f7351e766c0e385f3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 16:08:00 -0500 Subject: [PATCH 025/169] better ref extraction --- bwamem.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/bwamem.c b/bwamem.c index 032a54e..d7241b6 100644 --- a/bwamem.c +++ b/bwamem.c @@ -197,23 +197,26 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) } mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) -{ +{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds mem_aln_t a; - int i, j, qbeg, qend; - int64_t rlen, rbeg, rend, rmax[2], tmp; - mem_seed_t *s; + int i, j, qbeg; + int64_t rlen, rbeg, rmax[2], tmp; + const mem_seed_t *s; uint8_t *rseq = 0; memset(&a, 0, sizeof(mem_aln_t)); // get the start and end of the seeded region rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; - s = &c->seeds[c->n-1]; - rend = s->rbeg + s->len; qend = s->qbeg + s->len; // get the max possible span - rmax[0] = rbeg - (qbeg + cal_max_gap(opt, qbeg)); - rmax[1] = rend + ((l_query - qend) + cal_max_gap(opt, l_query - qend)); 
- if (rmax[0] < 0) rmax[0] = 0; - if (rmax[1] > l_pac<<1) rmax[1] = l_pac<<1; + rmax[0] = l_pac<<1; rmax[1] = 0; + for (i = 0; i < c->n; ++i) { + int64_t b, e; + const mem_seed_t *t = &c->seeds[i]; + b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg)); + e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); + rmax[0] = rmax[0] < b? rmax[0] : b; + rmax[1] = rmax[1] > e? rmax[1] : e; + } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); @@ -239,14 +242,13 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); if (c->n > 1) { // generate $qw int l = rmax[1] - (s->rbeg + s->len); - assert(l >= 0 && l < 1000); qw = malloc(l * 2); for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default for (i = 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; for (j = 0; j < t->len; ++j) { int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); - assert(x < l); + if (x < 0) continue; // overlap with the first seed if (qw[x] == -1) qw[x] = x > y? 
x - y : y - x; else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint } From c589b42fb5c7deda8f843b85ae6f8ecfb77b1ae9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 16:48:11 -0500 Subject: [PATCH 026/169] minor tuning for fewer identical hits --- bwamem.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index d7241b6..3f14d71 100644 --- a/bwamem.c +++ b/bwamem.c @@ -92,7 +92,9 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM i = j = 0; a->n = 0; while (i < itr->matches->n && j < itr->sub->n) { // ordered merge - if (itr->matches->a[i].info < itr->sub->a[j].info) { + int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info); + int64_t xj = itr->matches->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[j].info); + if (xi < xj) { kv_push(bwtintv_t, *a, itr->matches->a[i]); ++i; } else { @@ -120,9 +122,9 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_ rend = last->rbeg + last->len; if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing - x = p->qbeg - last->qbeg; // always positive + x = p->qbeg - last->qbeg; // always non-negtive y = p->rbeg - last->rbeg; - if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain + if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); @@ -190,6 +192,14 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, 
const uin return chain; } +/******************** + * Filtering chains * + ********************/ + +/**************************************** + * Construct the alignment from a chain * + ****************************************/ + static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); @@ -197,7 +207,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) } mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) -{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds +{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds mem_aln_t a; int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; @@ -238,8 +248,8 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int qle, tle, qe, re; int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; - for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); - for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); +// for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); +// for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); if (c->n > 1) { // generate $qw int l = rmax[1] - (s->rbeg + s->len); qw = malloc(l * 2); From 9d0cdb2d3cceadcfffb6483d59ed47fd94aa9ae7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 17:23:06 -0500 Subject: [PATCH 027/169] unfinished chain filter --- bwamem.c | 74 ++++++++++++++++++++++++++++++++++++++++++++----------- bwamem.h | 2 +- fastmap.c | 3 ++- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/bwamem.c b/bwamem.c index 3f14d71..ae70af7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -6,6 +6,7 @@ #include "kvec.h" #include 
"bntseq.h" #include "ksw.h" +#include "ksort.h" void mem_fill_scmat(int a, int b, int8_t mat[25]) { @@ -109,6 +110,10 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) return itr->matches; } +/******************************** + * Chaining while finding SMEMs * + ********************************/ + #include "kbtree.h" #define chain_cmp(a, b) ((a).pos - (b).pos) @@ -196,6 +201,49 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin * Filtering chains * ********************/ +typedef struct { + int beg, end, w; + void *p, *p2; +} flt_aux_t; + +#define flt_lt(a, b) ((a).w > (b).w) +KSORT_INIT(mem_flt, flt_aux_t, flt_lt) + +void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) +{ + flt_aux_t *a; + int i, j, n; + if (chn->n <= 1) return; // no need to filter + a = malloc(sizeof(flt_aux_t) * chn->n); + for (i = 0; i < chn->n; ++i) { + mem_chain1_t *c = &chn->chains[i]; + int w = 0; + for (j = 0; j < c->n; ++j) w += c->len; + a[i].beg = c->seeds[0].qbeg; + a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; + a[i].w = w; + a[i].p = c; + a[i].w2 = 0; a[i].p2 = 0; + } + ks_introsort(mem_flt, chn->n, a); + for (i = 1, n = 1; i < chn->n; ++i) { + for (j = 0; j < n; ++j) { + int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; + int e_min = e[j].end < a[i].end? a[j].end : a[i].end; + if (e_min > b_max) { // have overlap + int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg; + if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap + if (a[j].p2 == 0) a[j].p2 = a[i].p; + if (a[i].w < a[j].w * opt->chain_drop_ratio) + break; + } + } + } + if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. + } + free(a); +} + /**************************************** * Construct the alignment from a chain * ****************************************/ @@ -206,15 +254,14 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l > 1? 
l : 1; } -mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - mem_aln_t a; int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; - memset(&a, 0, sizeof(mem_aln_t)); + memset(a, 0, sizeof(mem_aln_t)); // get the start and end of the seeded region rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; // get the max possible span @@ -238,18 +285,16 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, tmp = rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a.score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); - a.qb = qbeg - qle; a.rb = rbeg - tle; + a->score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + a->qb = qbeg - qle; a->rb = rbeg - tle; free(qs); free(rs); - } else a.score = c->seeds[0].len * opt->a, a.qb = 0, a.rb = rbeg; + } else a->score = c->seeds[0].len * opt->a, a->qb = 0, a->rb = rbeg; s = &c->seeds[0]; if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; -// for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); -// for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); if (c->n > 1) { // generate $qw int l = rmax[1] - (s->rbeg + s->len); qw = malloc(l * 2); @@ -264,19 +309,18 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, } } } - 
a.score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a.score, qw, &qle, &tle); - a.qe = qe + qle; a.re = rmax[0] + re + tle; + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); + a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); - } else a.qe = l_query, a.re = s->rbeg + s->len; + } else a->qe = l_query, a->re = s->rbeg + s->len; - a.is_all = 1; + a->is_all = 1; if (c->n > 1) { // check if all the seeds have been included s = &c->seeds[c->n - 1]; - if (s->qbeg + s->len > a.qe) a.is_all = 0; + if (s->qbeg + s->len > a->qe) a->is_all = 0; } - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\n", c->n, a.score, a.qb, a.qe, a.rb, a.re, a.is_all); + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\n", c->n, a->score, a->qb, a->qe, a->rb, a->re, a->is_all); free(rseq); - return a; } diff --git a/bwamem.h b/bwamem.h index fae4529..0484edf 100644 --- a/bwamem.h +++ b/bwamem.h @@ -47,7 +47,7 @@ mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); -mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index f3100c7..797a22f 100644 --- a/fastmap.c +++ b/fastmap.c @@ -53,7 +53,8 @@ int main_mem(int argc, char *argv[]) chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); for (i = 0; i < chain.n; ++i) { mem_chain1_t *p = &chain.chains[i]; - mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p); + mem_aln_t a; + mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, 
(uint8_t*)seq->seq.s, p, &a); printf("%d\t%d", i, p->n); for (j = 0; j < p->n; ++j) { bwtint_t pos; From d6a73c9171c14ac4dbfa1c9a2194c0d945ea41eb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 00:17:20 -0500 Subject: [PATCH 028/169] chain filtering apparently working --- bwamem.c | 28 +++++++++++++++++++++++++--- bwamem.h | 2 ++ fastmap.c | 1 + 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index ae70af7..33c911d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -27,6 +27,8 @@ mem_opt_t *mem_opt_init() o->min_seed_len = 17; o->max_occ = 10; o->max_chain_gap = 10000; + o->mask_level = 0.50; + o->chain_drop_ratio = 0.33; mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -218,18 +220,18 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) for (i = 0; i < chn->n; ++i) { mem_chain1_t *c = &chn->chains[i]; int w = 0; - for (j = 0; j < c->n; ++j) w += c->len; + for (j = 0; j < c->n; ++j) w += c->seeds[j].len; a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; a[i].w = w; a[i].p = c; - a[i].w2 = 0; a[i].p2 = 0; + a[i].p2 = 0; } ks_introsort(mem_flt, chn->n, a); for (i = 1, n = 1; i < chn->n; ++i) { for (j = 0; j < n; ++j) { int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; - int e_min = e[j].end < a[i].end? a[j].end : a[i].end; + int e_min = a[j].end < a[i].end? a[j].end : a[i].end; if (e_min > b_max) { // have overlap int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap @@ -241,7 +243,27 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) } if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. 
} + for (i = 0; i < n; ++i) { // mark chains to be kept + mem_chain1_t *c = (mem_chain1_t*)a[i].p; + if (c->n > 0) c->n = -c->n; + c = (mem_chain1_t*)a[i].p2; + if (c && c->n > 0) c->n = -c->n; + } free(a); + for (i = 0; i < chn->n; ++i) { // free discarded chains + mem_chain1_t *c = &chn->chains[i]; + if (c->n >= 0) { + free(c->seeds); + c->n = c->m = 0; + } else c->n = -c->n; + } + for (i = n = 0; i < chn->n; ++i) { // squeeze out discarded chains + if (chn->chains[i].n > 0) { + if (n != i) chn->chains[n++] = chn->chains[i]; + else ++n; + } + } + chn->n = n; } /**************************************** diff --git a/bwamem.h b/bwamem.h index 0484edf..adf57dd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -15,6 +15,7 @@ typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset + float mask_level, chain_drop_ratio; } mem_opt_t; typedef struct { @@ -47,6 +48,7 @@ mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn); void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a); #ifdef __cplusplus diff --git a/fastmap.c b/fastmap.c index 797a22f..f02224a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -51,6 +51,7 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); + mem_chain_flt(opt, &chain); for (i = 0; i < chain.n; ++i) { mem_chain1_t *p = &chain.chains[i]; mem_aln_t a; From 7067af833d01f9dbe3ca18295dd11a73fdd87b87 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 00:41:07 -0500 Subject: [PATCH 029/169] fixed a silly bug on sorted merge --- bwamem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/bwamem.c b/bwamem.c index 33c911d..fd7caa2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -28,7 +28,7 @@ mem_opt_t *mem_opt_init() o->max_occ = 10; o->max_chain_gap = 10000; o->mask_level = 0.50; - o->chain_drop_ratio = 0.33; + o->chain_drop_ratio = 0.50; mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -96,7 +96,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) i = j = 0; a->n = 0; while (i < itr->matches->n && j < itr->sub->n) { // ordered merge int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info); - int64_t xj = itr->matches->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[j].info); + int64_t xj = itr->sub->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->sub->a[j].info); if (xi < xj) { kv_push(bwtintv_t, *a, itr->matches->a[i]); ++i; From d91e3209724515feb9fdee69f2222e20f0c63a71 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 12:06:56 -0500 Subject: [PATCH 030/169] towards reimplementing banded NW alignment --- ksw.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ksw.h | 1 + 2 files changed, 57 insertions(+) diff --git a/ksw.c b/ksw.c index 05f597d..6282915 100644 --- a/ksw.c +++ b/ksw.c @@ -393,6 +393,62 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, return max; } +/******************** + * Global alignment * + ********************/ + +#define MINUS_INF -0x40000000 + +int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar) +{ + eh_t *eh; + int8_t *qp; + int i, j, k, gapoe = gapo + gape, score; + // allocate memory + eh = calloc(qlen + 1, 8); + qp = malloc(qlen * m); + // generate the query profile + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; + } + // fill the first row + eh[0].h = 0; eh[0].e = MINUS_INF; + for (j = 1; j <= qlen && j <= w; 
++j) + eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF; + for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; + // DP loop + for (i = 0; LIKELY(i < tlen); ++i) { + int32_t f = MINUS_INF, h1, beg, end; + int8_t *q = &qp[target[i] * qlen]; + beg = i > w? i - w : 0; + end = i + w + 1 < qlen? i + w + 1 : qlen; + h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; + printf("%d\t%d", i, end); + for (j = beg; LIKELY(j < end); ++j) { + eh_t *p = &eh[j]; + int32_t h = p->h, e = p->e; + p->h = h1; + h += q[j]; + h = h > e? h : e; + h = h > f? h : f; + h1 = h; + printf("\t%d:%d", j, h); + h -= gapoe; + e -= gape; + e = e > h? e : h; + p->e = e; + f -= gape; + f = f > h? f : h; + } + putchar('\n'); + eh[end].h = h1; eh[end].e = MINUS_INF; + } + score = eh[qlen].h; + free(eh); free(qp); + return score; +} + /******************************************* * Main function (not compiled by default) * *******************************************/ diff --git a/ksw.h b/ksw.h index 220a8d7..d58f423 100644 --- a/ksw.h +++ b/ksw.h @@ -50,6 +50,7 @@ extern "C" { int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle); + int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar); #ifdef __cplusplus } From 7e1466c8856b4490d4f962766d28ae4fc6742bfe Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 16:05:53 -0500 Subject: [PATCH 031/169] implemented NW backtrack --- ksw.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/ksw.c b/ksw.c index 6282915..7d97b2c 100644 --- a/ksw.c +++ b/ksw.c @@ -27,6 +27,10 @@ #include #include "ksw.h" +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, 
(x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + #ifndef _NO_SSE2 #include @@ -322,8 +326,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; if (h0 < 0) h0 = 0; // allocate memory - eh = calloc(qlen + 1, 8); qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; @@ -399,14 +403,35 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, #define MINUS_INF -0x40000000 -int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar) +typedef struct { + uint8_t h:2, e:1, f:1; +} btmat_t; + +static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) +{ + if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { + if (*n_cigar == *m_cigar) { + *m_cigar = *m_cigar? (*m_cigar)<<1 : 4; + cigar = realloc(cigar, (*m_cigar) << 4); + } + cigar[(*n_cigar)++] = len<<4 | op; + } else cigar[(*n_cigar)-1] += len<<4; + return cigar; +} + +#define cal_j_(i_, k_, w_) ((k_) - ((i_) > (w_)? (i_) - (w_) : 0)) + +int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) { eh_t *eh; int8_t *qp; - int i, j, k, gapoe = gapo + gape, score; + int i, j, k, gapoe = gapo + gape, score, n_col; + btmat_t *z; // allocate memory - eh = calloc(qlen + 1, 8); + n_col = qlen < w? 
qlen : w; + z = malloc(n_col * tlen * sizeof(btmat_t)); qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; @@ -421,31 +446,63 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, for (i = 0; LIKELY(i < tlen); ++i) { int32_t f = MINUS_INF, h1, beg, end; int8_t *q = &qp[target[i] * qlen]; + btmat_t *zi = &z[i * n_col]; beg = i > w? i - w : 0; end = i + w + 1 < qlen? i + w + 1 : qlen; h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; - printf("%d\t%d", i, end); + printf("%d", i); for (j = beg; LIKELY(j < end); ++j) { eh_t *p = &eh[j]; + btmat_t *zij = &zi[j - beg]; int32_t h = p->h, e = p->e; p->h = h1; h += q[j]; + zij->h = h > e? 0 : 1; h = h > e? h : e; + zij->h = h > f? zij->h : 2; h = h > f? h : f; + printf("\t%d:%d:%d", h<-99?-99:h, e<-99?-99:e, f<-99?-99:f); h1 = h; - printf("\t%d:%d", j, h); h -= gapoe; e -= gape; + zij->e = (e > h); // NB: zij->e keeps the direction for the NEXT row, not the current one e = e > h? e : h; p->e = e; f -= gape; + zij->f = (f > h); f = f > h? f : h; + printf(",%d:%d:%d", zij->h, zij->e, zij->f); } putchar('\n'); eh[end].h = h1; eh[end].e = MINUS_INF; } score = eh[qlen].h; - free(eh); free(qp); + if (n_cigar_ && cigar_) { // backtrack + int n_cigar = 0, m_cigar = 0, which; + uint32_t *cigar = 0, tmp; + i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell + which = z[i * n_col + cal_j_(i, k, w)].h; + while (i >= 0 && k >= 0) { + printf("(%d,%d)\t%d\n", i, k, which); + if (which == 0) { + cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1); --i, --k; + if (i >= 0 && k >= 0) which = z[i * n_col + cal_j_(i, k, w)].h; + } else if (which == 1) { + cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1); --i; + if (i >= 0) which = z[i * n_col + cal_j_(i, k, w)].e? 
1 : 0; + } else { // which == 2 + cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1); --k; + if (k >= 0) which = z[i * n_col + cal_j_(i, k, w)].f? 2 : 0; + } + } + printf("(%d,%d)\t%d\n", i, k, which); + if (i > 0) push_cigar(&n_cigar, &m_cigar, cigar, 2, i); + if (k > 0) push_cigar(&n_cigar, &m_cigar, cigar, 1, k); + for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR + tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; + *n_cigar_ = n_cigar, *cigar_ = cigar; + } + free(eh); free(qp); free(z); return score; } From 1bc9712cd827244159c4bde528fb05bebef5abf9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 16:28:15 -0500 Subject: [PATCH 032/169] explicitly use bit to keep bt matrix This also simplifies backtracking. --- ksw.c | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/ksw.c b/ksw.c index 7d97b2c..654ef3e 100644 --- a/ksw.c +++ b/ksw.c @@ -403,10 +403,6 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, #define MINUS_INF -0x40000000 -typedef struct { - uint8_t h:2, e:1, f:1; -} btmat_t; - static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) { if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { @@ -419,17 +415,15 @@ static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, return cigar; } -#define cal_j_(i_, k_, w_) ((k_) - ((i_) > (w_)? (i_) - (w_) : 0)) - int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) { eh_t *eh; int8_t *qp; int i, j, k, gapoe = gapo + gape, score, n_col; - btmat_t *z; + uint8_t *z; // allocate memory - n_col = qlen < w? qlen : w; - z = malloc(n_col * tlen * sizeof(btmat_t)); + n_col = qlen < 2*w+1? 
qlen : 2*w+1; + z = malloc(n_col * tlen); qp = malloc(qlen * m); eh = calloc(qlen + 1, 8); // generate the query profile @@ -446,58 +440,51 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, for (i = 0; LIKELY(i < tlen); ++i) { int32_t f = MINUS_INF, h1, beg, end; int8_t *q = &qp[target[i] * qlen]; - btmat_t *zi = &z[i * n_col]; + uint8_t *zi = &z[i * n_col]; beg = i > w? i - w : 0; end = i + w + 1 < qlen? i + w + 1 : qlen; h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; printf("%d", i); for (j = beg; LIKELY(j < end); ++j) { eh_t *p = &eh[j]; - btmat_t *zij = &zi[j - beg]; int32_t h = p->h, e = p->e; + uint8_t d; // direction p->h = h1; h += q[j]; - zij->h = h > e? 0 : 1; + d = h > e? 0 : 1; h = h > e? h : e; - zij->h = h > f? zij->h : 2; + d = h > f? d : 2; h = h > f? h : f; - printf("\t%d:%d:%d", h<-99?-99:h, e<-99?-99:e, f<-99?-99:f); + printf("\t[%d],%d:%d:%d", j, h<-99?-99:h, e<-99?-99:e, f<-99?-99:f); h1 = h; h -= gapoe; e -= gape; - zij->e = (e > h); // NB: zij->e keeps the direction for the NEXT row, not the current one + d |= e > h? 1<<2 : 0; e = e > h? e : h; p->e = e; f -= gape; - zij->f = (f > h); + d |= f > h? 2<<4 : 0; f = f > h? f : h; - printf(",%d:%d:%d", zij->h, zij->e, zij->f); + zi[j - beg] = d; + printf(",%d:%d:%d", d>>0&3, d>>2&3, d>>4&3); } putchar('\n'); eh[end].h = h1; eh[end].e = MINUS_INF; } score = eh[qlen].h; if (n_cigar_ && cigar_) { // backtrack - int n_cigar = 0, m_cigar = 0, which; + int n_cigar = 0, m_cigar = 0, which = 0; uint32_t *cigar = 0, tmp; i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell - which = z[i * n_col + cal_j_(i, k, w)].h; while (i >= 0 && k >= 0) { + which = z[i * n_col + (k - (i > w? 
i - w : 0))] >> (which<<1) & 3; printf("(%d,%d)\t%d\n", i, k, which); - if (which == 0) { - cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1); --i, --k; - if (i >= 0 && k >= 0) which = z[i * n_col + cal_j_(i, k, w)].h; - } else if (which == 1) { - cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1); --i; - if (i >= 0) which = z[i * n_col + cal_j_(i, k, w)].e? 1 : 0; - } else { // which == 2 - cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1); --k; - if (k >= 0) which = z[i * n_col + cal_j_(i, k, w)].f? 2 : 0; - } + if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; + else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; + else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; } - printf("(%d,%d)\t%d\n", i, k, which); - if (i > 0) push_cigar(&n_cigar, &m_cigar, cigar, 2, i); - if (k > 0) push_cigar(&n_cigar, &m_cigar, cigar, 1, k); + if (i >= 0) push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); + if (k >= 0) push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; *n_cigar_ = n_cigar, *cigar_ = cigar; From 86caae811e6d2a256b19990eb58c469f16aae60c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 16:58:35 -0500 Subject: [PATCH 033/169] added comments --- ksw.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/ksw.c b/ksw.c index 654ef3e..66728d5 100644 --- a/ksw.c +++ b/ksw.c @@ -321,8 +321,8 @@ typedef struct { int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle) { - eh_t *eh; - int8_t *qp; + eh_t *eh; // score array + int8_t *qp; // query profile int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; if (h0 < 0) h0 = 0; // allocate memory @@ -418,11 +418,11 @@ static inline uint32_t 
*push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) { eh_t *eh; - int8_t *qp; + int8_t *qp; // query profile int i, j, k, gapoe = gapo + gape, score, n_col; - uint8_t *z; + uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex // allocate memory - n_col = qlen < 2*w+1? qlen : 2*w+1; + n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix z = malloc(n_col * tlen); qp = malloc(qlen * m); eh = calloc(qlen + 1, 8); @@ -435,17 +435,18 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh[0].h = 0; eh[0].e = MINUS_INF; for (j = 1; j <= qlen && j <= w; ++j) eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF; - for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; + for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band // DP loop - for (i = 0; LIKELY(i < tlen); ++i) { + for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop int32_t f = MINUS_INF, h1, beg, end; int8_t *q = &qp[target[i] * qlen]; uint8_t *zi = &z[i * n_col]; beg = i > w? i - w : 0; - end = i + w + 1 < qlen? i + w + 1 : qlen; + end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; - printf("%d", i); for (j = beg; LIKELY(j < end); ++j) { + // This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except: + // 1) not checking h>0; 2) recording direction for backtracking eh_t *p = &eh[j]; int32_t h = p->h, e = p->e; uint8_t d; // direction @@ -455,7 +456,6 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, h = h > e? h : e; d = h > f? d : 2; h = h > f? 
h : f; - printf("\t[%d],%d:%d:%d", j, h<-99?-99:h, e<-99?-99:e, f<-99?-99:f); h1 = h; h -= gapoe; e -= gape; @@ -463,12 +463,10 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, e = e > h? e : h; p->e = e; f -= gape; - d |= f > h? 2<<4 : 0; + d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two f = f > h? f : h; - zi[j - beg] = d; - printf(",%d:%d:%d", d>>0&3, d>>2&3, d>>4&3); + zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell } - putchar('\n'); eh[end].h = h1; eh[end].e = MINUS_INF; } score = eh[qlen].h; @@ -478,7 +476,6 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell while (i >= 0 && k >= 0) { which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3; - printf("(%d,%d)\t%d\n", i, k, which); if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; From 1e16f3e701b670508b890407cbf9ce2995b4c091 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 17:13:12 -0500 Subject: [PATCH 034/169] calling ksw_global(); ksw_extend() is buggy! --- bwamem.c | 13 +++++++++++-- fastmap.c | 1 + ksw.c | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index fd7caa2..fd164be 100644 --- a/bwamem.c +++ b/bwamem.c @@ -278,7 +278,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. 
When that happens, we should use a SW or extend more seeds - int i, j, qbeg; + int i, j, qbeg, w, nw_score; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; @@ -342,7 +342,16 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg + s->len > a->qe) a->is_all = 0; } - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\n", c->n, a->score, a->qb, a->qe, a->rb, a->re, a->is_all); + w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); + w = w < opt->w? w : opt->w; + w += abs((a->re - a->rb) - (a->qe - a->qb)); + nw_score = ksw_global(a->qe - a->qb, query + a->qb, a->re - a->rb, rseq + (a->rb - rmax[0]), 5, opt->mat, opt->q, opt->r, w, &a->n_cigar, &a->cigar); + + printf("[%d] ", c->n); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + printf("[%d] ", c->n); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); + printf("[%d] score=%d,%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, nw_score, a->qb, a->qe, a->rb, a->re, a->is_all); + for (i = 0; i < a->n_cigar; ++i) printf("%d%c", a->cigar[i]>>4, "MIDS"[a->cigar[i]&0xf]); + putchar('\n'); free(rseq); } diff --git a/fastmap.c b/fastmap.c index f02224a..811149f 100644 --- a/fastmap.c +++ b/fastmap.c @@ -66,6 +66,7 @@ int main_mem(int argc, char *argv[]) printf("\t%d,%d,%s:%c%ld", p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); } putchar('\n'); + free(a.cigar); } puts("//"); for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); diff --git a/ksw.c b/ksw.c index 66728d5..6708e40 100644 --- a/ksw.c +++ b/ksw.c @@ -421,6 +421,7 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t *qp; // query profile int i, j, k, gapoe = gapo + gape, score, n_col; uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will 
be a little more complex + if (n_cigar_) *n_cigar_ = 0; // allocate memory n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix z = malloc(n_col * tlen); From 14e6a7bdb90014e9602fecd733d5e7745986aab8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 17:29:03 -0500 Subject: [PATCH 035/169] fixed a silly bug in ksw_extend() Query return value is assigned to the target variable and vice versa... --- bwamem.c | 6 ++++-- ksw.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index fd164be..1880512 100644 --- a/bwamem.c +++ b/bwamem.c @@ -331,6 +331,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } } } + //printf("[Q] "); for (i = qe; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = re; i < rmax[1] - rmax[0]; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); @@ -347,8 +349,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int w += abs((a->re - a->rb) - (a->qe - a->qb)); nw_score = ksw_global(a->qe - a->qb, query + a->qb, a->re - a->rb, rseq + (a->rb - rmax[0]), 5, opt->mat, opt->q, opt->r, w, &a->n_cigar, &a->cigar); - printf("[%d] ", c->n); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - printf("[%d] ", c->n); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); + //printf("[Q] "); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); printf("[%d] score=%d,%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, nw_score, a->qb, a->qe, a->rb, a->re, a->is_all); for (i = 0; i 
< a->n_cigar; ++i) printf("%d%c", a->cigar[i]>>4, "MIDS"[a->cigar[i]&0xf]); putchar('\n'); diff --git a/ksw.c b/ksw.c index 6708e40..405bd86 100644 --- a/ksw.c +++ b/ksw.c @@ -392,8 +392,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, //beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); - if (_qle) *_qle = max_i + 1; - if (_tle) *_tle = max_j + 1; + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; return max; } From a61288c7683e011fb8f8750043e6fecfb535b256 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 21:49:19 -0500 Subject: [PATCH 036/169] separate CIGAR generation --- bwamem.c | 42 ++++++++++++++++++++++++++++++++++-------- bwamem.h | 3 +-- fastmap.c | 1 - 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index 1880512..54355e3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -278,7 +278,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - int i, j, qbeg, w, nw_score; + int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; @@ -344,16 +344,42 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg + s->len > a->qe) a->is_all = 0; } - w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); - w = w < opt->w? 
w : opt->w; - w += abs((a->re - a->rb) - (a->qe - a->qb)); - nw_score = ksw_global(a->qe - a->qb, query + a->qb, a->re - a->rb, rseq + (a->rb - rmax[0]), 5, opt->mat, opt->q, opt->r, w, &a->n_cigar, &a->cigar); - //printf("[Q] "); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); //printf("[R] "); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); - printf("[%d] score=%d,%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, nw_score, a->qb, a->qe, a->rb, a->re, a->is_all); - for (i = 0; i < a->n_cigar; ++i) printf("%d%c", a->cigar[i]>>4, "MIDS"[a->cigar[i]&0xf]); + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, a->qb, a->qe, a->rb, a->re, a->is_all); putchar('\n'); free(rseq); } + +uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +{ + uint32_t *cigar = 0; + uint8_t tmp, *rseq; + int i, w; + int64_t rlen; + *n_cigar = 0; + if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range + if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + for (i = 0; i < rlen>>1; ++i) + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], query[rlen - 1 - i] = tmp; + } + // set the band-width + w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); + w = w < 1? w : 1; + w = w < opt->w? 
w : opt->w; + w += abs(rlen - l_query); + // NW alignment + *score = ksw_global(l_query, query, rlen, rseq, 5, opt->mat, opt->q, opt->r, w, n_cigar, &cigar); + if (rb >= l_pac) // reverse back query + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + +ret_gen_cigar: + free(rseq); + return cigar; +} diff --git a/bwamem.h b/bwamem.h index adf57dd..74eb70a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,8 +31,7 @@ typedef struct { typedef struct { int64_t pos, rb, re; - int n_cigar, len, score, qb, qe, is_all; - uint32_t *cigar; + int len, score, qb, qe, is_all; } mem_aln_t; #ifdef __cplusplus diff --git a/fastmap.c b/fastmap.c index 811149f..f02224a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -66,7 +66,6 @@ int main_mem(int argc, char *argv[]) printf("\t%d,%d,%s:%c%ld", p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); } putchar('\n'); - free(a.cigar); } puts("//"); for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); From 797a8c147e266458d1ea8c1790ea4e1b5666ffc3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 21:58:33 -0500 Subject: [PATCH 037/169] sorting chains while filtering chains --- bwamem.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 54355e3..69f9e81 100644 --- a/bwamem.c +++ b/bwamem.c @@ -220,7 +220,7 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) for (i = 0; i < chn->n; ++i) { mem_chain1_t *c = &chn->chains[i]; int w = 0; - for (j = 0; j < c->n; ++j) w += c->seeds[j].len; + for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; a[i].w = w; @@ -228,6 +228,16 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) a[i].p2 = 0; } ks_introsort(mem_flt, chn->n, a); + { // reorder chains such that the best chain appears 
first + mem_chain1_t *swap; + swap = malloc(sizeof(mem_chain1_t) * chn->n); + for (i = 0; i < chn->n; ++i) { + swap[i] = *((mem_chain1_t*)a[i].p); + a[i].p = &chn->chains[i]; // as we will memcpy() below, a[i].p is changed + } + memcpy(chn->chains, swap, sizeof(mem_chain1_t) * chn->n); + free(swap); + } for (i = 1, n = 1; i < chn->n; ++i) { for (j = 0; j < n; ++j) { int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; From e65b2096f7eb2ce275202507286cd5af988510e5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 12:25:49 -0500 Subject: [PATCH 038/169] removed useless members --- bwamem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.h b/bwamem.h index 74eb70a..6d8049b 100644 --- a/bwamem.h +++ b/bwamem.h @@ -30,8 +30,8 @@ typedef struct { } mem_chain_t; typedef struct { - int64_t pos, rb, re; - int len, score, qb, qe, is_all; + int64_t rb, re; + int score, qb, qe, is_all; } mem_aln_t; #ifdef __cplusplus From a9292d674d5dcbdccd560f48dc3c55a1e99342d1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 13:59:32 -0500 Subject: [PATCH 039/169] a bit code cleanup --- bwamem.c | 51 +++++++++++++++++++++++---------------------------- bwamem.h | 9 +++++---- fastmap.c | 4 ++-- 3 files changed, 30 insertions(+), 34 deletions(-) diff --git a/bwamem.c b/bwamem.c index 69f9e81..c5c6366 100644 --- a/bwamem.c +++ b/bwamem.c @@ -211,34 +211,32 @@ typedef struct { #define flt_lt(a, b) ((a).w > (b).w) KSORT_INIT(mem_flt, flt_aux_t, flt_lt) -void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) { flt_aux_t *a; int i, j, n; - if (chn->n <= 1) return; // no need to filter - a = malloc(sizeof(flt_aux_t) * chn->n); - for (i = 0; i < chn->n; ++i) { - mem_chain1_t *c = &chn->chains[i]; + if (n_chn <= 1) return n_chn; // no need to filter + a = malloc(sizeof(flt_aux_t) * n_chn); + for (i = 0; i < n_chn; ++i) { + mem_chain1_t *c = &chains[i]; int w = 0; for (j = 
0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; - a[i].w = w; - a[i].p = c; - a[i].p2 = 0; + a[i].w = w; a[i].p = c; a[i].p2 = 0; } - ks_introsort(mem_flt, chn->n, a); + ks_introsort(mem_flt, n_chn, a); { // reorder chains such that the best chain appears first mem_chain1_t *swap; - swap = malloc(sizeof(mem_chain1_t) * chn->n); - for (i = 0; i < chn->n; ++i) { + swap = malloc(sizeof(mem_chain1_t) * n_chn); + for (i = 0; i < n_chn; ++i) { swap[i] = *((mem_chain1_t*)a[i].p); - a[i].p = &chn->chains[i]; // as we will memcpy() below, a[i].p is changed + a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed } - memcpy(chn->chains, swap, sizeof(mem_chain1_t) * chn->n); + memcpy(chains, swap, sizeof(mem_chain1_t) * n_chn); free(swap); } - for (i = 1, n = 1; i < chn->n; ++i) { + for (i = 1, n = 1; i < n_chn; ++i) { for (j = 0; j < n; ++j) { int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; int e_min = a[j].end < a[i].end? a[j].end : a[i].end; @@ -260,20 +258,20 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) if (c && c->n > 0) c->n = -c->n; } free(a); - for (i = 0; i < chn->n; ++i) { // free discarded chains - mem_chain1_t *c = &chn->chains[i]; + for (i = 0; i < n_chn; ++i) { // free discarded chains + mem_chain1_t *c = &chains[i]; if (c->n >= 0) { free(c->seeds); c->n = c->m = 0; } else c->n = -c->n; } - for (i = n = 0; i < chn->n; ++i) { // squeeze out discarded chains - if (chn->chains[i].n > 0) { - if (n != i) chn->chains[n++] = chn->chains[i]; + for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains + if (chains[i].n > 0) { + if (n != i) chains[n++] = chains[i]; else ++n; } } - chn->n = n; + return n; } /**************************************** @@ -286,14 +284,14 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l > 1? 
l : 1; } -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a) +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; - memset(a, 0, sizeof(mem_aln_t)); + memset(a, 0, sizeof(mem_alnreg_t)); // get the start and end of the seeded region rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; // get the max possible span @@ -347,17 +345,14 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); } else a->qe = l_query, a->re = s->rbeg + s->len; - + /* a->is_all = 1; if (c->n > 1) { // check if all the seeds have been included s = &c->seeds[c->n - 1]; if (s->qbeg + s->len > a->qe) a->is_all = 0; } - - //printf("[Q] "); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, a->qb, a->qe, a->rb, a->re, a->is_all); - putchar('\n'); + */ + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); free(rseq); } diff --git a/bwamem.h b/bwamem.h index 6d8049b..ef951c3 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,8 +31,8 @@ typedef struct { typedef struct { int64_t rb, re; - int score, qb, qe, is_all; -} mem_aln_t; + int score, qb, qe; +} mem_alnreg_t; #ifdef __cplusplus extern "C" { @@ -47,8 +47,9 @@ mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t 
*bwt, int len, const uint8_t *seq); -void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn); -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a); +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains); +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a); +uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index f02224a..c5667e7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -51,10 +51,10 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); - mem_chain_flt(opt, &chain); + chain.n = mem_chain_flt(opt, chain.n, chain.chains); for (i = 0; i < chain.n; ++i) { mem_chain1_t *p = &chain.chains[i]; - mem_aln_t a; + mem_alnreg_t a; mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p, &a); printf("%d\t%d", i, p->n); for (j = 0; j < p->n; ++j) { From 5a0b32bfd24e89f70460c85bdb03d7dd1d87045a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 14:38:40 -0500 Subject: [PATCH 040/169] updated to the latest kseq.h --- Makefile | 2 +- bntseq.c | 2 +- bseq.c | 4 ++ bseq.h | 15 ++++++ bwamem.c | 30 +++++++++++ bwamem.h | 2 +- bwaseqio.c | 2 +- bwtsw2_aux.c | 2 +- fastmap.c | 2 +- kseq.h | 137 ++++++++++++++++++++++++++++++--------------------- simple_dp.c | 2 +- 11 files changed, 138 insertions(+), 62 deletions(-) create mode 100644 bseq.c create mode 100644 bseq.h diff --git a/Makefile b/Makefile index 04fd7a0..46e0b80 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 
#-D_FILE_OFFSET_BITS=64 LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o stdaln.o \ - bwaseqio.o bwase.o kstring.o + bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ bwape.o cs2nt.o \ diff --git a/bntseq.c b/bntseq.c index 18abb2b..06d82a0 100644 --- a/bntseq.c +++ b/bntseq.c @@ -35,7 +35,7 @@ #include "utils.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, diff --git a/bseq.c b/bseq.c new file mode 100644 index 0000000..0ec57fa --- /dev/null +++ b/bseq.c @@ -0,0 +1,4 @@ +#include +#include "bseq.h" +#include "kseq.h" +KSEQ_INIT2(, gzFile, gzread) diff --git a/bseq.h b/bseq.h new file mode 100644 index 0000000..73afb63 --- /dev/null +++ b/bseq.h @@ -0,0 +1,15 @@ +#ifndef BATCHSEQ_H_ +#define BATCHSEQ_H_ + +typedef struct { + char *name, *comment, *seq, *qual; +} bseq1_t; + +typedef struct { + int n, m; + bseq1_t *seqs; +} bseq_t; + +int bseq_read(int chunk_size, bseq_t *bs); + +#endif diff --git a/bwamem.c b/bwamem.c index c5c6366..6d08bb8 100644 --- a/bwamem.c +++ b/bwamem.c @@ -274,6 +274,32 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) return n; } +#define alnreg_lt(a, b) ((a).score > (b).score) +KSORT_INIT(mem_ar, mem_alnreg_t, alnreg_lt) + +int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) +{ // similar to the loop in mem_chain_flt() + int i, j, m; + if (n <= 1) return n; + ks_introsort(mem_ar, n, a); + for (i = 0; i < n; ++i) a[i].sub = 0; + for (i = 1, m = 1; i < n; ++i) { + for (j = 0; j < m; ++j) { + int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; + int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe; + if (e_min > b_max) { // have overlap + int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? 
a[i].qe - a[i].qb : a[j].qe - a[j].qb; + if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap + if (a[j].sub == 0) a[j].sub = a[i].score; + break; + } + } + } + if (j == m) a[m++] = a[i]; + } + return m; +} + /**************************************** * Construct the alignment from a chain * ****************************************/ @@ -388,3 +414,7 @@ ret_gen_cigar: free(rseq); return cigar; } + +/**************** + * Sequence I/O * + ****************/ diff --git a/bwamem.h b/bwamem.h index ef951c3..d69f15a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,7 +31,7 @@ typedef struct { typedef struct { int64_t rb, re; - int score, qb, qe; + int score, qb, qe, sub; } mem_alnreg_t; #ifdef __cplusplus diff --git a/bwaseqio.c b/bwaseqio.c index e22d4cd..c1e9f97 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -5,7 +5,7 @@ #include "bamlite.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 5e8161c..619930b 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -15,7 +15,7 @@ #include "kstring.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) #include "ksort.h" #define __left_lt(a, b) ((a).end > (b).end) diff --git a/fastmap.c b/fastmap.c index c5667e7..475667b 100644 --- a/fastmap.c +++ b/fastmap.c @@ -7,7 +7,7 @@ #include "bwamem.h" #include "kvec.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; diff --git a/kseq.h b/kseq.h index ad8937c..a5cec7c 100644 --- a/kseq.h +++ b/kseq.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, by Heng Li + Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,6 +23,8 @@ SOFTWARE. 
*/ +/* Last Modified: 05MAR2012 */ + #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -30,9 +32,14 @@ #include #include +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ - char *buf; \ + unsigned char *buf; \ int begin, end, is_eof; \ type_t f; \ } kstream_t; @@ -45,7 +52,7 @@ { \ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ ks->f = f; \ - ks->buf = (char*)malloc(__bufsize); \ + ks->buf = (unsigned char*)malloc(__bufsize); \ return ks; \ } \ static inline void ks_destroy(kstream_t *ks) \ @@ -82,10 +89,10 @@ typedef struct __kstring_t { #endif #define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ - str->l = 0; \ + str->l = append? str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ @@ -97,14 +104,20 @@ typedef struct __kstring_t { if (ks->end == 0) break; \ } else break; \ } \ - if (delimiter) { \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ - } else { \ + } else if (delimiter == KS_SEP_SPACE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i])) break; \ - } \ - if (str->m - str->l < i - ks->begin + 1) { \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! 
*/ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ str->s = (char*)realloc(str->s, str->m); \ @@ -117,9 +130,15 @@ typedef struct __kstring_t { break; \ } \ } \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ return str->l; \ - } + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ @@ -127,19 +146,16 @@ typedef struct __kstring_t { __KS_GETC(__read, __bufsize) \ __KS_GETUNTIL(__read, __bufsize) -#define __KSEQ_BASIC(type_t) \ - static inline kseq_t *kseq_init(type_t fd) \ +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ { \ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ - static inline void kseq_rewind(kseq_t *ks) \ - { \ - ks->last_char = 0; \ - ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ - } \ - static inline void kseq_destroy(kseq_t *ks) \ + SCOPE void kseq_destroy(kseq_t *ks) \ { \ if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ @@ -152,44 +168,46 @@ typedef struct __kstring_t { -1 end-of-file -2 truncated quality string */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char 
has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ 
\ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* we should not stop here */ \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ - return seq->seq.l; \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ @@ -199,10 +217,19 @@ typedef struct __kstring_t { kstream_t *f; \ } kseq_t; -#define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(type_t) \ - __KSEQ_READ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); #endif diff --git a/simple_dp.c b/simple_dp.c index 7c078c2..d2b4b71 100644 --- a/simple_dp.c +++ b/simple_dp.c @@ -8,7 +8,7 @@ #include "utils.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) 
+KSEQ_DECLARE(gzFile) typedef struct { int l; From 901d28d5f54c6e58a966c233b471c315df453580 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 15:03:09 -0500 Subject: [PATCH 041/169] code backup --- bseq.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ bseq.h | 8 ++------ 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/bseq.c b/bseq.c index 0ec57fa..0889851 100644 --- a/bseq.c +++ b/bseq.c @@ -1,4 +1,55 @@ #include +#include +#include +#include #include "bseq.h" #include "kseq.h" KSEQ_INIT2(, gzFile, gzread) + +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(s->comment) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? strdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? 
m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n++]); + size += seqs[n++].l_seq; + } + if (size >= chunk_size) break; + } + *n_ = n; + if (size < chunk_size) { // test if the 2nd file is finished + if (kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + return seqs; +} diff --git a/bseq.h b/bseq.h index 73afb63..b54a268 100644 --- a/bseq.h +++ b/bseq.h @@ -2,14 +2,10 @@ #define BATCHSEQ_H_ typedef struct { + int l_seq; char *name, *comment, *seq, *qual; } bseq1_t; -typedef struct { - int n, m; - bseq1_t *seqs; -} bseq_t; - -int bseq_read(int chunk_size, bseq_t *bs); +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); #endif From a09db6903736bd42933847277c1a734190b0e3ab Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 17:12:27 -0500 Subject: [PATCH 042/169] In bwtsw, replace the batch seq-reader with bseq --- bseq.c | 2 +- bwtsw2_aux.c | 54 ++++++++++++++++++---------------------------------- 2 files changed, 19 insertions(+), 37 deletions(-) diff --git a/bseq.c b/bseq.c index 0889851..54a25f6 100644 --- a/bseq.c +++ b/bseq.c @@ -48,7 +48,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) } *n_ = n; if (size < chunk_size) { // test if the 2nd file is finished - if (kseq_read(ks2) >= 0) + if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } return seqs; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 619930b..a18ffc8 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -13,6 +13,7 @@ #include "bwtsw2.h" #include "stdaln.h" #include "kstring.h" +#include "bseq.h" #include "kseq.h" KSEQ_DECLARE(gzFile) @@ -756,24 +757,14 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * _seq->n = 0; } -static void kseq_to_bsw2seq(const kseq_t *ks, 
bsw2seq1_t *p) -{ - p->tid = -1; - p->l = ks->seq.l; - p->name = strdup(ks->name.s); - p->seq = strdup(ks->seq.s); - p->qual = ks->qual.l? strdup(ks->qual.s) : 0; - p->comment = ks->comment.l? strdup(ks->comment.s) : 0; - p->sam = 0; -} - void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) { gzFile fp, fp2; kseq_t *ks, *ks2; - int l, size = 0, is_pe = 0; + int l, is_pe = 0, i, n; uint8_t *pac; bsw2seq_t *_seq; + bseq1_t *bseq; pac = calloc(bns->l_pac/4+1, 1); if (pac == 0) { @@ -791,34 +782,25 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c ks2 = kseq_init(fp2); is_pe = 1; } else fp2 = 0, ks2 = 0, is_pe = 0; - while (kseq_read(ks) >= 0) { - if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/') - ks->name.l -= 2, ks->name.s[ks->name.l] = 0; - if (_seq->n == _seq->max) { - _seq->max = _seq->max? _seq->max<<1 : 1024; + while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + int size = 0; + if (n > _seq->max) { + _seq->max = n; + kroundup32(_seq->max); _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); } - kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]); - size += ks->seq.l; - if (ks2) { - if (kseq_read(ks2) >= 0) { - if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/') - ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0; - kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge - size += ks->seq.l; - } else { - fprintf(stderr, "[%s] The second query file has fewer reads. 
Switched to the single-end mode for the following batches.\n", __func__); - is_pe = 0; - } - } - if (size > opt->chunk_size * opt->n_threads) { - fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target, is_pe); - size = 0; + _seq->n = n; + for (i = 0; i < n; ++i) { + bseq1_t *b = &bseq[i]; + bsw2seq1_t *p = &_seq->seq[i]; + p->tid = -1; p->l = b->l_seq; + p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0; + size += p->l; } + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); + free(bseq); + process_seqs(_seq, opt, bns, pac, target, is_pe); } - fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target, is_pe); // free free(pac); free(_seq->seq); free(_seq); From 5dc398cdef5324a0e9535dcdd59a602007067134 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 13:13:43 -0500 Subject: [PATCH 043/169] start to write CLI --- bwamem.c | 55 ++++++++++++++++++++++++++++++++----------------------- bwamem.h | 23 ++++++++++++++--------- fastmap.c | 35 ++++++++++++++++++++++++++++------- 3 files changed, 74 insertions(+), 39 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6d08bb8..69c085d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -29,6 +29,10 @@ mem_opt_t *mem_opt_init() o->max_chain_gap = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; + o->chunk_size = 10000000; + o->n_threads = 1; + o->pe_dir = 0<<1|1; + o->is_pe = 0; mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -119,9 +123,9 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) #include "kbtree.h" #define chain_cmp(a, b) ((a).pos - (b).pos) -KBTREE_INIT(chn, mem_chain1_t, chain_cmp) +KBTREE_INIT(chn, mem_chain_t, chain_cmp) -static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_t *p) +static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p) { 
int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; @@ -153,7 +157,7 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i int64_t k; if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive for (k = 0; k < p->x[2]; ++k) { - mem_chain1_t tmp, *lower, *upper; + mem_chain_t tmp, *lower, *upper; mem_seed_t s; int to_add = 0; s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference @@ -174,24 +178,23 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i } } -mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) { - mem_chain_t chain; + mem_chain_v chain; smem_i *itr; kbtree_t(chn) *tree; - memset(&chain, 0, sizeof(mem_chain_t)); + kv_init(chain); if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); smem_set_query(itr, len, seq); mem_insert_seed(opt, tree, itr); - chain.m = kb_size(tree); chain.n = 0; - chain.chains = malloc(chain.m * sizeof(mem_chain1_t)); + kv_resize(mem_chain_t, chain, kb_size(tree)); - #define traverse_func(p_) (chain.chains[chain.n++] = *(p_)) - __kb_traverse(mem_chain1_t, tree, traverse_func); + #define traverse_func(p_) (chain.a[chain.n++] = *(p_)) + __kb_traverse(mem_chain_t, tree, traverse_func); #undef traverse_func smem_itr_destroy(itr); @@ -211,14 +214,14 @@ typedef struct { #define flt_lt(a, b) ((a).w > (b).w) KSORT_INIT(mem_flt, flt_aux_t, flt_lt) -int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) { flt_aux_t *a; int i, j, n; if (n_chn <= 1) return n_chn; // no need to filter a = malloc(sizeof(flt_aux_t) * n_chn); for (i = 0; i 
< n_chn; ++i) { - mem_chain1_t *c = &chains[i]; + mem_chain_t *c = &chains[i]; int w = 0; for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps a[i].beg = c->seeds[0].qbeg; @@ -227,13 +230,13 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) } ks_introsort(mem_flt, n_chn, a); { // reorder chains such that the best chain appears first - mem_chain1_t *swap; - swap = malloc(sizeof(mem_chain1_t) * n_chn); + mem_chain_t *swap; + swap = malloc(sizeof(mem_chain_t) * n_chn); for (i = 0; i < n_chn; ++i) { - swap[i] = *((mem_chain1_t*)a[i].p); + swap[i] = *((mem_chain_t*)a[i].p); a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed } - memcpy(chains, swap, sizeof(mem_chain1_t) * n_chn); + memcpy(chains, swap, sizeof(mem_chain_t) * n_chn); free(swap); } for (i = 1, n = 1; i < n_chn; ++i) { @@ -252,14 +255,14 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. } for (i = 0; i < n; ++i) { // mark chains to be kept - mem_chain1_t *c = (mem_chain1_t*)a[i].p; + mem_chain_t *c = (mem_chain_t*)a[i].p; if (c->n > 0) c->n = -c->n; - c = (mem_chain1_t*)a[i].p2; + c = (mem_chain_t*)a[i].p2; if (c && c->n > 0) c->n = -c->n; } free(a); for (i = 0; i < n_chn; ++i) { // free discarded chains - mem_chain1_t *c = &chains[i]; + mem_chain_t *c = &chains[i]; if (c->n >= 0) { free(c->seeds); c->n = c->m = 0; @@ -310,7 +313,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l > 1? l : 1; } -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a) +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. 
When that happens, we should use a SW or extend more seeds int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; @@ -415,6 +418,12 @@ ret_gen_cigar: return cigar; } -/**************** - * Sequence I/O * - ****************/ +static void process_seq1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) +{ +} + +int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) +{ + int i; + return 0; +} diff --git a/bwamem.h b/bwamem.h index d69f15a..7d921fd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -2,6 +2,9 @@ #define BWAMEM_H_ #include "bwt.h" +#include "bntseq.h" +#include "bseq.h" +#include "kvec.h" struct __smem_i; typedef struct __smem_i smem_i; @@ -14,19 +17,16 @@ typedef struct { typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; - int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset + int n_threads, chunk_size; + int pe_dir, is_pe; float mask_level, chain_drop_ratio; + int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; typedef struct { int n, m; int64_t pos; mem_seed_t *seeds; -} mem_chain1_t; - -typedef struct { - int n, m; - mem_chain1_t *chains; } mem_chain_t; typedef struct { @@ -34,6 +34,9 @@ typedef struct { int score, qb, qe, sub; } mem_alnreg_t; +typedef kvec_t(mem_chain_t) mem_chain_v; +typedef kvec_t(mem_alnreg_t) mem_alnreg_v; + #ifdef __cplusplus extern "C" { #endif @@ -46,11 +49,13 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len); mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); -mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); -int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains); -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a); +mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, 
const uint8_t *seq); +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains); +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a); uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); +int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); + #ifdef __cplusplus } #endif diff --git a/fastmap.c b/fastmap.c index 475667b..32f8db0 100644 --- a/fastmap.c +++ b/fastmap.c @@ -6,6 +6,7 @@ #include "bwt.h" #include "bwamem.h" #include "kvec.h" +#include "bseq.h" #include "kseq.h" KSEQ_DECLARE(gzFile) @@ -16,10 +17,11 @@ int main_mem(int argc, char *argv[]) mem_opt_t *opt; bwt_t *bwt; bntseq_t *bns; - int i, j, c; - gzFile fp; - kseq_t *seq; + int i, c, n; + gzFile fp, fp2 = 0; + kseq_t *ks, *ks2 = 0; uint8_t *pac = 0; + bseq1_t *seqs; opt = mem_opt_init(); while ((c = getopt(argc, argv, "")) >= 0) { @@ -32,8 +34,6 @@ int main_mem(int argc, char *argv[]) return 1; } mem_fill_scmat(opt->a, opt->b, opt->mat); - fp = gzopen(argv[optind + 1], "r"); - seq = kseq_init(fp); { // load the packed sequences, BWT and SA char *tmp = calloc(strlen(argv[optind]) + 5, 1); strcat(strcpy(tmp, argv[optind]), ".bwt"); @@ -45,6 +45,22 @@ int main_mem(int argc, char *argv[]) pac = calloc(bns->l_pac/4+1, 1); fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); } + + fp = strcmp(argv[optind + 1], "-")? 
gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); + ks = kseq_init(fp); + if (optind + 2 < argc) { + fp2 = gzopen(argv[optind + 2], "r"); + ks2 = kseq_init(fp); + opt->is_pe = 1; + } + while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + mem_process_seqs(opt, bwt, bns, pac, n, seqs); + for (i = 0; i < n; ++i) { + free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); + } + free(seqs); + } + /* while (kseq_read(seq) >= 0) { mem_chain_t chain; printf(">%s\n", seq->name.s); @@ -71,12 +87,17 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); free(chain.chains); } + */ - free(pac); free(opt); + free(opt); free(pac); bns_destroy(bns); bwt_destroy(bwt); - kseq_destroy(seq); + kseq_destroy(ks); gzclose(fp); + if (ks2) { + kseq_destroy(ks2); + gzclose(fp2); + } return 0; } From bfeb37c4dedd344a81da30af7bad0a37c033fe81 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 13:29:01 -0500 Subject: [PATCH 044/169] code backup --- bwamem.c | 44 +++++++++++++++++++++++++++++++++++++++++++- bwamem.h | 2 +- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 69c085d..51bcd3c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -2,8 +2,10 @@ #include #include #include +#ifdef HAVE_PTHREAD +#include +#endif #include "bwamem.h" -#include "kvec.h" #include "bntseq.h" #include "ksw.h" #include "ksort.h" @@ -422,8 +424,48 @@ static void process_seq1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t { } +typedef struct { + int start, step, n; + const mem_opt_t *opt; + const bwt_t *bwt; + const bntseq_t *bns; + const uint8_t *pac; + bseq1_t *seqs; +} worker1_t; + +static void *worker1(void *data) +{ + worker1_t *w = (worker1_t*)data; + int i; + for (i = w->start; i < w->n; i += w->step) + process_seq1(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + return 0; +} + int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t 
*bns, const uint8_t *pac, int n, bseq1_t *seqs) { int i; + worker1_t *w1; + w1 = calloc(opt->n_threads, sizeof(worker1_t)); + for (i = 0; i < opt->n_threads; ++i) { + worker1_t *w = &w1[i]; + w->start = i; w->step = opt->n_threads; w->n = n; + w->opt = opt; w->bwt = bwt; w->bns = bns; w->pac = pac; + w->seqs = seqs; + } +#ifdef HAVE_PTHREAD + if (opt->n_threads == 1) { + worker1(w1); + } else { + pthread_t *tid; + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w1[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + free(tid); + } +#else + worker1(w1); +#endif + free(w1); return 0; } diff --git a/bwamem.h b/bwamem.h index 7d921fd..7215ad3 100644 --- a/bwamem.h +++ b/bwamem.h @@ -34,7 +34,7 @@ typedef struct { int score, qb, qe, sub; } mem_alnreg_t; -typedef kvec_t(mem_chain_t) mem_chain_v; +typedef kvec_t(mem_chain_t) mem_chain_v; typedef kvec_t(mem_alnreg_t) mem_alnreg_v; #ifdef __cplusplus From 1fd51fc3f7ac5887e27f6f2be356bfd295729bcb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 14:36:18 -0500 Subject: [PATCH 045/169] code backup --- bseq.h | 2 +- bwamem.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++------- fastmap.c | 5 +--- kstring.h | 45 ++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 19 deletions(-) diff --git a/bseq.h b/bseq.h index b54a268..978312a 100644 --- a/bseq.h +++ b/bseq.h @@ -3,7 +3,7 @@ typedef struct { int l_seq; - char *name, *comment, *seq, *qual; + char *name, *comment, *seq, *qual, *sam; } bseq1_t; bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); diff --git a/bwamem.c b/bwamem.c index 51bcd3c..6b8d365 100644 --- a/bwamem.c +++ b/bwamem.c @@ -5,6 +5,7 @@ #ifdef HAVE_PTHREAD #include #endif +#include "kstring.h" #include "bwamem.h" #include "bntseq.h" #include "ksw.h" @@ -420,8 +421,51 @@ ret_gen_cigar: return cigar; } -static void process_seq1(const mem_opt_t *opt, const bwt_t 
*bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) +/************************ + * Integrated interface * + ************************/ + +void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { + int k, n_cigar = 0, score, is_rev, nn, rid, i; + uint32_t *cigar = 0; + int64_t pos; + kstring_t str; + mem_alnreg_t *p; + + str.l = str.m = 0; str.s = 0; + k = mem_choose_alnreg_se(opt, a->n, a->a); + p = &a->a[k]; + cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + pos = bns_depos(bns, p->rb, &is_rev); + nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); + kputs(s->name, &str); kputc('\t', &str); kputw(is_rev? 16 : 0, &str); kputc('\t', &str); + kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset, &str); kputc('\t', &str); + kputw(0, &str); kputc('\t', &str); + for (i = 0; i < s->l_seq; ++i) s->seq[i] = "ACGTN"[(int)s->seq[i]]; + kputsn(s->seq, s->l_seq, &str); kputc('\t', &str); + if (s->qual) kputsn(s->qual, s->l_seq, &str); + free(cigar); + s->sam = str.s; +} + +static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) +{ + int i; + mem_chain_v chn; + mem_alnreg_v regs; + for (i = 0; i < s->l_seq; ++i) + s->seq[i] = nst_nt4_table[(int)s->seq[i]]; + chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); + chn.n = mem_chain_flt(opt, chn.n, chn.a); + regs.n = regs.m = chn.n; + regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); + for (i = 0; i < chn.n; ++i) { + mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], ®s.a[i]); + free(chn.a[i].seeds); + } + free(chn.a); + return regs; } typedef struct { @@ -431,41 +475,65 @@ typedef struct { const bntseq_t *bns; const uint8_t *pac; bseq1_t *seqs; -} worker1_t; + mem_alnreg_v *regs; +} worker_t; static void *worker1(void *data) { - worker1_t *w = (worker1_t*)data; + worker_t *w = 
(worker_t*)data; int i; for (i = w->start; i < w->n; i += w->step) - process_seq1(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + return 0; +} + +static void *worker2(void *data) +{ + worker_t *w = (worker_t*)data; + int i; + if (!w->opt->is_pe) { + for (i = 0; i < w->n; i += w->step) { + mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); + free(w->regs[i].a); + } + } else { + for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet + free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); + } + } return 0; } int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) { int i; - worker1_t *w1; - w1 = calloc(opt->n_threads, sizeof(worker1_t)); + worker_t *w; + w = calloc(opt->n_threads, sizeof(worker_t)); for (i = 0; i < opt->n_threads; ++i) { - worker1_t *w = &w1[i]; + worker_t *w = &w[i]; w->start = i; w->step = opt->n_threads; w->n = n; w->opt = opt; w->bwt = bwt; w->bns = bns; w->pac = pac; w->seqs = seqs; } #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { - worker1(w1); + worker1(w); worker2(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); - for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w1[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } #else - worker1(w1); + worker1(w); worker2(w); #endif - free(w1); + for (i = 0; i < n; ++i) { + puts(seqs[i].sam); + free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); + } + free(w); return 0; } diff --git a/fastmap.c b/fastmap.c index 32f8db0..812c2db 100644 --- a/fastmap.c +++ b/fastmap.c @@ -17,7 +17,7 @@ 
int main_mem(int argc, char *argv[]) mem_opt_t *opt; bwt_t *bwt; bntseq_t *bns; - int i, c, n; + int c, n; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; uint8_t *pac = 0; @@ -55,9 +55,6 @@ int main_mem(int argc, char *argv[]) } while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { mem_process_seqs(opt, bwt, bns, pac, n, seqs); - for (i = 0; i < n; ++i) { - free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); - } free(seqs); } /* diff --git a/kstring.h b/kstring.h index 398901f..cf14e39 100644 --- a/kstring.h +++ b/kstring.h @@ -16,19 +16,24 @@ typedef struct __kstring_t { } kstring_t; #endif -static inline int kputs(const char *p, kstring_t *s) +static inline int kputsn(const char *p, int l, kstring_t *s) { - int l = strlen(p); if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } - strcpy(s->s + s->l, p); + memcpy(s->s + s->l, p, l); s->l += l; + s->s[s->l] = 0; return l; } +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + static inline int kputc(int c, kstring_t *s) { if (s->l + 1 >= s->m) { @@ -41,6 +46,40 @@ static inline int kputc(int c, kstring_t *s) return c; } +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + int ksprintf(kstring_t *s, const char *fmt, ...); #endif From 49f2bcc01570d2744d1b7c4387fc495698b20fcf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 14:57:22 -0500 Subject: [PATCH 046/169] CIGAR is wrong, but the rest is okay --- bwamem.c | 58 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6b8d365..e3f09c1 100644 --- a/bwamem.c +++ b/bwamem.c @@ -427,25 +427,37 @@ ret_gen_cigar: void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { - int k, n_cigar = 0, score, is_rev, nn, rid, i; - uint32_t *cigar = 0; - int64_t pos; + int k, m; kstring_t str; - mem_alnreg_t *p; str.l = str.m = 0; str.s = 0; - k = mem_choose_alnreg_se(opt, a->n, a->a); - p = &a->a[k]; - cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); - pos = bns_depos(bns, p->rb, &is_rev); - nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - kputs(s->name, &str); kputc('\t', &str); kputw(is_rev? 
16 : 0, &str); kputc('\t', &str); - kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset, &str); kputc('\t', &str); - kputw(0, &str); kputc('\t', &str); - for (i = 0; i < s->l_seq; ++i) s->seq[i] = "ACGTN"[(int)s->seq[i]]; - kputsn(s->seq, s->l_seq, &str); kputc('\t', &str); - if (s->qual) kputsn(s->qual, s->l_seq, &str); - free(cigar); + m = mem_choose_alnreg_se(opt, a->n, a->a); + for (k = 0; k < m; ++k) { + uint32_t *cigar = 0; + int score, is_rev, nn, rid, i, flag = 0, n_cigar = 0; + int64_t pos, end; + mem_alnreg_t *p = &a->a[k]; + cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); + nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); + flag |= is_rev? 16 : 0; + kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); + kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); + kputw(0, &str); kputc('\t', &str); + if (n_cigar) { + for (i = 0; i < n_cigar; ++i) { + kputw(cigar[i]>>4, &str); kputc("MIDSH"[cigar[i]&0xf], &str); + } + } else kputc('*', &str); + kputsn("\t*\t0\t0\t", 7, &str); + for (i = 0; i < s->l_seq; ++i) s->seq[i] = "ACGTN"[(int)s->seq[i]]; + kputsn(s->seq, s->l_seq, &str); kputc('\t', &str); + if (s->qual) kputsn(s->qual, s->l_seq, &str); + kputsn("\tAS:i:", 6, &str); kputw(score, &str); + kputsn("\tss:i:", 6, &str); kputw(p->sub, &str); + kputc('\n', &str); + free(cigar); + } s->sam = str.s; } @@ -508,12 +520,14 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns { int i; worker_t *w; + mem_alnreg_v *regs; w = calloc(opt->n_threads, sizeof(worker_t)); + regs = malloc(n * sizeof(mem_alnreg_v)); for (i = 0; i < opt->n_threads; ++i) { - worker_t *w = &w[i]; - w->start = i; w->step = opt->n_threads; w->n = n; - w->opt = opt; w->bwt = bwt; w->bns = bns; w->pac = 
pac; - w->seqs = seqs; + worker_t *p = &w[i]; + p->start = i; p->step = opt->n_threads; p->n = n; + p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac; + p->seqs = seqs; p->regs = regs; } #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { @@ -531,9 +545,9 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns worker1(w); worker2(w); #endif for (i = 0; i < n; ++i) { - puts(seqs[i].sam); + fputs(seqs[i].sam, stdout); free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); } - free(w); + free(regs); free(w); return 0; } From 27fdf6397db0568a72cbc598f9292d2f75fa93fb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 15:52:36 -0500 Subject: [PATCH 047/169] single-end working! no mapQ, though --- bwamem.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/bwamem.c b/bwamem.c index e3f09c1..0d864d3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -371,8 +371,6 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } } } - //printf("[Q] "); for (i = qe; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = re; i < rmax[1] - rmax[0]; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); @@ -384,8 +382,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg + s->len > a->qe) a->is_all = 0; } */ - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); - + //printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); free(rseq); } @@ -403,8 +400,10 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, for (i = 0; i < l_query>>1; ++i) tmp = query[i], 
query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; for (i = 0; i < rlen>>1; ++i) - tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], query[rlen - 1 - i] = tmp; + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; } + //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); // set the band-width w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); w = w < 1? w : 1; @@ -429,18 +428,21 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b { int k, m; kstring_t str; + char *seq; str.l = str.m = 0; str.s = 0; m = mem_choose_alnreg_se(opt, a->n, a->a); + seq = malloc(s->l_seq); for (k = 0; k < m; ++k) { uint32_t *cigar = 0; int score, is_rev, nn, rid, i, flag = 0, n_cigar = 0; - int64_t pos, end; + int64_t pos; mem_alnreg_t *p = &a->a[k]; cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); flag |= is_rev? 
16 : 0; + if (n_cigar == 0) flag |= 8; kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); kputw(0, &str); kputc('\t', &str); @@ -450,14 +452,17 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b } } else kputc('*', &str); kputsn("\t*\t0\t0\t", 7, &str); - for (i = 0; i < s->l_seq; ++i) s->seq[i] = "ACGTN"[(int)s->seq[i]]; - kputsn(s->seq, s->l_seq, &str); kputc('\t', &str); + if (is_rev) for (i = s->l_seq - 1; i >= 0; --i) seq[i] = "TGCAN"[(int)s->seq[i]]; + else for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; + kputsn(seq, s->l_seq, &str); kputc('\t', &str); if (s->qual) kputsn(s->qual, s->l_seq, &str); - kputsn("\tAS:i:", 6, &str); kputw(score, &str); + kputsn("\tAS:i:", 6, &str); kputw(p->score, &str); kputsn("\tss:i:", 6, &str); kputw(p->sub, &str); + kputsn("\tnw:i:", 6, &str); kputw(score, &str); kputc('\n', &str); free(cigar); } + free(seq); s->sam = str.s; } From ff3fea115cefc4b84909720ad89c24127893816c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 16:27:11 -0500 Subject: [PATCH 048/169] write soft clip; added debugging code --- bwamem.c | 30 ++++++++++++++++++++++++++++-- fastmap.c | 31 ++----------------------------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/bwamem.c b/bwamem.c index 0d864d3..f85d5e3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -181,6 +181,24 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i } } +void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) +{ + int i, j; + for (i = 0; i < chn->n; ++i) { + mem_chain_t *p = &chn->a[i]; + printf("%d", p->n); + for (j = 0; j < p->n; ++j) { + bwtint_t pos; + int is_rev, ref_id; + pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); + if (is_rev) pos -= p->seeds[j].len - 1; + bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); + 
printf("\t%d,%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + } + putchar('\n'); + } +} + mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) { mem_chain_v chain; @@ -318,7 +336,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - int i, j, qbeg; + int i, qbeg; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; @@ -357,8 +375,9 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int qle, tle, qe, re; int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; +#if 0 if (c->n > 1) { // generate $qw - int l = rmax[1] - (s->rbeg + s->len); + int j, l = rmax[1] - (s->rbeg + s->len); qw = malloc(l * 2); for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default for (i = 1; i < c->n; ++i) { @@ -371,6 +390,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } } } +#endif a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); @@ -447,9 +467,14 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); kputw(0, &str); kputc('\t', &str); if (n_cigar) { + int clip5, clip3; + clip5 = is_rev? s->l_seq - p->qe : p->qb; + clip3 = is_rev? 
p->qb : s->l_seq - p->qe; + if (clip5) { kputw(clip5, &str); kputc('S', &str); } for (i = 0; i < n_cigar; ++i) { kputw(cigar[i]>>4, &str); kputc("MIDSH"[cigar[i]&0xf], &str); } + if (clip3) { kputw(clip3, &str); kputc('S', &str); } } else kputc('*', &str); kputsn("\t*\t0\t0\t", 7, &str); if (is_rev) for (i = s->l_seq - 1; i >= 0; --i) seq[i] = "TGCAN"[(int)s->seq[i]]; @@ -475,6 +500,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); + //mem_print_chain(bns, &chn); regs.n = regs.m = chn.n; regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); for (i = 0; i < chn.n; ++i) { diff --git a/fastmap.c b/fastmap.c index 812c2db..b27b1df 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,7 +24,8 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "")) >= 0) { + while ((c = getopt(argc, argv, "k:")) >= 0) { + if (c == 'k') opt->min_seed_len = atoi(optarg); } if (optind + 1 >= argc) { fprintf(stderr, "\n"); @@ -57,34 +58,6 @@ int main_mem(int argc, char *argv[]) mem_process_seqs(opt, bwt, bns, pac, n, seqs); free(seqs); } - /* - while (kseq_read(seq) >= 0) { - mem_chain_t chain; - printf(">%s\n", seq->name.s); - for (i = 0; i < seq->seq.l; ++i) - seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); - chain.n = mem_chain_flt(opt, chain.n, chain.chains); - for (i = 0; i < chain.n; ++i) { - mem_chain1_t *p = &chain.chains[i]; - mem_alnreg_t a; - mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p, &a); - printf("%d\t%d", i, p->n); - for (j = 0; j < p->n; ++j) { - bwtint_t pos; - int is_rev, ref_id; - pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); - if (is_rev) pos -= p->seeds[j].len - 1; - bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); - printf("\t%d,%d,%s:%c%ld", 
p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); - } - putchar('\n'); - } - puts("//"); - for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); - free(chain.chains); - } - */ free(opt); free(pac); bns_destroy(bns); From 6ba11ab68cd42763e4fe2911210f5f04211b785d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 16:42:01 -0500 Subject: [PATCH 049/169] no effective changes --- bwamem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index f85d5e3..e3805d3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -375,7 +375,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int qle, tle, qe, re; int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; -#if 0 +#if 0 // FIXME: I am not sure if the following block works. Comment it out if SW extension gives unexpected result. if (c->n > 1) { // generate $qw int j, l = rmax[1] - (s->rbeg + s->len); qw = malloc(l * 2); @@ -385,10 +385,11 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (j = 0; j < t->len; ++j) { int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); if (x < 0) continue; // overlap with the first seed - if (qw[x] == -1) qw[x] = x > y? x - y : y - x; + if (qw[x] == -1) qw[x] = (x > y? 
x - y : y - x) + 1; // FIXME: in principle, we should not need +1 else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint } } +// for (i = 0; i < l; ++i) printf("%d:%d\t", i, qw[i]); putchar('\n'); } #endif a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); @@ -500,7 +501,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); - //mem_print_chain(bns, &chn); +// mem_print_chain(bns, &chn); regs.n = regs.m = chn.n; regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); for (i = 0; i < chn.n; ++i) { From 83a49f32100868c7ca91d17c08f9bf59e4c489e1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 17:15:45 -0500 Subject: [PATCH 050/169] compute mapQ; extend from the longest seed --- bwamem.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/bwamem.c b/bwamem.c index e3805d3..6b914b5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -2,6 +2,7 @@ #include #include #include +#include #ifdef HAVE_PTHREAD #include #endif @@ -11,6 +12,8 @@ #include "ksw.h" #include "ksort.h" +#define MAPQ_COEF 40. + void mem_fill_scmat(int a, int b, int8_t mat[25]) { int i, j, k; @@ -336,14 +339,12 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. 
When that happens, we should use a SW or extend more seeds - int i, qbeg; + int i, qbeg, max = 0, max_i = -1; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; memset(a, 0, sizeof(mem_alnreg_t)); - // get the start and end of the seeded region - rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -353,7 +354,10 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); rmax[0] = rmax[0] < b? rmax[0] : b; rmax[1] = rmax[1] > e? rmax[1] : e; + if (max < t->len) max = t->len, max_i = i; } + // get the start and end of the seeded region + rbeg = c->seeds[max_i].rbeg; qbeg = c->seeds[max_i].qbeg; // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); @@ -365,12 +369,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int tmp = rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + a->score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[max_i].len * opt->a, 0, &qle, &tle); a->qb = qbeg - qle; a->rb = rbeg - tle; free(qs); free(rs); - } else a->score = c->seeds[0].len * opt->a, a->qb = 0, a->rb = rbeg; + } else a->score = c->seeds[max_i].len * opt->a, a->qb = 0, a->rb = rbeg; - s = &c->seeds[0]; + s = &c->seeds[max_i]; if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; int16_t *qw = 0; @@ -456,7 +460,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b seq = malloc(s->l_seq); for (k = 0; k < m; ++k) { uint32_t *cigar = 0; - int score, is_rev, nn, rid, i, flag = 0, n_cigar = 0; + int score, is_rev, nn, rid, i, 
flag = 0, n_cigar = 0, mapq = 0; int64_t pos; mem_alnreg_t *p = &a->a[k]; cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); @@ -466,7 +470,9 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (n_cigar == 0) flag |= 8; kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); - kputw(0, &str); kputc('\t', &str); + mapq = p->score? (int)(MAPQ_COEF * (1. - (float)(p->sub? p->sub : opt->min_seed_len * opt->a) / p->score) * log(p->score / opt->a) + .499) : 0; + if (mapq > 60) mapq = 60; + kputw(mapq, &str); kputc('\t', &str); if (n_cigar) { int clip5, clip3; clip5 = is_rev? s->l_seq - p->qe : p->qb; From cd6bd524d48e83a338c8566c2343827a036188c5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 19:50:37 -0500 Subject: [PATCH 051/169] discard internal seeds shorter than half --- bwamem.c | 20 +++++++++++++------- fastmap.c | 5 ++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6b914b5..456edef 100644 --- a/bwamem.c +++ b/bwamem.c @@ -14,6 +14,8 @@ #define MAPQ_COEF 40. 
+int mem_debug = 0; + void mem_fill_scmat(int a, int b, int8_t mat[25]) { int i, j, k; @@ -30,8 +32,8 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; - o->min_seed_len = 17; - o->max_occ = 10; + o->min_seed_len = 19; + o->max_occ = 50; o->max_chain_gap = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; @@ -84,6 +86,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } + const bwtintv_v *smem_next(smem_i *itr, int split_len) { int i, max, max_i; @@ -110,13 +113,15 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) if (xi < xj) { kv_push(bwtintv_t, *a, itr->matches->a[i]); ++i; - } else { + } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) { kv_push(bwtintv_t, *a, itr->sub->a[j]); ++j; - } + } else ++j; } for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]); - for (; j < itr->sub->n; ++j) kv_push(bwtintv_t, *a, itr->sub->a[j]); + for (; j < itr->sub->n; ++j) + if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) + kv_push(bwtintv_t, *a, itr->sub->a[j]); kv_copy(bwtintv_t, *itr->matches, *a); } return itr->matches; @@ -407,7 +412,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg + s->len > a->qe) a->is_all = 0; } */ - //printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); + if (mem_debug >= 2) + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); free(rseq); } @@ -507,7 +513,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); -// mem_print_chain(bns, &chn); + if (mem_debug >= 1) mem_print_chain(bns, &chn); regs.n = regs.m = chn.n; regs.a = malloc(regs.n * 
sizeof(mem_alnreg_t)); for (i = 0; i < chn.n; ++i) { diff --git a/fastmap.c b/fastmap.c index b27b1df..0d2354a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -11,6 +11,7 @@ KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; +extern int mem_debug; int main_mem(int argc, char *argv[]) { @@ -24,8 +25,10 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:")) >= 0) { + while ((c = getopt(argc, argv, "k:c:D:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 'c') opt->max_occ = atoi(optarg); + else if (c == 'D') mem_debug = atoi(optarg); } if (optind + 1 >= argc) { fprintf(stderr, "\n"); From 45b0d3423aba34dfab8128cd250d7246a6e791fd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 20:07:31 -0500 Subject: [PATCH 052/169] bugfix: when no seed hits found --- bwamem.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 456edef..15a903c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -457,16 +457,24 @@ ret_gen_cigar: void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { - int k, m; + int i, k, m; kstring_t str; char *seq; str.l = str.m = 0; str.s = 0; m = mem_choose_alnreg_se(opt, a->n, a->a); seq = malloc(s->l_seq); + if (m == 0) { // no seeds found + for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; + kputs(s->name, &str); kputs("\t8\t*\t0\t0\t*\t*\t0\t0\t", &str); + kputsn(seq, s->l_seq, &str); + if (s->qual) kputsn(s->qual, s->l_seq, &str); + else kputc('*', &str); + kputc('\n', &str); + } for (k = 0; k < m; ++k) { uint32_t *cigar = 0; - int score, is_rev, nn, rid, i, flag = 0, n_cigar = 0, mapq = 0; + int score, is_rev, nn, rid, flag = 0, n_cigar = 0, mapq = 0; int64_t pos; mem_alnreg_t *p = &a->a[k]; cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); @@ -494,6 +502,7 @@ void 
mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b else for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; kputsn(seq, s->l_seq, &str); kputc('\t', &str); if (s->qual) kputsn(s->qual, s->l_seq, &str); + else kputc('*', &str); kputsn("\tAS:i:", 6, &str); kputw(p->score, &str); kputsn("\tss:i:", 6, &str); kputw(p->sub, &str); kputsn("\tnw:i:", 6, &str); kputw(score, &str); From d890c7997cc5ab075b123cec42c0e656420ceefd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 21:20:36 -0500 Subject: [PATCH 053/169] better treatment for micro-repeat --- bwamem.c | 100 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/bwamem.c b/bwamem.c index 15a903c..0e6f3a2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -344,12 +344,13 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - int i, qbeg, max = 0, max_i = -1; - int64_t rlen, rbeg, rmax[2], tmp; + int i, k; + int64_t rlen, rmax[2], tmp, max = 0, max_i = 0; const mem_seed_t *s; uint8_t *rseq = 0; + mem_alnreg_t best; - memset(a, 0, sizeof(mem_alnreg_t)); + memset(&best, 0, sizeof(mem_alnreg_t)); // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -359,61 +360,64 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); rmax[0] = rmax[0] < b? rmax[0] : b; rmax[1] = rmax[1] > e? 
rmax[1] : e; - if (max < t->len) max = t->len, max_i = i; + if (t->len > max) max = t->len, max_i = i; } - // get the start and end of the seeded region - rbeg = c->seeds[max_i].rbeg; qbeg = c->seeds[max_i].qbeg; // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); - if (qbeg) { // left extension of the first seed - uint8_t *rs, *qs; - int qle, tle; - qs = malloc(qbeg); - for (i = 0; i < qbeg; ++i) qs[i] = query[qbeg - 1 - i]; - tmp = rbeg - rmax[0]; - rs = malloc(tmp); - for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[max_i].len * opt->a, 0, &qle, &tle); - a->qb = qbeg - qle; a->rb = rbeg - tle; - free(qs); free(rs); - } else a->score = c->seeds[max_i].len * opt->a, a->qb = 0, a->rb = rbeg; + for (k = 0; k < c->n;) { + s = &c->seeds[k]; + memset(a, 0, sizeof(mem_alnreg_t)); + if (s->qbeg) { // left extension + uint8_t *rs, *qs; + int qle, tle; + qs = malloc(s->qbeg); + for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; + tmp = s->rbeg - rmax[0]; + rs = malloc(tmp); + for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, 0, &qle, &tle); + a->qb = s->qbeg - qle; a->rb = s->rbeg - tle; + free(qs); free(rs); + } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; - s = &c->seeds[max_i]; - if (s->qbeg + s->len != l_query) { // right extension of the first seed - int qle, tle, qe, re; - int16_t *qw = 0; - qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; + if (s->qbeg + s->len != l_query) { // right extension of the first seed + int qle, tle, qe, re; + int16_t *qw = 0; + qe = s->qbeg + s->len; + re = s->rbeg + s->len - rmax[0]; #if 0 // FIXME: I am not sure if the following block works. Comment it out if SW extension gives unexpected result. 
- if (c->n > 1) { // generate $qw - int j, l = rmax[1] - (s->rbeg + s->len); - qw = malloc(l * 2); - for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default - for (i = 1; i < c->n; ++i) { - const mem_seed_t *t = &c->seeds[i]; - for (j = 0; j < t->len; ++j) { - int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); - if (x < 0) continue; // overlap with the first seed - if (qw[x] == -1) qw[x] = (x > y? x - y : y - x) + 1; // FIXME: in principle, we should not need +1 - else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint + if (c->n > 1) { // generate $qw + int j, l = rmax[1] - (s->rbeg + s->len); + qw = malloc(l * 2); + for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default + for (i = 1; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + for (j = 0; j < t->len; ++j) { + int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); + if (x < 0) continue; // overlap with the first seed + if (qw[x] == -1) qw[x] = (x > y? 
x - y : y - x) + 1; // FIXME: in principle, we should not need +1 + else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint + } } } -// for (i = 0; i < l; ++i) printf("%d:%d\t", i, qw[i]); putchar('\n'); - } #endif - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); - a->qe = qe + qle; a->re = rmax[0] + re + tle; - free(qw); - } else a->qe = l_query, a->re = s->rbeg + s->len; - /* - a->is_all = 1; - if (c->n > 1) { // check if all the seeds have been included - s = &c->seeds[c->n - 1]; - if (s->qbeg + s->len > a->qe) a->is_all = 0; + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); + a->qe = qe + qle; a->re = rmax[0] + re + tle; + free(qw); + } else a->qe = l_query, a->re = s->rbeg + s->len; + if (mem_debug >= 2) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + // check how many seeds have been covered + for (i = k + 1; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) + break; + } + if (i >= c->n) break; // all seeds are included; no need to proceed + if (a->score > best.score) best = *a; + k = i; } - */ - if (mem_debug >= 2) - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); + if (a->score < best.score) *a = best; free(rseq); } From d8e4d57956d61ae26ea8f6b116ac765cb53e52d3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 21:22:54 -0500 Subject: [PATCH 054/169] Don't use narrow band. I may retry this feature if the profilter indicates that this greatly helps. 
--- bwamem.c | 22 ++-------------------- ksw.c | 9 ++++----- ksw.h | 2 +- 3 files changed, 7 insertions(+), 26 deletions(-) diff --git a/bwamem.c b/bwamem.c index 0e6f3a2..b6cafc7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -376,35 +376,17 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, 0, &qle, &tle); + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle); a->qb = s->qbeg - qle; a->rb = s->rbeg - tle; free(qs); free(rs); } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; - int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; -#if 0 // FIXME: I am not sure if the following block works. Comment it out if SW extension gives unexpected result. - if (c->n > 1) { // generate $qw - int j, l = rmax[1] - (s->rbeg + s->len); - qw = malloc(l * 2); - for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default - for (i = 1; i < c->n; ++i) { - const mem_seed_t *t = &c->seeds[i]; - for (j = 0; j < t->len; ++j) { - int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); - if (x < 0) continue; // overlap with the first seed - if (qw[x] == -1) qw[x] = (x > y? 
x - y : y - x) + 1; // FIXME: in principle, we should not need +1 - else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint - } - } - } -#endif - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; - free(qw); } else a->qe = l_query, a->re = s->rbeg + s->len; if (mem_debug >= 2) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); // check how many seeds have been covered diff --git a/ksw.c b/ksw.c index 405bd86..08cdf56 100644 --- a/ksw.c +++ b/ksw.c @@ -319,7 +319,7 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle) { eh_t *eh; // score array int8_t *qp; // query profile @@ -348,15 +348,14 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max = h0, max_i = max_j = -1; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { - int f = 0, h1, m = 0, mj = -1, t; + int f = 0, h1, m = 0, mj = -1; int8_t *q = &qp[target[i] * qlen]; // compute the first column h1 = h0 - (gapo + gape * (i + 1)); if (h1 < 0) h1 = 0; // apply the band and the constraint (if provided) - t = (qw && qw[i] >= 0 && qw[i] < w)? 
qw[i] : w; // this is the band width at $i - if (beg < i - t) beg = i - t; - if (end > i + t + 1) end = i + t + 1; + if (beg < i - w) beg = i - w; + if (end > i + w + 1) end = i + w + 1; if (end > qlen) end = qlen; for (j = beg; LIKELY(j < end); ++j) { // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) diff --git a/ksw.h b/ksw.h index d58f423..c7eaabb 100644 --- a/ksw.h +++ b/ksw.h @@ -49,7 +49,7 @@ extern "C" { /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle); int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar); #ifdef __cplusplus From 245505deedfa6020b3e44cb619ee5c9821c16d01 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 22:09:58 -0500 Subject: [PATCH 055/169] minor improvement to mapQ approx. That is not good enough, but I am tired and need rest... 
--- bwamem.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index b6cafc7..a806a8b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -457,6 +457,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (s->qual) kputsn(s->qual, s->l_seq, &str); else kputc('*', &str); kputc('\n', &str); + goto ret_sam_se; } for (k = 0; k < m; ++k) { uint32_t *cigar = 0; @@ -470,8 +471,14 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (n_cigar == 0) flag |= 8; kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); - mapq = p->score? (int)(MAPQ_COEF * (1. - (float)(p->sub? p->sub : opt->min_seed_len * opt->a) / p->score) * log(p->score / opt->a) + .499) : 0; - if (mapq > 60) mapq = 60; + { // approximate mapQ + int sub = p->sub? p->sub : opt->min_seed_len * opt->a; + double identity; + mapq = p->score? (int)(MAPQ_COEF * (1. - (float)sub / p->score) * log(p->score / opt->a) + .499) : 0; + identity = (double)p->score / opt->a / (p->qe - p->qb); + mapq = identity < 0.95? 
(int)(mapq * identity * identity + .499) : mapq; + if (mapq > 60) mapq = 60; + } kputw(mapq, &str); kputc('\t', &str); if (n_cigar) { int clip5, clip3; @@ -495,6 +502,8 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kputc('\n', &str); free(cigar); } + +ret_sam_se: free(seq); s->sam = str.s; } From 1bf1a674a821731367f966d8a6bc780a9d63366d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 13:43:15 -0500 Subject: [PATCH 056/169] minor improvement to mapQ --- bwamem.c | 30 ++++++++++++++++++++++-------- bwamem.h | 2 +- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/bwamem.c b/bwamem.c index a806a8b..cb064f8 100644 --- a/bwamem.c +++ b/bwamem.c @@ -401,6 +401,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } if (a->score < best.score) *a = best; free(rseq); + + // compute seedcov + if (c->n > 1) { + for (i = 0, a->seedcov = 0; i < c->n; ++i) { + s = &c->seeds[i]; + if (s->qbeg >= a->qb && s->qbeg + s->len <= a->qe && s->rbeg >= a->rb && s->rbeg + s->len <= a->re) // seed fully contained + a->seedcov += s->len; // this is not very accurate, but for approx. mapQ, this is good enough + } + } else a->seedcov = c->seeds[0].len; } uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) @@ -441,6 +450,18 @@ ret_gen_cigar: * Integrated interface * ************************/ +static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) +{ + int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; + double identity; + l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; + mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; + identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; + mapq = identity < 0.95? 
(int)(mapq * identity * identity + .499) : mapq; + if (mapq > 60) mapq = 60; + return mapq; +} + void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { int i, k, m; @@ -471,14 +492,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (n_cigar == 0) flag |= 8; kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); - { // approximate mapQ - int sub = p->sub? p->sub : opt->min_seed_len * opt->a; - double identity; - mapq = p->score? (int)(MAPQ_COEF * (1. - (float)sub / p->score) * log(p->score / opt->a) + .499) : 0; - identity = (double)p->score / opt->a / (p->qe - p->qb); - mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; - if (mapq > 60) mapq = 60; - } + mapq = approx_mapq_se(opt, p); kputw(mapq, &str); kputc('\t', &str); if (n_cigar) { int clip5, clip3; diff --git a/bwamem.h b/bwamem.h index 7215ad3..ce3a221 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,7 +31,7 @@ typedef struct { typedef struct { int64_t rb, re; - int score, qb, qe, sub; + int score, qb, qe, seedcov, sub; // sub: suboptimal score } mem_alnreg_t; typedef kvec_t(mem_chain_t) mem_chain_v; From 057b292dde7bca16645c9139c3d9aaccefcd5928 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 14:18:39 -0500 Subject: [PATCH 057/169] exclude identical hits --- bwamem.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index cb064f8..e5bb792 100644 --- a/bwamem.c +++ b/bwamem.c @@ -306,7 +306,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) return n; } -#define alnreg_lt(a, b) ((a).score > (b).score) +#define alnreg_lt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) KSORT_INIT(mem_ar, mem_alnreg_t, alnreg_lt) 
int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) @@ -314,6 +314,13 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) int i, j, m; if (n <= 1) return n; ks_introsort(mem_ar, n, a); + for (i = 1; i < n; ++i) { // mark identical hits + if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) + a[i].score = 0; + } + for (i = 1, m = 1; i < n; ++i) // exclude identical hits + if (a[i].score > 0) a[m++] = a[i]; + n = m; for (i = 0; i < n; ++i) a[i].sub = 0; for (i = 1, m = 1; i < n; ++i) { for (j = 0; j < m; ++j) { From fdb0a7405fc6eb51100efb546f359a2e16d48450 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 14:46:57 -0500 Subject: [PATCH 058/169] better dealing with microrepeat --- bwamem.c | 6 ++++-- bwamem.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index e5bb792..b2f61a4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -351,7 +351,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - int i, k; + int i, k, csub = 0; int64_t rlen, rmax[2], tmp, max = 0, max_i = 0; const mem_seed_t *s; uint8_t *rseq = 0; @@ -403,10 +403,11 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int break; } if (i >= c->n) break; // all seeds are included; no need to proceed - if (a->score > best.score) best = *a; + if (a->score > best.score) csub = best.score, best = *a; k = i; } if (a->score < best.score) *a = best; + a->csub = csub; free(rseq); // compute seedcov @@ -461,6 +462,7 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) { int mapq, l, sub = a->sub? 
a->sub : opt->min_seed_len * opt->a; double identity; + sub = a->csub > sub? a->csub : sub; l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; diff --git a/bwamem.h b/bwamem.h index ce3a221..4b30daf 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,7 +31,7 @@ typedef struct { typedef struct { int64_t rb, re; - int score, qb, qe, seedcov, sub; // sub: suboptimal score + int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain } mem_alnreg_t; typedef kvec_t(mem_chain_t) mem_chain_v; From 220fc39e9daf3569ca328a76f5075da50f85c968 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 14:51:24 -0500 Subject: [PATCH 059/169] the previous change does not work... Fixed. --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index b2f61a4..b4283fd 100644 --- a/bwamem.c +++ b/bwamem.c @@ -402,8 +402,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) break; } + if (a->score >= best.score) csub = best.score, best = *a; if (i >= c->n) break; // all seeds are included; no need to proceed - if (a->score > best.score) csub = best.score, best = *a; k = i; } if (a->score < best.score) *a = best; From 2848d3045a30bd2e5af704d68017f08926354287 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 15:34:25 -0500 Subject: [PATCH 060/169] more accurate chain weight --- bwamem.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index b4283fd..4643ea7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -251,8 +251,22 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) a = malloc(sizeof(flt_aux_t) * n_chn); for (i = 0; i < n_chn; ++i) { mem_chain_t *c = 
&chains[i]; - int w = 0; - for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps + int64_t end; + int w = 0, tmp; + for (j = 0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->qbeg >= end) w += s->len; + else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end; + end = end > s->qbeg + s->len? end : s->qbeg + s->len; + } + tmp = w; + for (j = 0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->rbeg >= end) w += s->len; + else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end; + end = end > s->qbeg + s->len? end : s->qbeg + s->len; + } + w = w < tmp? w : tmp; a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; a[i].w = w; a[i].p = c; a[i].p2 = 0; From 39607065e04c92099a8239f21be8f4911913bc77 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 16:56:28 -0500 Subject: [PATCH 061/169] allow more seeds to be seen (thus slower..) --- bwamem.c | 20 +++++++++++--------- bwamem.h | 3 ++- fastmap.c | 13 +++++++------ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index 4643ea7..e1ef4e7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -33,7 +33,8 @@ mem_opt_t *mem_opt_init() o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; o->min_seed_len = 19; - o->max_occ = 50; + o->split_width = 10; + o->max_occ = 10000; o->max_chain_gap = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; @@ -87,25 +88,26 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) } -const bwtintv_v *smem_next(smem_i *itr, int split_len) +const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) { - int i, max, max_i; + int i, max, max_i, ori_start; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; if (itr->start >= itr->len || itr->start < 0) return 0; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if 
(itr->start == itr->len) return 0; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec); // search for SMEM + ori_start = itr->start; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match bwtintv_t *p = &itr->matches->a[i]; int len = (uint32_t)p->info - (p->info>>32); if (max < len) max = len, max_i = i; } - if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { // if the longest SMEM is unique and long + if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long int j; bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging bwtintv_t *p = &itr->matches->a[max_i]; - bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM + bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM i = j = 0; a->n = 0; while (i < itr->matches->n && j < itr->sub->n) { // ordered merge int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info); @@ -113,14 +115,14 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) if (xi < xj) { kv_push(bwtintv_t, *a, itr->matches->a[i]); ++i; - } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) { + } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) { kv_push(bwtintv_t, *a, itr->sub->a[j]); ++j; } else ++j; } for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]); for (; j < itr->sub->n; 
++j) - if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) + if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) kv_push(bwtintv_t, *a, itr->sub->a[j]); kv_copy(bwtintv_t, *itr->matches, *a); } @@ -160,7 +162,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; - while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM + while ((a = smem_next(itr, opt->min_seed_len<<1, opt->split_width)) != 0) { // to find all SMEM and some internal MEM int i; for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start bwtintv_t *p = &a->a[i]; diff --git a/bwamem.h b/bwamem.h index 4b30daf..d415ccf 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,6 +16,7 @@ typedef struct { typedef struct { int a, b, q, r, w; + int split_width; int min_seed_len, max_occ, max_chain_gap; int n_threads, chunk_size; int pe_dir, is_pe; @@ -44,7 +45,7 @@ extern "C" { smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); -const bwtintv_v *smem_next(smem_i *itr, int split_len); +const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width); mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); diff --git a/fastmap.c b/fastmap.c index 0d2354a..e31b0a5 100644 --- a/fastmap.c +++ b/fastmap.c @@ -25,10 +25,11 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:c:D:")) >= 0) { + while ((c = getopt(argc, argv, "k:c:D:s:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'D') mem_debug = atoi(optarg); + else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { 
fprintf(stderr, "\n"); @@ -76,7 +77,7 @@ int main_mem(int argc, char *argv[]) int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_long = 0; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0; kseq_t *seq; bwtint_t k; gzFile fp; @@ -85,16 +86,16 @@ int main_fastmap(int argc, char *argv[]) smem_i *itr; const bwtintv_v *a; - while ((c = getopt(argc, argv, "w:l:ps")) >= 0) { + while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) { switch (c) { - case 's': split_long = 1; break; + case 's': split_width = atoi(optarg); break; case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-ps] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] \n", split_width, min_len, min_iwidth); return 1; } @@ -119,7 +120,7 @@ int main_fastmap(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); - while ((a = smem_next(itr, split_long? min_len<<1 : 0)) != 0) { + while ((a = smem_next(itr, min_len<<1, split_width)) != 0) { for (i = 0; i < a->n; ++i) { bwtintv_t *p = &a->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; From b2c7148dc93c713a0db428b7357ba1b91fa3b0b0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 17:20:44 -0500 Subject: [PATCH 062/169] consider the number of suboptimal hits --- bwamem.c | 3 +++ bwamem.h | 1 + 2 files changed, 4 insertions(+) diff --git a/bwamem.c b/bwamem.c index e1ef4e7..48a2431 100644 --- a/bwamem.c +++ b/bwamem.c @@ -346,6 +346,7 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? 
a[i].qe - a[i].qb : a[j].qe - a[j].qb; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (a[j].sub == 0) a[j].sub = a[i].score; + a[j].sub_n += (double)a[i].score / a[j].sub; break; } } @@ -483,7 +484,9 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; + if (a->sub) mapq -= (int)(4.343 * log(a->sub_n) + .499); if (mapq > 60) mapq = 60; + if (mapq < 0) mapq = 0; return mapq; } diff --git a/bwamem.h b/bwamem.h index d415ccf..f524c8e 100644 --- a/bwamem.h +++ b/bwamem.h @@ -33,6 +33,7 @@ typedef struct { typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain + double sub_n; } mem_alnreg_t; typedef kvec_t(mem_chain_t) mem_chain_v; From 829664d6b5edbaadcc67f9b8a2475dea91b40b69 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 17:55:35 -0500 Subject: [PATCH 063/169] missing identical hits; improved sub_n --- bwamem.c | 11 ++++++----- bwamem.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index 48a2431..92be602 100644 --- a/bwamem.c +++ b/bwamem.c @@ -327,17 +327,18 @@ KSORT_INIT(mem_ar, mem_alnreg_t, alnreg_lt) int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) { // similar to the loop in mem_chain_flt() - int i, j, m; + int i, j, m, tmp; if (n <= 1) return n; ks_introsort(mem_ar, n, a); for (i = 1; i < n; ++i) { // mark identical hits if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) - a[i].score = 0; + a[i].qe = a[i].qb; } for (i = 1, m = 1; i < n; ++i) // exclude identical hits - if (a[i].score > 0) a[m++] = a[i]; + if (a[i].qe > a[i].qb) a[m++] = a[i]; n = m; for (i = 0; i < n; ++i) a[i].sub = 0; + 
tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; for (i = 1, m = 1; i < n; ++i) { for (j = 0; j < m; ++j) { int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; @@ -346,7 +347,7 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (a[j].sub == 0) a[j].sub = a[i].score; - a[j].sub_n += (double)a[i].score / a[j].sub; + if (a[j].score - a[i].score <= tmp) ++a[j].sub_n; break; } } @@ -484,7 +485,7 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; - if (a->sub) mapq -= (int)(4.343 * log(a->sub_n) + .499); + if (a->sub_n) mapq -= (int)(4.343 * log(a->sub_n) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; return mapq; diff --git a/bwamem.h b/bwamem.h index f524c8e..c26893a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -33,7 +33,7 @@ typedef struct { typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain - double sub_n; + int sub_n; } mem_alnreg_t; typedef kvec_t(mem_chain_t) mem_chain_v; From 95a79afe719011c14c11736b7a239b2e78e848b4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 22:11:44 -0500 Subject: [PATCH 064/169] command-line prompt --- fastmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index e31b0a5..990f442 100644 --- a/fastmap.c +++ b/fastmap.c @@ -33,7 +33,10 @@ int main_mem(int argc, char *argv[]) } if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n"); + fprintf(stderr, "Usage: bwa mem 
[options] \n"); + fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); + fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, "\n"); free(opt); return 1; From cb55617f50ca20cf231bf4012dca5dee26715091 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 22:12:18 -0500 Subject: [PATCH 065/169] added a new line --- fastmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index 990f442..5093897 100644 --- a/fastmap.c +++ b/fastmap.c @@ -33,7 +33,7 @@ int main_mem(int argc, char *argv[]) } if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] \n"); + fprintf(stderr, "Usage: bwa mem [options] \n\n"); fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); From c310fb74242fb6e5f74414d60ac99172375d1ed0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 10 Feb 2013 12:24:33 -0500 Subject: [PATCH 066/169] a little refactoring for PE support --- bwamem.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index 92be602..004274e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -322,21 +322,31 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) return n; } -#define alnreg_lt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) -KSORT_INIT(mem_ar, mem_alnreg_t, alnreg_lt) +/****************************** + * De-overlap single-end hits * + ******************************/ -int mem_choose_alnreg_se(const mem_opt_t *opt, 
int n, mem_alnreg_t *a) -{ // similar to the loop in mem_chain_flt() - int i, j, m, tmp; +#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) +KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) + +int mem_sort_and_dedup(int n, mem_alnreg_t *a) +{ + int m, i; if (n <= 1) return n; - ks_introsort(mem_ar, n, a); + ks_introsort(mem_ars, n, a); for (i = 1; i < n; ++i) { // mark identical hits if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) a[i].qe = a[i].qb; } for (i = 1, m = 1; i < n; ++i) // exclude identical hits if (a[i].qe > a[i].qb) a[m++] = a[i]; - n = m; + return m; +} + +int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function +{ // similar to the loop in mem_chain_flt() + int i, j, m, tmp; + if (n <= 1) return n; for (i = 0; i < n; ++i) a[i].sub = 0; tmp = opt->a + opt->b > opt->q + opt->r? 
opt->a + opt->b : opt->q + opt->r; for (i = 1, m = 1; i < n; ++i) { @@ -357,6 +367,10 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) return m; } +/************************ + * Pick paired-end hits * + ************************/ + /**************************************** * Construct the alignment from a chain * ****************************************/ @@ -493,14 +507,15 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { - int i, k, m; + int i, k; kstring_t str; char *seq; str.l = str.m = 0; str.s = 0; - m = mem_choose_alnreg_se(opt, a->n, a->a); + a->n = mem_sort_and_dedup(a->n, a->a); + a->n = mem_choose_alnreg_se(opt, a->n, a->a); seq = malloc(s->l_seq); - if (m == 0) { // no seeds found + if (a->n == 0) { // no seeds found for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; kputs(s->name, &str); kputs("\t8\t*\t0\t0\t*\t*\t0\t0\t", &str); kputsn(seq, s->l_seq, &str); @@ -509,7 +524,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kputc('\n', &str); goto ret_sam_se; } - for (k = 0; k < m; ++k) { + for (k = 0; k < a->n; ++k) { uint32_t *cigar = 0; int score, is_rev, nn, rid, flag = 0, n_cigar = 0, mapq = 0; int64_t pos; From f4c0672800f98a8e58e54c2c76f068c60b0bd124 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 10 Feb 2013 12:55:19 -0500 Subject: [PATCH 067/169] move sort_and_dedup() to worker1() --- bwamem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 004274e..7dd55b4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -512,8 +512,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b char *seq; str.l = str.m = 0; str.s = 0; - a->n = mem_sort_and_dedup(a->n, a->a); - a->n = mem_choose_alnreg_se(opt, a->n, a->a); + a->n = mem_choose_alnreg_se(opt, a->n, a->a); // NOTE: 
mem_sort_and_dedup() called in worker1() seq = malloc(s->l_seq); if (a->n == 0) { // no seeds found for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; @@ -600,8 +599,10 @@ static void *worker1(void *data) { worker_t *w = (worker_t*)data; int i; - for (i = w->start; i < w->n; i += w->step) + for (i = w->start; i < w->n; i += w->step) { w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); + } return 0; } From 59eaf650ac86620b03539264591e2681d4b55ad4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 10:59:38 -0500 Subject: [PATCH 068/169] code backup --- Makefile | 2 +- bntseq.h | 1 + bwamem.c | 20 +++++++++++++++----- bwamem.h | 10 ++++++++++ bwamem_pair.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fastmap.c | 6 +++--- 6 files changed, 74 insertions(+), 9 deletions(-) create mode 100644 bwamem_pair.c diff --git a/Makefile b/Makefile index 46e0b80..c97fbcf 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o stdaln.o \ +LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ diff --git a/bntseq.h b/bntseq.h index d4096b4..0425540 100644 --- a/bntseq.h +++ b/bntseq.h @@ -29,6 +29,7 @@ #define BWT_BNTSEQ_H #include +#include #include #ifndef BWA_UBYTE diff --git a/bwamem.c b/bwamem.c index 7dd55b4..b17d23b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -14,7 +14,7 @@ #define MAPQ_COEF 40. 
-int mem_debug = 0; +int mem_verbose = 3; // 1: error only; 2: error+warning; 3: message+error+warning; >=4: debugging void mem_fill_scmat(int a, int b, int8_t mat[25]) { @@ -36,6 +36,7 @@ mem_opt_t *mem_opt_init() o->split_width = 10; o->max_occ = 10000; o->max_chain_gap = 10000; + o->max_ins = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; o->chunk_size = 10000000; @@ -427,7 +428,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (mem_debug >= 2) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + if (mem_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); // check how many seeds have been covered for (i = k + 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; @@ -574,7 +575,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); - if (mem_debug >= 1) mem_print_chain(bns, &chn); + if (mem_verbose >= 4) mem_print_chain(bns, &chn); regs.n = regs.m = chn.n; regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); for (i = 0; i < chn.n; ++i) { @@ -593,6 +594,7 @@ typedef struct { const uint8_t *pac; bseq1_t *seqs; mem_alnreg_v *regs; + mem_pestat_t *pes; } worker_t; static void *worker1(void *data) @@ -628,6 +630,8 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns int i; worker_t *w; mem_alnreg_v *regs; + mem_pestat_t pes[4]; + w = calloc(opt->n_threads, sizeof(worker_t)); regs = malloc(n * sizeof(mem_alnreg_v)); for (i = 0; i < opt->n_threads; ++i) { @@ -635,21 +639,27 @@ 
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns p->start = i; p->step = opt->n_threads; p->n = n; p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac; p->seqs = seqs; p->regs = regs; + p->pes = &pes[0]; } #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { - worker1(w); worker2(w); + worker1(w); + mem_pestat(opt, bns->l_pac, n, regs, pes); + worker2(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + mem_pestat(opt, bns->l_pac, n, regs, pes); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } #else - worker1(w); worker2(w); + worker1(w); + mem_pestat(opt, bns->l_pac, n, regs, pes); + worker2(w); #endif for (i = 0; i < n; ++i) { fputs(seqs[i].sam, stdout); diff --git a/bwamem.h b/bwamem.h index c26893a..c89abf6 100644 --- a/bwamem.h +++ b/bwamem.h @@ -21,6 +21,7 @@ typedef struct { int n_threads, chunk_size; int pe_dir, is_pe; float mask_level, chain_drop_ratio; + int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; @@ -36,9 +37,16 @@ typedef struct { int sub_n; } mem_alnreg_t; +typedef struct { + int low, high, failed; + double avg, std; +} mem_pestat_t; + typedef kvec_t(mem_chain_t) mem_chain_v; typedef kvec_t(mem_alnreg_t) mem_alnreg_v; +extern int mem_verbose; + #ifdef __cplusplus extern "C" { #endif @@ -58,6 +66,8 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); +void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); + #ifdef __cplusplus } #endif diff --git a/bwamem_pair.c b/bwamem_pair.c 
new file mode 100644 index 0000000..2a0079b --- /dev/null +++ b/bwamem_pair.c @@ -0,0 +1,44 @@ +#include +#include "kstring.h" +#include "bwamem.h" +#include "kvec.h" + +#define MIN_RATIO 0.8 + +static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) +{ + int j; + for (j = 1; j < r->n; ++j) { // choose unique alignment + int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb; + int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe; + if (e_min > b_max) { // have overlap + int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb; + if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap + } + } + return j < r->n? r->a[j].score : opt->min_seed_len * opt->a; +} + +void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) +{ + int i; + kvec_t(int) isize[4]; + memset(isize, 0, sizeof(kvec_t(int)) * 4); + for (i = 0; i < n>>1; i += 2) { + int dir; + int64_t is, pos[2]; + mem_alnreg_v *r[2]; + r[0] = (mem_alnreg_v*)®s[i<<1|0]; + r[1] = (mem_alnreg_v*)®s[i<<1|1]; + if (r[0]->n == 0 || r[1]->n == 0) continue; + if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; + if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; + pos[0] = r[0]->a[0].rb < l_pac? r[0]->a[0].rb : (l_pac<<1) - 1 - r[0]->a[0].rb; // forward coordinate + pos[1] = r[1]->a[0].rb < l_pac? 
r[1]->a[0].rb : (l_pac<<1) - 1 - r[1]->a[0].rb; + if (pos[0] < pos[1]) dir = (r[0]->a[0].rb >= l_pac)<<1 | (r[1]->a[0].rb >= l_pac); + else dir = (r[1]->a[0].rb >= l_pac)<<1 | (r[0]->a[0].rb >= l_pac); + is = abs(pos[0] - pos[1]); + if (is <= opt->max_ins) kv_push(int, isize[dir], is); + } + if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidates unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); +} diff --git a/fastmap.c b/fastmap.c index 5093897..698b3e1 100644 --- a/fastmap.c +++ b/fastmap.c @@ -11,7 +11,6 @@ KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; -extern int mem_debug; int main_mem(int argc, char *argv[]) { @@ -25,10 +24,10 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:c:D:s:")) >= 0) { + while ((c = getopt(argc, argv, "k:c:v:s:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'c') opt->max_occ = atoi(optarg); - else if (c == 'D') mem_debug = atoi(optarg); + else if (c == 'v') mem_verbose = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { @@ -37,6 +36,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); fprintf(stderr, "\n"); free(opt); return 1; From 987d4b4205c382d6c6c2c1c06af5a52a742d51a4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 11:27:35 -0500 Subject: [PATCH 069/169] fixed a stupid bug in fastq reading --- bseq.c | 6 +++--- bwamem.c | 6 +++--- fastmap.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bseq.c b/bseq.c index 54a25f6..d20b983 100644 --- 
a/bseq.c +++ b/bseq.c @@ -41,15 +41,15 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) size += seqs[n++].l_seq; if (ks2) { trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n++]); + kseq2bseq1(ks2, &seqs[n]); size += seqs[n++].l_seq; } if (size >= chunk_size) break; } - *n_ = n; - if (size < chunk_size) { // test if the 2nd file is finished + if (size == 0) { // test if the 2nd file is finished if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } + *n_ = n; return seqs; } diff --git a/bwamem.c b/bwamem.c index b17d23b..f617fcf 100644 --- a/bwamem.c +++ b/bwamem.c @@ -644,21 +644,21 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { worker1(w); - mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); worker2(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); - mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } #else worker1(w); - mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); worker2(w); #endif for (i = 0; i < n; ++i) { diff --git a/fastmap.c b/fastmap.c index 698b3e1..56674f9 100644 --- a/fastmap.c +++ b/fastmap.c @@ -58,7 +58,7 @@ int main_mem(int argc, char *argv[]) ks = kseq_init(fp); if (optind + 2 < argc) { fp2 = gzopen(argv[optind + 2], "r"); - ks2 = kseq_init(fp); + ks2 = kseq_init(fp2); opt->is_pe = 1; } while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { From 
4431e359e2bca378ece303233fe54bbf55c64ffa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 12:15:12 -0500 Subject: [PATCH 070/169] analyze isize distribution --- bwamem_pair.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++---- bwtsw2_pair.c | 4 ++-- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 2a0079b..99c5c6e 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -1,9 +1,16 @@ #include +#include #include "kstring.h" #include "bwamem.h" #include "kvec.h" #define MIN_RATIO 0.8 +#define MIN_DIR_CNT 10 +#define MIN_DIR_RATIO 0.1 +#define OUTLIER_BOUND 2.0 +#define MAPPING_BOUND 3.0 +#define MAX_STDDEV 4.0 +#define EXT_STDDEV 4.0 static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) { @@ -19,12 +26,16 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) return j < r->n? r->a[j].score : opt->min_seed_len * opt->a; } +typedef kvec_t(uint64_t) vec64_t; + void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { - int i; - kvec_t(int) isize[4]; + extern void ks_introsort_uint64_t(size_t n, uint64_t *a); + int i, d; + vec64_t isize[4]; + memset(pes, 0, 4 * sizeof(mem_pestat_t)); memset(isize, 0, sizeof(kvec_t(int)) * 4); - for (i = 0; i < n>>1; i += 2) { + for (i = 0; i < n>>1; ++i) { int dir; int64_t is, pos[2]; mem_alnreg_v *r[2]; @@ -38,7 +49,41 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * if (pos[0] < pos[1]) dir = (r[0]->a[0].rb >= l_pac)<<1 | (r[1]->a[0].rb >= l_pac); else dir = (r[1]->a[0].rb >= l_pac)<<1 | (r[0]->a[0].rb >= l_pac); is = abs(pos[0] - pos[1]); - if (is <= opt->max_ins) kv_push(int, isize[dir], is); + if (is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidates unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); + for (d = 0; d < 4; ++d) { // TODO: this block is 
nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. + mem_pestat_t *r = &pes[d]; + vec64_t *q = &isize[d]; + int p25, p50, p75, tmp, x; + if (q->n < MIN_DIR_CNT) { + fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + r->failed = 1; + continue; + } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + ks_introsort_uint64_t(q->n, q->a); + p25 = q->a[(int)(.25 * q->n + .499)]; + p50 = q->a[(int)(.50 * q->n + .499)]; + p75 = q->a[(int)(.75 * q->n + .499)]; + r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + if (r->low < 1) r->low = 1; + r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high); + for (i = x = 0, r->avg = 0; i < q->n; ++i) + if (q->a[i] >= r->low && q->a[i] <= r->high) + r->avg += q->a[i], ++x; + r->avg /= x; + for (i = 0, r->std = 0; i < q->n; ++i) + if (q->a[i] >= r->low && q->a[i] <= r->high) + r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg); + r->std = sqrt(r->std / x); + fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std); + r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499); + r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499); + if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499); + if (r->high < r->avg - MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499); + if (r->low < 1) r->low = 1; + fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high); + } } diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index a6f4d80..633641e 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -74,9 +74,9 @@ 
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) r.low = tmp > max_len? tmp : max_len; if (r.low < 1) r.low = 1; r.high = (int)(p75 + 3. * (p75 - p25) + .499); - if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499); + if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499); r.low = tmp > max_len? tmp : max_len; - if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499); + if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499); ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); free(isize); return r; From b6006cbe9d98b0682701f2f0e6ebe857547f8588 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 13:44:39 -0500 Subject: [PATCH 071/169] skip orientations that are much smaller than best --- bwamem_pair.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 99c5c6e..d537718 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -6,7 +6,7 @@ #define MIN_RATIO 0.8 #define MIN_DIR_CNT 10 -#define MIN_DIR_RATIO 0.1 +#define MIN_DIR_RATIO 0.05 #define OUTLIER_BOUND 2.0 #define MAPPING_BOUND 3.0 #define MAX_STDDEV 4.0 @@ -31,7 +31,7 @@ typedef kvec_t(uint64_t) vec64_t; void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { extern void ks_introsort_uint64_t(size_t n, uint64_t *a); - int i, d; + int i, d, max; vec64_t isize[4]; memset(pes, 0, 4 * sizeof(mem_pestat_t)); memset(isize, 0, sizeof(kvec_t(int)) * 4); @@ -51,11 +51,11 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * is = abs(pos[0] - pos[1]); if (is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } - if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidates unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, 
isize[2].n, isize[3].n); + if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. mem_pestat_t *r = &pes[d]; vec64_t *q = &isize[d]; - int p25, p50, p75, tmp, x; + int p25, p50, p75, x; if (q->n < MIN_DIR_CNT) { fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); r->failed = 1; @@ -85,5 +85,13 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * if (r->high < r->avg - MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499); if (r->low < 1) r->low = 1; fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high); + free(q->a); } + for (d = 0, max = 0; d < 4; ++d) + max = max > isize[d].n? max : isize[d].n; + for (d = 0; d < 4; ++d) + if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) { + pes[d].failed = 1; + fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + } } From 99907c98fb99844bd7c9f2023474b67b1924545c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 15:29:03 -0500 Subject: [PATCH 072/169] separated and improved SAM printing code This is for the PE mode. The routines may also be useful for bwa-sw, but probably I won't change the old code. 
--- bwamem.c | 158 ++++++++++++++++++++++++++++++++------------------ bwamem.h | 9 +++ bwamem_pair.c | 24 ++++++++ kstring.h | 9 +++ 4 files changed, 142 insertions(+), 58 deletions(-) diff --git a/bwamem.c b/bwamem.c index f617fcf..99e8938 100644 --- a/bwamem.c +++ b/bwamem.c @@ -453,7 +453,11 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->seedcov = c->seeds[0].len; } -uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +/***************************** + * Basic hit->SAM conversion * + *****************************/ + +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) { uint32_t *cigar = 0; uint8_t tmp, *rseq; @@ -472,12 +476,12 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); // set the band-width - w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); + w = (int)((double)(l_query * mat[0] - q) / r + 1.); w = w < 1? w : 1; - w = w < opt->w? w : opt->w; + w = w < w_? 
w : w_; w += abs(rlen - l_query); // NW alignment - *score = ksw_global(l_query, query, rlen, rseq, 5, opt->mat, opt->q, opt->r, w, n_cigar, &cigar); + *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); if (rb >= l_pac) // reverse back query for (i = 0; i < l_query>>1; ++i) tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; @@ -487,6 +491,82 @@ ret_gen_cigar: return cigar; } +void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard) +{ + int score, n_cigar, is_rev, nn, rid, mid, is_unmapped = 0; + uint32_t *cigar = 0; + int64_t pos; + + kputs(s->name, str); + if (p && p->rb >= 0 && p->re < bns->l_pac<<1) { + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); + nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); + p->flag |= is_rev? 16 : 0; // reverse + p->flag |= p->mb >= 0? 1 : 0; // paired in sequencing + p->flag |= n_cigar == 0? 8 : 0; // FIXME: check why this may happen (this has already happened) + kputc('\t', str); kputw(p->flag, str); kputc('\t', str); + kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); + kputw(p->qual, str); kputc('\t', str); + if (n_cigar) { + int i, clip5, clip3; + clip5 = is_rev? s->l_seq - p->qe : p->qb; + clip3 = is_rev? p->qb : s->l_seq - p->qe; + if (clip5) { kputw(clip5, str); kputc("SH"[(is_hard!=0)], str); } + for (i = 0; i < n_cigar; ++i) { + kputw(cigar[i]>>4, str); kputc("MIDSH"[cigar[i]&0xf], str); + } + if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); } + } else kputc('*', str); + if (p->mb >= 0 && p->mb < bns->l_pac<<1) { // then print mate pos and isize + pos = bns_depos(bns, p->mb < bns->l_pac? 
p->mb : p->me - 1, &is_rev); + nn = bns_cnt_ambi(bns, pos, p->me - p->mb, &mid); + kputc('\t', str); + if (mid == rid) kputc('=', str); + else kputs(bns->anns[mid].name, str); + kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str); + kputc('\t', str); + if (mid != rid) { + int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; + int64_t p1 = p->mb < bns->l_pac? p->mb : (bns->l_pac<<1) - 1 - p->mb; + kputw(abs(p0 - p1), str); + } + kputc('\t', str); + } else kputsn("\t*\t0\t0\t", 7, str); + } else { // unaligned + is_unmapped = 1; + kputw(p? p->flag : 0, str); + kputs("\t*\t0\t0\t*\t*\t0\t0\t", str); + } + if (!is_rev) { // print SEQ and QUAL, the forward strand + int i, qb = 0, qe = s->l_seq; + if (!is_unmapped && is_hard) qb = p->qb, qe = p->qe; + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } else { // the reverse strand + int i, qb = 0, qe = s->l_seq; + if (!is_unmapped && is_hard) qb = p->qb, qe = p->qe; + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } + if (!is_unmapped && p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (!is_unmapped && p->sub >= 0) { kputsn("\tss:i:", 6, str); kputw(p->sub, str); } + kputc('\n', str); + free(cigar); +} + /************************ * Integrated interface * ************************/ @@ -508,61 +588,23 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const 
uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { - int i, k; + int k; kstring_t str; - char *seq; - str.l = str.m = 0; str.s = 0; a->n = mem_choose_alnreg_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() - seq = malloc(s->l_seq); - if (a->n == 0) { // no seeds found - for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; - kputs(s->name, &str); kputs("\t8\t*\t0\t0\t*\t*\t0\t0\t", &str); - kputsn(seq, s->l_seq, &str); - if (s->qual) kputsn(s->qual, s->l_seq, &str); - else kputc('*', &str); - kputc('\n', &str); - goto ret_sam_se; - } - for (k = 0; k < a->n; ++k) { - uint32_t *cigar = 0; - int score, is_rev, nn, rid, flag = 0, n_cigar = 0, mapq = 0; - int64_t pos; - mem_alnreg_t *p = &a->a[k]; - cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); - pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); - nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - flag |= is_rev? 16 : 0; - if (n_cigar == 0) flag |= 8; - kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); - kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); - mapq = approx_mapq_se(opt, p); - kputw(mapq, &str); kputc('\t', &str); - if (n_cigar) { - int clip5, clip3; - clip5 = is_rev? s->l_seq - p->qe : p->qb; - clip3 = is_rev? 
p->qb : s->l_seq - p->qe; - if (clip5) { kputw(clip5, &str); kputc('S', &str); } - for (i = 0; i < n_cigar; ++i) { - kputw(cigar[i]>>4, &str); kputc("MIDSH"[cigar[i]&0xf], &str); - } - if (clip3) { kputw(clip3, &str); kputc('S', &str); } - } else kputc('*', &str); - kputsn("\t*\t0\t0\t", 7, &str); - if (is_rev) for (i = s->l_seq - 1; i >= 0; --i) seq[i] = "TGCAN"[(int)s->seq[i]]; - else for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; - kputsn(seq, s->l_seq, &str); kputc('\t', &str); - if (s->qual) kputsn(s->qual, s->l_seq, &str); - else kputc('*', &str); - kputsn("\tAS:i:", 6, &str); kputw(p->score, &str); - kputsn("\tss:i:", 6, &str); kputw(p->sub, &str); - kputsn("\tnw:i:", 6, &str); kputw(score, &str); - kputc('\n', &str); - free(cigar); - } - -ret_sam_se: - free(seq); + if (a->n > 0) { + for (k = 0; k < a->n; ++k) { + bwahit_t h; + mem_alnreg_t *p = &a->a[k]; + h.rb = p->rb; h.re = p->re; + h.qb = p->qb; h.qe = p->qe; + h.score = p->score; h.sub = p->sub; + h.flag = 0; + h.qual = approx_mapq_se(opt, p); + h.mb = h.me = -2; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); + } + } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->is_hard); s->sam = str.s; } @@ -592,9 +634,9 @@ typedef struct { const bwt_t *bwt; const bntseq_t *bns; const uint8_t *pac; + const mem_pestat_t *pes; bseq1_t *seqs; mem_alnreg_v *regs; - mem_pestat_t *pes; } worker_t; static void *worker1(void *data) @@ -603,7 +645,7 @@ static void *worker1(void *data) int i; for (i = w->start; i < w->n; i += w->step) { w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); - mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); + w->regs[i].n = mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); } return 0; } diff --git a/bwamem.h b/bwamem.h index c89abf6..b95c96d 100644 --- a/bwamem.h +++ b/bwamem.h @@ -20,6 +20,7 @@ typedef struct { int min_seed_len, max_occ, max_chain_gap; int n_threads, chunk_size; int pe_dir, 
is_pe; + int is_hard; // if to use hard clip float mask_level, chain_drop_ratio; int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset @@ -42,6 +43,14 @@ typedef struct { double avg, std; } mem_pestat_t; +typedef struct { + int64_t rb, re; + int qb, qe, flag, qual; + // optional info + int score, sub; + int64_t mb, me; // mb: mate start; -1 if single-end; -2 if mate unmapped +} bwahit_t; + typedef kvec_t(mem_chain_t) mem_chain_v; typedef kvec_t(mem_alnreg_t) mem_alnreg_v; diff --git a/bwamem_pair.c b/bwamem_pair.c index d537718..f7c1ef8 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -27,6 +27,7 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) } typedef kvec_t(uint64_t) vec64_t; +extern void ks_introsort_uint64_t(size_t n, uint64_t *a); void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { @@ -95,3 +96,26 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]); } } + +void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) +{ + vec64_t v; + int r, i; + kv_init(v); + for (r = 0; r < 2; ++r) { + for (i = 0; i < a[r].n; ++i) { + int64_t pos; + mem_alnreg_t *e = &a[r].a[i]; + pos = (e->rb < bns->l_pac? 
e->rb<<1 : ((bns->l_pac<<1) - 1 - e->rb)<<1 | 1)<<1 | r; + kv_push(uint64_t, v, pos); + } + } + ks_introsort_uint64_t(v.n, v.a); + free(v.a); +} + +void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) +{ + bwahit_t h[2]; + mem_pair(opt, bns, pac, pes, s, a, h); +} diff --git a/kstring.h b/kstring.h index cf14e39..81d7d60 100644 --- a/kstring.h +++ b/kstring.h @@ -16,6 +16,15 @@ typedef struct __kstring_t { } kstring_t; #endif +static inline void ks_resize(kstring_t *s, size_t size) +{ + if (s->m < size) { + s->m = size; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } +} + static inline int kputsn(const char *p, int l, kstring_t *s) { if (s->l + l + 1 >= s->m) { From dcb190069ac9ce249d9985d27ea792eeebc73457 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 16:10:14 -0500 Subject: [PATCH 073/169] PE NOT working, yet --- bwamem_pair.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index f7c1ef8..b465602 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -1,4 +1,5 @@ #include +#include #include #include "kstring.h" #include "bwamem.h" @@ -12,6 +13,11 @@ #define MAX_STDDEV 4.0 #define EXT_STDDEV 4.0 +typedef kvec_t(uint64_t) vec64_t; + +extern void ks_introsort_uint64_t(size_t n, uint64_t *a); +void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); + static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) { int j; @@ -26,9 +32,6 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) return j < r->n? 
r->a[j].score : opt->min_seed_len * opt->a; } -typedef kvec_t(uint64_t) vec64_t; -extern void ks_introsort_uint64_t(size_t n, uint64_t *a); - void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { extern void ks_introsort_uint64_t(size_t n, uint64_t *a); @@ -116,6 +119,12 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) { + kstring_t str; bwahit_t h[2]; + str.l = str.m = 0; str.s = 0; mem_pair(opt, bns, pac, pes, s, a, h); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); + s[0].sam = strdup(str.s); str.l = 0; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); + s[1].sam = str.s; } From 13288e2dcde6437ba9fad8713f17bf3db047f59f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 09:22:47 -0500 Subject: [PATCH 074/169] code backup --- bwamem.c | 4 +++- bwamem_pair.c | 33 +++++++++++++++++++++++++++++---- ksort.h | 2 +- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index 99e8938..7557af6 100644 --- a/bwamem.c +++ b/bwamem.c @@ -493,7 +493,7 @@ ret_gen_cigar: void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard) { - int score, n_cigar, is_rev, nn, rid, mid, is_unmapped = 0; + int score, n_cigar, is_rev = 0, nn, rid, mid, is_unmapped = 0; uint32_t *cigar = 0; int64_t pos; @@ -652,6 +652,7 @@ static void *worker1(void *data) static void *worker2(void *data) { + extern void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; int i; if (!w->opt->is_pe) { @@ -661,6 +662,7 @@ static void *worker2(void *data) } 
} else { for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } } diff --git a/bwamem_pair.c b/bwamem_pair.c index b465602..019f570 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -103,17 +103,40 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { vec64_t v; - int r, i; + int r, i, y[4]; // y[] keeps the last hit kv_init(v); for (r = 0; r < 2; ++r) { for (i = 0; i < a[r].n; ++i) { - int64_t pos; + uint64_t key; mem_alnreg_t *e = &a[r].a[i]; - pos = (e->rb < bns->l_pac? e->rb<<1 : ((bns->l_pac<<1) - 1 - e->rb)<<1 | 1)<<1 | r; - kv_push(uint64_t, v, pos); + key = ((e->rb < bns->l_pac? e->rb<<1 : ((bns->l_pac<<1) - 1 - e->rb)<<1 | 1)<<1 | r) << 30 | e->score; + kv_push(uint64_t, v, key); } } ks_introsort_uint64_t(v.n, v.a); + y[0] = y[1] = y[2] = y[3] = -1; + printf("**** %ld\n", v.n); + for (i = 0; i < v.n; ++i) { + printf("%lld\t%c\t%lld\t%lld\n", v.a[i]>>32, "+-"[v.a[i]>>31&1], v.a[i]>>30&1, v.a[i]<<34>>34); + for (r = 0; r < 2; ++r) { + int dir = r<<1 | (v.a[i]>>31&1), which, k; + if (pes[dir].failed) continue; // invalid orientation + which = r<<1 | ((v.a[i]>>30&1)^1); + if (y[which] < 0) continue; // no previous hits + for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) + int dist; + double ns; + if ((v.a[k]>>30&3) != which) continue; + dist = (v.a[i]>>32) - (v.a[k]>>32); + printf("%d\t%d\t%d\n", r, which, dist); + if (dist > pes[dir].high) break; + if (dist < pes[dir].low) continue; + ns = (dist - pes[dir].avg) / pes[dir].std; + printf("%f\n", ns); + } + } + y[v.a[i]>>30&3] = i; + } free(v.a); } @@ -123,8 +146,10 @@ void mem_sam_pe(const 
mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c bwahit_t h[2]; str.l = str.m = 0; str.s = 0; mem_pair(opt, bns, pac, pes, s, a, h); + /* bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); s[0].sam = strdup(str.s); str.l = 0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); s[1].sam = str.s; + */ } diff --git a/ksort.h b/ksort.h index 52812e1..ad66a17 100644 --- a/ksort.h +++ b/ksort.h @@ -139,7 +139,7 @@ typedef struct { tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ } \ } \ - inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ { \ type_t *i, *j, swap_tmp; \ for (i = s + 1; i < t; ++i) \ From e5ab59db5327f628bf688952e1c16f4a4f038b4f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 09:50:28 -0500 Subject: [PATCH 075/169] Multiple changes: 1. Removed bwa.{h,c}. I am not going to finish them anyway. 2. Updated to the latest khash.h, which should be faster. 3. Define 64-bit vector and 128-bit integer/vector in utils.h. 
--- Makefile | 5 +- bwa.c | 272 ------------------------------------------------ bwa.h | 107 ------------------- bwamem_pair.c | 15 ++- bwape.c | 33 +++--- bwtsw2_pair.c | 4 +- khash.h | 282 ++++++++++++++++++++++++++++++++++---------------- utils.c | 79 +++++++------- utils.h | 13 ++- 9 files changed, 261 insertions(+), 549 deletions(-) delete mode 100644 bwa.c delete mode 100644 bwa.h diff --git a/Makefile b/Makefile index c97fbcf..6a3fc1e 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,9 @@ -CC= gcc -CXX= g++ +CC= clang CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ +LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ diff --git a/bwa.c b/bwa.c deleted file mode 100644 index 8e99f18..0000000 --- a/bwa.c +++ /dev/null @@ -1,272 +0,0 @@ -#include -#include -#include -#include -#include "bwa.h" -#include "bwt.h" -#include "bwtgap.h" -#include "bntseq.h" - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -extern unsigned char nst_nt4_table[256]; -extern void seq_reverse(int len, uint8_t *seq, int is_comp); - -bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 }; - -struct bwa_idx_t { - bwt_t *bwt; - bntseq_t *bns; - uint8_t *pac; -}; - -struct bwa_buf_t { - int max_buf; - bwa_pestat_t pes; - gap_stack_t *stack; - gap_opt_t *opt; - int *diff_tab; - uint8_t *buf; - int *logn; -}; - -bwa_idx_t *bwa_idx_load(const char *prefix) -{ - bwa_idx_t *p; - int l; - char *str; - l = strlen(prefix); - p = calloc(1, sizeof(bwa_idx_t)); - str = malloc(l + 10); - strcpy(str, prefix); - p->bns = bns_restore(str); - strcpy(str + l, ".bwt"); - p->bwt = bwt_restore_bwt(str); - 
str[l] = 0; - strcpy(str + l, ".sa"); - bwt_restore_sa(str, p->bwt); - free(str); - p->pac = calloc(p->bns->l_pac/4+1, 1); - fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac); - fclose(p->bns->fp_pac); - p->bns->fp_pac = 0; - return p; -} - -void bwa_idx_destroy(bwa_idx_t *p) -{ - bns_destroy(p->bns); - bwt_destroy(p->bwt); - free(p->pac); - free(p); -} - -bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score) -{ - extern gap_opt_t *gap_init_opt(void); - extern int bwa_cal_maxdiff(int l, double err, double thres); - int i; - bwa_buf_t *p; - p = malloc(sizeof(bwa_buf_t)); - p->stack = gap_init_stack2(max_score); - p->opt = gap_init_opt(); - p->opt->s_gapo = opt->s_gapo; - p->opt->s_gape = opt->s_gape; - p->opt->max_diff = opt->max_diff; - p->opt->max_gapo = opt->max_gapo; - p->opt->max_gape = opt->max_gape; - p->opt->seed_len = opt->seed_len; - p->opt->max_seed_diff = opt->max_seed_diff; - p->opt->fnr = opt->fnr; - p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int)); - for (i = 1; i < BWA_MAX_QUERY_LEN; ++i) - p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); - p->logn = calloc(256, sizeof(int)); - for (i = 1; i != 256; ++i) - p->logn[i] = (int)(4.343 * log(i) + 0.499); - return p; -} - -void bwa_buf_destroy(bwa_buf_t *p) -{ - gap_destroy_stack(p->stack); - free(p->diff_tab); free(p->logn); free(p->opt); - free(p); -} - -bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq) -{ - extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width); - int i, seq_len, buf_len; - bwt_width_t *w, *seed_w; - uint8_t *s; - gap_opt_t opt2 = *buf->opt; - bwa_sai_t sai; - - seq_len = strlen(seq); - // estimate the buffer length - buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; - if (buf_len > buf->max_buf) { - buf->max_buf = buf_len; - kroundup32(buf->max_buf); - buf->buf = realloc(buf->buf, buf->max_buf); - } - memset(buf->buf, 0, buf_len); - seed_w = (bwt_width_t*)buf->buf; - w = 
seed_w + buf->opt->seed_len; - s = (uint8_t*)(w + seq_len + 1); - if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len]; - // copy the sequence - for (i = 0; i < seq_len; ++i) - s[i] = nst_nt4_table[(int)seq[i]]; - seq_reverse(seq_len, s, 0); - // mapping - bwt_cal_width(idx->bwt, seq_len, s, w); - if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff; - if (seq_len > buf->opt->seed_len) - bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w); - for (i = 0; i < seq_len; ++i) // complement; I forgot why... - s[i] = s[i] > 3? 4 : 3 - s[i]; - sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack); - return sai; -} - -static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps) -{ - uint64_t x = pos, z; - int k, y = 0; - *n_mm = *n_gaps = 0; - for (k = 0; k < n_cigar; ++k) { - int l = cigar[k]>>4; - int op = cigar[k]&0xf; - if (op == 0) { // match/mismatch - for (z = 0; z < l && x + z < l_pac; ++z) { - int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; - if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm); - } - } - if (op == 1 || op == 2) (*n_gaps) += l; - if (op == 0 || op == 2) x += l; - if (op == 0 || op == 1 || op == 4) y += l; - } -} - -void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln) -{ - extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); - extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct); - int strand, seq_len, i, n_gap, n_mm; - uint64_t pos3, pac_pos; - uint8_t *s[2]; - - memset(aln, 0, sizeof(bwa_aln_t)); - seq_len = strlen(seq); - if (seq_len<<1 > buf->max_buf) { - buf->max_buf = seq_len<<1; - kroundup32(buf->max_buf); - buf->buf = 
realloc(buf->buf, buf->max_buf); - } - s[0] = buf->buf; - s[1] = s[0] + seq_len; - for (i = 0; i < seq_len; ++i) - s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]]; - seq_reverse(seq_len, s[1], 1); - pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); - if (strand) aln->flag |= 16; - if (n_gaps) { // only for gapped alignment - int n_cigar; - bwa_cigar_t *cigar16; - cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1); - aln->n_cigar = n_cigar; - aln->cigar = malloc(n_cigar * 4); - for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) { - int op = cigar16[i]>>14; - int len = cigar16[i]&0x3fff; - if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR - aln->cigar[i] = len<<4 | op; - if (op == 0 || op == 2) pos3 += len; - } - free(cigar16); - } else { // ungapped - aln->n_cigar = 1; - aln->cigar = malloc(4); - aln->cigar[0] = seq_len<<4 | 0; - pos3 = pac_pos + seq_len; - } - aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id); - aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset; - if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence - aln->flag |= 4; // read unmapped - compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap); - aln->n_mm = n_mm; - aln->n_gap = n_gap; -} - -/************************ - * Single-end alignment * - ************************/ - -bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar) -{ - bwa_one_t *one; - int best, cnt, i, seq_len; - - seq_len = strlen(seq); - one = calloc(1, sizeof(bwa_one_t)); - one->sai = bwa_sai(idx, buf, seq); - if (one->sai.n == 0) return one; - // count number of hits; randomly select one alignment - best = one->sai.sai[0].score; - for (i = cnt = 0; i < one->sai.n; ++i) { - bwa_sai1_t *p = &one->sai.sai[i]; - if (p->score > best) break; - if 
(drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { - one->which = p; - one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); - } - cnt += p->l - p->k + 1; - } - one->c1 = cnt; - for (; i < one->sai.n; ++i) - cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1; - one->c2 = cnt - one->c1; - // estimate single-end mapping quality - one->mapQs = -1; - if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible? - else if (one->c1 > 1) one->mapQs = 0; - else { - int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape; - if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25; - else if (one->c2 == 0) one->mapQs = 37; - } - if (one->mapQs < 0) { - cnt = (one->c2 >= 255)? 255 : one->c2; - one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt]; - } - one->mapQ = one->mapQs; - // compute CIGAR on request - one->one.ref_id = -1; - if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one); - return one; -} - -void bwa_one_destroy(bwa_one_t *one) -{ - free(one->sai.sai); - free(one->one.cigar); - free(one); -} - -/************************ - * Paired-end alignment * - ************************/ - -void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2]) -{ -} - -void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2]) -{ -} diff --git a/bwa.h b/bwa.h deleted file mode 100644 index e8172da..0000000 --- a/bwa.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef BWA_H_ -#define BWA_H_ - -#include - -#define BWA_DEF_MAX_SCORE 2048 -#define BWA_MAX_QUERY_LEN 1024 - -// BWA index -struct bwa_idx_t; -typedef struct bwa_idx_t bwa_idx_t; - -// Buffer for BWA alignment -struct bwa_buf_t; -typedef struct bwa_buf_t bwa_buf_t; - -// BWA alignment options -typedef struct { - int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3 - int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions - int seed_len, 
max_seed_diff; // seed length and max differences allowed in the seed - float fnr; // parameter for automatic length-adjusted max differences -} bwa_opt_t; - -// default BWA alignment options -extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 } - -// an interval hit in the SA coordinate; basic unit in .sai files -typedef struct { - uint32_t n_mm:16, n_gapo:8, n_gape:8; - int score; - uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits -} bwa_sai1_t; - -// all interval hits in the SA coordinate -typedef struct { - int n; // number of interval hits - bwa_sai1_t *sai; -} bwa_sai_t; - -// an alignment -typedef struct { - uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment - int32_t ref_id; // referece sequence index (the first seq is indexed by 0) - uint32_t offset; // coordinate on the reference; zero-based - uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag - uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations -} bwa_aln_t; - -typedef struct { - int mapQs, mapQ, c1, c2; - uint64_t sa; - bwa_sai1_t *which; - bwa_sai_t sai; - bwa_aln_t one; -} bwa_one_t; - -typedef struct { - double avg, std, ap_prior; - uint64_t low, high, high_bayesian; -} bwa_pestat_t; - -#ifdef __cplusplus -extern "C" { -#endif - - // load a BWA index - bwa_idx_t *bwa_idx_load(const char *prefix); - void bwa_idx_destroy(bwa_idx_t *p); - - // allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE - bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score); - void bwa_buf_destroy(bwa_buf_t *p); - - /** - * Find all the SA intervals - * - * @param idx BWA index; multiple threads can share the same index - * @param buf BWA alignment buffer; each thread should have its own buffer - * @param seq NULL terminated C string, consisting of A/C/G/T/N only - * - * @return SA intervals seq is matched to - */ - bwa_sai_t 
bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq); - - /** - * Construct an alignment in the base-pair coordinate - * - * @param idx BWA index - * @param buf BWA alignment buffer - * @param seq NULL terinated C string - * @param sa Suffix array value - * @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape - * - * @return An alignment - */ - void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln); - - bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar); - - void bwa_one_destroy(bwa_one_t *one); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/bwamem_pair.c b/bwamem_pair.c index 019f570..845051c 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -4,6 +4,7 @@ #include "kstring.h" #include "bwamem.h" #include "kvec.h" +#include "utils.h" #define MIN_RATIO 0.8 #define MIN_DIR_CNT 10 @@ -13,9 +14,6 @@ #define MAX_STDDEV 4.0 #define EXT_STDDEV 4.0 -typedef kvec_t(uint64_t) vec64_t; - -extern void ks_introsort_uint64_t(size_t n, uint64_t *a); void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) @@ -34,9 +32,8 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { - extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, d, max; - vec64_t isize[4]; + uint64_v isize[4]; memset(pes, 0, 4 * sizeof(mem_pestat_t)); memset(isize, 0, sizeof(kvec_t(int)) * 4); for (i = 0; i < n>>1; ++i) { @@ -58,14 +55,14 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, 
isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. mem_pestat_t *r = &pes[d]; - vec64_t *q = &isize[d]; + uint64_v *q = &isize[d]; int p25, p50, p75, x; if (q->n < MIN_DIR_CNT) { fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); r->failed = 1; continue; } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); - ks_introsort_uint64_t(q->n, q->a); + ks_introsort_64(q->n, q->a); p25 = q->a[(int)(.25 * q->n + .499)]; p50 = q->a[(int)(.50 * q->n + .499)]; p75 = q->a[(int)(.75 * q->n + .499)]; @@ -102,7 +99,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { - vec64_t v; + uint64_v v; int r, i, y[4]; // y[] keeps the last hit kv_init(v); for (r = 0; r < 2; ++r) { @@ -113,7 +110,7 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con kv_push(uint64_t, v, key); } } - ks_introsort_uint64_t(v.n, v.a); + ks_introsort_64(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; printf("**** %ld\n", v.n); for (i = 0; i < v.n; ++i) { diff --git a/bwape.c b/bwape.c index 779670f..644b5bd 100644 --- a/bwape.c +++ b/bwape.c @@ -21,24 +21,15 @@ typedef struct { bwtint_t low, high, high_bayesian; } isize_info_t; -typedef struct { - uint64_t x, y; -} b128_t; - -#define b128_lt(a, b) ((a).x < (b).x) #define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) #define b128_hash(a) ((uint32_t)(a).x) #include "khash.h" -KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq) - -#include "ksort.h" -KSORT_INIT(b128, b128_t, b128_lt) -KSORT_INIT_GENERIC(uint64_t) +KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq) typedef struct { - kvec_t(b128_t) 
arr; - kvec_t(b128_t) pos[2]; + pair64_v arr; + pair64_v pos[2]; kvec_t(bwt_aln1_t) aln[2]; } pe_data_t; @@ -120,7 +111,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double free(isizes); return -1; } - ks_introsort(uint64_t, tot, isizes); + ks_introsort_64(tot, isizes); p25 = isizes[(int)(tot*0.25 + 0.5)]; p50 = isizes[(int)(tot*0.50 + 0.5)]; p75 = isizes[(int)(tot*0.75 + 0.5)]; @@ -170,7 +161,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, { int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; uint64_t o_score, subo_score; - b128_t last_pos[2][2], o_pos[2]; + pair64_t last_pos[2][2], o_pos[2]; max_len = p[0]->full_len; if (max_len < p[1]->full_len) max_len = p[1]->full_len; if (low_bound < max_len) low_bound = max_len; @@ -206,11 +197,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, o_score = subo_score = (uint64_t)-1; o_n = subo_n = 0; - ks_introsort(b128, d->arr.n, d->arr.a); + ks_introsort_128(d->arr.n, d->arr.a); for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; if (opt->type == BWA_PET_STD) { for (i = 0; i < d->arr.n; ++i) { - b128_t x = d->arr.a[i]; + pair64_t x = d->arr.a[i]; int strand = x.y>>1&1; if (strand == 1) { // reverse strand, then check int y = 1 - (x.y&1); @@ -223,7 +214,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, } } else if (opt->type == BWA_PET_SOLID) { for (i = 0; i < d->arr.n; ++i) { - b128_t x = d->arr.a[i]; + pair64_t x = d->arr.a[i]; int strand = x.y>>1&1; if ((strand^x.y)&1) { // push int y = 1 - (x.y&1); @@ -345,7 +336,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) { // only when both ends mapped - b128_t x; + pair64_t x; int j, k; long 
long n_occ[2]; for (j = 0; j < 2; ++j) { @@ -360,7 +351,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw bwt_aln1_t *r = d->aln[j].a + k; bwtint_t l; if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table - b128_t key; + pair64_t key; int ret; key.x = r->k; key.y = r->l; khint_t iter = kh_put(b128, g_hash, key, &ret); @@ -377,14 +368,14 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw for (l = 0; l < kh_val(g_hash, iter).n; ++l) { x.x = kh_val(g_hash, iter).a[l]>>1; x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; - kv_push(b128_t, d->arr, x); + kv_push(pair64_t, d->arr, x); } } else { // then calculate on the fly for (l = r->k; l <= r->l; ++l) { int strand; x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); x.y = k<<2 | strand<<1 | j; - kv_push(b128_t, d->arr, x); + kv_push(pair64_t, d->arr, x); } } } diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 633641e..8a8287b 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -6,6 +6,7 @@ #include "bntseq.h" #include "bwtsw2.h" #include "kstring.h" +#include "utils.h" #ifndef _NO_SSE2 #include "ksw.h" #else @@ -24,7 +25,6 @@ typedef struct { bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) { - extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, k, x, p25, p50, p75, tmp, max_len = 0; uint64_t *isize; bsw2pestat_t r; @@ -44,7 +44,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) max_len = max_len > t[1]->end - t[1]->beg? 
max_len : t[1]->end - t[1]->beg; isize[k++] = l; } - ks_introsort_uint64_t(k, isize); + ks_introsort_64(k, isize); p25 = isize[(int)(.25 * k + .499)]; p50 = isize[(int)(.50 * k + .499)]; p75 = isize[(int)(.75 * k + .499)]; diff --git a/khash.h b/khash.h index de6be6d..2422044 100644 --- a/khash.h +++ b/khash.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2009 by attractor + Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -33,7 +33,6 @@ int main() { khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); - if (!ret) kh_del(32, h, k); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); @@ -47,6 +46,29 @@ int main() { */ /* + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + 2009-09-26 (0.2.4): * Improve portability @@ -86,11 +108,9 @@ int main() { @header Generic hash table library. 
- - @copyright Heng Li */ -#define AC_VERSION_KHASH_H "0.2.4" +#define AC_VERSION_KHASH_H "0.2.6" #include #include @@ -111,24 +131,14 @@ typedef unsigned long long khint64_t; #endif #ifdef _MSC_VER -#define inline __inline +#define kh_inline __inline +#else +#define kh_inline inline #endif typedef khint32_t khint_t; typedef khint_t khiter_t; -#define __ac_HASH_PRIME_SIZE 32 -static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = -{ - 0ul, 3ul, 11ul, 23ul, 53ul, - 97ul, 193ul, 389ul, 769ul, 1543ul, - 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, - 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, - 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, - 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, - 3221225473ul, 4294967291ul -}; - #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) @@ -137,88 +147,128 @@ static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 +#else +#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) +#endif + +#define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + static const double __ac_HASH_UPPER = 0.77; -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - static inline kh_##name##_t *kh_init_##name() { \ - return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ } \ - static inline void kh_destroy_##name(kh_##name##_t *h) \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ - free(h->keys); free(h->flags); \ - free(h->vals); \ - free(h); \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ } \ } \ - static inline void 
kh_clear_##name(kh_##name##_t *h) \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ - memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ - static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ - khint_t inc, k, i, last; \ - k = __hash_func(key); i = k % h->n_buckets; \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ + khint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ + i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
*/ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ - khint_t t = __ac_HASH_PRIME_SIZE - 1; \ - while (__ac_prime_list[t] > new_n_buckets) --t; \ - new_n_buckets = __ac_prime_list[t+1]; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ - else { \ - new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) return -1; \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) return -1; \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ } \ } \ - if (j) { \ + if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ - while (1) { \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t inc, k, i; \ k = __hash_func(key); \ - i = k % new_n_buckets; \ - inc = 1 + k % 
(new_n_buckets - 1); \ - while (!__ac_isempty(new_flags, i)) { \ - if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ - else i += inc; \ - } \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); \ - } else { \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ @@ -226,35 +276,39 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (h->n_buckets > new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - free(h->flags); \ + kfree(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ + return 0; \ } \ - static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ - if (h->n_occupied >= h->upper_bound) { \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ - else kh_resize_##name(h, h->n_buckets + 1); \ - 
} \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ - khint_t inc, k, i, site, last; \ - x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ - if (__ac_isempty(h->flags, i)) x = i; \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ + inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ + i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ @@ -263,20 +317,20 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (__ac_isempty(h->flags, x)) { \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ *ret = 2; \ - } else *ret = 0; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ - static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ 
__ac_set_isdel_true(h->flags, x); \ @@ -284,6 +338,17 @@ static const double __ac_HASH_UPPER = 0.77; } \ } +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @@ -311,10 +376,10 @@ static const double __ac_HASH_UPPER = 0.77; @param s Pointer to a null terminated string @return The hash value */ -static inline khint_t __ac_X31_hash_string(const char *s) +static kh_inline khint_t __ac_X31_hash_string(const char *s) { - khint_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } /*! @function @@ -328,9 +393,21 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) + /* --- END OF HASH FUNCTIONS --- */ -/* Other necessary macros... */ +/* Other convenient macros... */ /*! @abstract Type of the hash table. @@ -396,7 +473,6 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_del(name, h, k) kh_del_##name(h, k) - /*! @function @abstract Test whether a bucket contains data. 
@param h Pointer to the hash table [khash_t(name)*] @@ -455,6 +531,34 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_n_buckets(h) ((h)->n_buckets) +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + /* More conenient interfaces */ /*! 
@function diff --git a/utils.c b/utils.c index 8c1ad7e..41594c3 100644 --- a/utils.c +++ b/utils.c @@ -35,6 +35,12 @@ #include #include "utils.h" +#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) + +#include "ksort.h" +KSORT_INIT(128, pair64_t, pair64_lt) +KSORT_INIT(64, uint64_t, ks_lt_generic) + FILE *err_xopen_core(const char *func, const char *fn, const char *mode) { FILE *fp = 0; @@ -46,6 +52,7 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode) } return fp; } + FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) { if (freopen(fn, mode, fp) == 0) { @@ -56,6 +63,7 @@ FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE } return fp; } + gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) { gzFile fp; @@ -67,6 +75,7 @@ gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) } return fp; } + void err_fatal(const char *header, const char *fmt, ...) { va_list args; @@ -86,66 +95,48 @@ void err_fatal_simple_core(const char *func, const char *msg) size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { - size_t ret = fwrite(ptr, size, nmemb, stream); - if (ret != nmemb) - { - err_fatal_simple_core("fwrite", strerror(errno)); - } - return ret; + size_t ret = fwrite(ptr, size, nmemb, stream); + if (ret != nmemb) + err_fatal_simple_core("fwrite", strerror(errno)); + return ret; } int err_printf(const char *format, ...) 
{ - va_list arg; - int done; - - va_start(arg, format); - done = vfprintf(stdout, format, arg); - int saveErrno = errno; - va_end(arg); - - if (done < 0) - { - err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno)); - } - return done; + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stdout, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno)); + return done; } int err_fprintf(FILE *stream, const char *format, ...) { - va_list arg; - int done; - - va_start(arg, format); - done = vfprintf(stream, format, arg); - int saveErrno = errno; - va_end(arg); - - if (done < 0) - { - err_fatal_simple_core("vfprintf", strerror(saveErrno)); - } - return done; + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stream, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) err_fatal_simple_core("vfprintf", strerror(saveErrno)); + return done; } int err_fflush(FILE *stream) { - int ret = fflush(stream); - if (ret != 0) - { - err_fatal_simple_core("fflush", strerror(errno)); - } - return ret; + int ret = fflush(stream); + if (ret != 0) err_fatal_simple_core("fflush", strerror(errno)); + return ret; } int err_fclose(FILE *stream) { - int ret = fclose(stream); - if (ret != 0) - { - err_fatal_simple_core("fclose", strerror(errno)); - } - return ret; + int ret = fclose(stream); + if (ret != 0) err_fatal_simple_core("fclose", strerror(errno)); + return ret; } double cputime() diff --git a/utils.h b/utils.h index b6839e9..5abab41 100644 --- a/utils.h +++ b/utils.h @@ -28,6 +28,7 @@ #ifndef LH3_UTILS_H #define LH3_UTILS_H +#include #include #include @@ -38,14 +39,19 @@ #define ATTRIBUTE(list) #endif - - #define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg) #define xopen(fn, mode) err_xopen_core(__func__, fn, mode) #define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) #define xzopen(fn, mode) err_xzopen_core(__func__, fn, 
mode) #define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg) +typedef struct { + uint64_t x, y; +} pair64_t; + +typedef struct { size_t n, m; uint64_t *a; } uint64_v; +typedef struct { size_t n, m; pair64_t *a; } pair64_v; + #ifdef __cplusplus extern "C" { #endif @@ -66,6 +72,9 @@ extern "C" { double cputime(); double realtime(); + void ks_introsort_64 (size_t n, uint64_t *a); + void ks_introsort_128(size_t n, pair64_t *a); + #ifdef __cplusplus } #endif From 6ad5a3c086c82139f6f30f191fbfcc3cae2ec542 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 10:21:17 -0500 Subject: [PATCH 076/169] removed color-space support which has been broken since 0.6.x --- Makefile | 3 +- bwape.c | 38 ++--------- bwase.c | 45 ++----------- bwase.h | 2 +- bwtaln.c | 4 +- bwtaln.h | 1 - bwtindex.c | 24 +------ cs2nt.c | 191 ---------------------------------------------------- main.c | 3 - main.h | 2 - simple_dp.c | 162 -------------------------------------------- 11 files changed, 17 insertions(+), 458 deletions(-) delete mode 100644 cs2nt.c delete mode 100644 simple_dp.c diff --git a/Makefile b/Makefile index 6a3fc1e..8cf767a 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,7 @@ DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ - is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ - bwape.o cs2nt.o \ + is.o bwtmisc.o bwtindex.o ksw.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa diff --git a/bwape.c b/bwape.c index 644b5bd..4201cf2 100644 --- a/bwape.c +++ b/bwape.c @@ -212,19 +212,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, last_pos[x.y&1][1] = x; } } - } else if (opt->type == BWA_PET_SOLID) { - for (i = 0; i < d->arr.n; ++i) { - pair64_t x = d->arr.a[i]; - int strand = 
x.y>>1&1; - if ((strand^x.y)&1) { // push - int y = 1 - (x.y&1); - __pairing_aux(last_pos[y][1], x); - __pairing_aux(last_pos[y][0], x); - } else { // check - last_pos[x.y&1][0] = last_pos[x.y&1][1]; - last_pos[x.y&1][1] = x; - } - } } else { fprintf(stderr, "[paring] not implemented yet!\n"); exit(1); @@ -567,11 +554,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, ++n_tot[is_singleton]; cigar[0] = cigar[1] = 0; n_cigar[0] = n_cigar[1] = 0; - if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered + if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified ubyte_t *seq; if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip - if (popt->type == BWA_PET_STD) { + { // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate __set_rght_coor(beg[k], end[k], p[1-k], p[k]); seq = p[k]->rseq; @@ -580,17 +567,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, seq = p[k]->seq; seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly } - } else { // BWA_PET_SOLID - if (p[1-k]->strand == 0) { // R3-F3 pairing - if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->rseq; - seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed - } else { // F3-R3 pairing - if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->seq; - } } // perform SW alignment cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, 
&beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); @@ -654,7 +630,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f bwa_seq_t *seqs[2]; bwa_seqio_t *ks[2]; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa[2]; gap_opt_t opt, opt0; khint_t iter; @@ -679,10 +655,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f opt0 = opt; fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); - if (!(opt.mode & BWA_MODE_COMPREAD)) { - popt->type = BWA_PET_SOLID; - ntbns = bwa_open_nt(prefix); - } else { // for Illumina alignment only + { // for Illumina alignment only if (popt->is_preload) { strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); @@ -715,7 +688,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... 
"); for (j = 0; j < 2; ++j) - bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); if (pac == 0) free(pacseq); @@ -740,7 +713,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // destroy bns_destroy(bns); - if (ntbns) bns_destroy(ntbns); for (i = 0; i < 2; ++i) { bwa_seq_close(ks[i]); fclose(fp_sa[i]); diff --git a/bwase.c b/bwase.c index 35744e7..8fa79ac 100644 --- a/bwase.c +++ b/bwase.c @@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s) s->len = s->full_len; } -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns) +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) { - ubyte_t *pacseq, *ntpac = 0; + ubyte_t *pacseq; int i, j; kstring_t *str; - if (ntbns) { // in color space - ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1); - rewind(ntbns->fp_pac); - fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac); - } - if (!_pacseq) { pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); rewind(bns->fp_pac); @@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); } -#if 0 - if (ntbns) { // in color space - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - bwa_cs2nt_core(s, bns->l_pac, ntpac); - for (j = 0; j < s->n_multi; ++j) { - bwt_multi1_t *q = s->multi + j; - int n_cigar; - if (q->gap == 0) continue; - free(q->cigar); - q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand? 
1 : -1) * q->gap, &n_cigar, 0); - q->n_cigar = n_cigar; - } - if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again - free(s->cigar); - s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); - } - } - } -#endif // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { @@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t if (s->type != BWA_TYPE_NO_MATCH) { int nm; s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, - bns->l_pac, ntbns? ntpac : pacseq, str, &nm); + bns->l_pac, pacseq, str, &nm); s->nm = nm; } } free(str->s); free(str); // correct for trimmed reads - if (!ntbns) // trimming is only enabled for Illumina reads - for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); + for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); if (!_pacseq) free(pacseq); - free(ntpac); } int64_t pos_end(const bwa_seq_t *p) @@ -587,7 +557,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa; gap_opt_t opt; @@ -599,8 +569,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f m_aln = 0; fread(&opt, sizeof(gap_opt_t), 1, fp_sa); - if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac - ntbns = bwa_open_nt(prefix); bwa_print_sam_SQ(bns); //bwa_print_sam_PG(); // set ks @@ -628,7 +596,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] refine gapped alignments... 
"); - bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] print alignments... "); @@ -642,7 +610,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f // destroy bwa_seq_close(ks); - if (ntbns) bns_destroy(ntbns); bns_destroy(bns); fclose(fp_sa); free(aln); diff --git a/bwase.h b/bwase.h index f8e9b0a..26a9f68 100644 --- a/bwase.h +++ b/bwase.h @@ -14,7 +14,7 @@ extern "C" { // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); // Refine the approximate position of the sequence to an actual placement for the sequence. - void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); + void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); // Backfill certain alignment properties mainly centering around number of matches. void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); // Calculate the end position of a read given a certain sequence. 
diff --git a/bwtaln.c b/bwtaln.c index efc7f66..84be510 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -252,7 +252,7 @@ int bwa_aln(int argc, char *argv[]) char *prefix; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -272,7 +272,6 @@ int bwa_aln(int argc, char *argv[]) case 'L': opt->mode |= BWA_MODE_LOGGAP; break; case 'R': opt->max_top2 = atoi(optarg); break; case 'q': opt->trim_qual = atoi(optarg); break; - case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; case 'f': xreopen(optarg, "wb", stdout); break; case 'b': opt->mode |= BWA_MODE_BAM; break; @@ -310,7 +309,6 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); fprintf(stderr, " -B INT length of barcode\n"); -// fprintf(stderr, " -c input sequences are in the color space\n"); fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); diff --git a/bwtaln.h b/bwtaln.h index 39eaf4b..412cc04 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -107,7 +107,6 @@ typedef struct { } gap_opt_t; #define BWA_PET_STD 1 -#define BWA_PET_SOLID 2 typedef struct { int max_isize, force_isize; diff --git a/bwtindex.c b/bwtindex.c index 938e982..c01fa95 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -42,11 +42,11 @@ void bwa_pac_rev_core(const char *fn, const char *fn_rev); int bwa_index(int argc, char *argv[]) { char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 0, is_color = 0, 
is_64 = 0; + int c, algo_type = 0, is_64 = 0; clock_t t; int64_t l_pac; - while ((c = getopt(argc, argv, "6ca:p:")) >= 0) { + while ((c = getopt(argc, argv, "6a:p:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; @@ -55,7 +55,6 @@ int bwa_index(int argc, char *argv[]) else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; case 'p': prefix = strdup(optarg); break; - case 'c': is_color = 1; break; case '6': is_64 = 1; break; default: return 1; } @@ -67,7 +66,6 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); -// fprintf(stderr, " -c build color-space index\n"); fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n"); @@ -83,29 +81,13 @@ int bwa_index(int argc, char *argv[]) str2 = (char*)calloc(strlen(prefix) + 10, 1); str3 = (char*)calloc(strlen(prefix) + 10, 1); - if (is_color == 0) { // nucleotide indexing + { // nucleotide indexing gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); - } else { // color indexing - gzFile fp = xzopen(argv[optind], "r"); - strcat(strcpy(str, prefix), ".nt"); - t = clock(); - fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... 
"); - l_pac = bns_fasta2bntseq(fp, str, 0); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - gzclose(fp); - { - char *tmp_argv[3]; - tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix; - t = clock(); - fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... "); - bwa_pac2cspac(3, tmp_argv); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { diff --git a/cs2nt.c b/cs2nt.c deleted file mode 100644 index dfbce60..0000000 --- a/cs2nt.c +++ /dev/null @@ -1,191 +0,0 @@ -#include -#include -#include -#include "bwtaln.h" -#include "stdaln.h" - -/* - Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we - decode as ATTGAC(RBGOG), there are one color change and one nt change; - if we decode as ATTAAC(RBRBG), there are two color changes. - - In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM - as the penalty; otherwise, we will use color quality as the - penalty. This means we always prefer two consistent color changes over - a nt change, but if a color has high quality, we may prefer one nt - change. - - In the above example, the penalties of the two types of decoding are - q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first; - otherwise the second. Note that no matter what we choose, the fourth - base will get a low nt quality. 
- */ - -#define COLOR_MM 19 -#define NUCL_MM 25 - -static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 }; - -/* - {A,C,G,T,N} -> {0,1,2,3,4} - nt_ref[0..size]: nucleotide reference: 0/1/2/3/4 - cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N - nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned) - btarray[0..4*size]: backtrack array (working space) - */ -void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray) -{ - int h[8], curr, last; - int x, y, xmin, hmin, k; - - // h[0..3] and h[4..7] are the current and last best score array, depending on curr and last - - // recursion: initial value - if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2); - else { - for (x = 0; x != 4; ++x) h[x] = NUCL_MM; - h[nt_ref[0]] = 0; - } - // recursion: main loop - curr = 1; last = 0; - for (k = 1; k <= size; ++k) { - for (x = 0; x != 4; ++x) { - int min = 0x7fffffff, ymin = 0; - for (y = 0; y != 4; ++y) { - int s = h[last<<2|y]; - if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<= 0; --k) - nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]]; -} -/* - nt_read[0..size]: nucleotide read sequence: 0/1/2/3 - cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N - tarray[0..size*2-1]: temporary array - */ -uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray) -{ - int k, c1, c2; - uint8_t *t2array = tarray + size; - // get the color sequence of nt_read - c1 = nt_read[0]; - for (k = 1; k <= size; ++k) { - c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case - tarray[k-1] = (c1 >= 4 || c2 >= 4)? 
4 : nst_ntnt2cs_table[1<>6 && tarray[k] == cs_read[k]>>6) { - q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10; - } else if (tarray[k-1] == cs_read[k-1]>>6) { - q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f); - } else if (tarray[k] == cs_read[k]>>6) { - q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f); - } // else, q = 0 - if (q < 0) q = 0; - if (q > 60) q = 60; - t2array[k] = nt_read[k]<<6 | q; - if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0; - } - return t2array + 1; // of size-2 -} - -// this function will be called when p->seq has been reversed by refine_gapped() -void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac) -{ - uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read; - int i, len; - uint8_t *seq; - - // set temporary arrays - if (p->type == BWA_TYPE_NO_MATCH) return; - len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space - ta = (uint8_t*)malloc(len * 7); - nt_ref = ta; - cs_read = nt_ref + len; - nt_read = cs_read + len; - btarray = nt_read + len; - tarray = nt_read + len; - -#define __gen_csbase(_cs, _i, _seq) do { \ - int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \ - if (q > 60) q = 60; \ - if (_seq[_i] > 3) q = 63; \ - (_cs) = _seq[_i]<<6 | q; \ - } while (0) - - // generate len, nt_ref[] and cs_read - seq = p->strand? p->rseq : p->seq; - nt_ref[0] = p->pos? 
bns_pac(pac, p->pos-1) : 4; - if (p->cigar == 0) { // no gap or clipping - len = p->len; - for (i = 0; i < p->len; ++i) { - __gen_csbase(cs_read[i], i, seq); - nt_ref[i+1] = bns_pac(pac, p->pos + i); - } - } else { - int k, z; - bwtint_t x, y; - x = p->pos; y = 0; - for (k = z = 0; k < p->n_cigar; ++k) { - int l = __cigar_len(p->cigar[k]); - if (__cigar_op(p->cigar[k]) == FROM_M) { - for (i = 0; i < l; ++i, ++x, ++y) { - __gen_csbase(cs_read[z], y, seq); - nt_ref[z+1] = bns_pac(pac, x); - ++z; - } - } else if (__cigar_op(p->cigar[k]) == FROM_I) { - for (i = 0; i < l; ++i, ++y) { - __gen_csbase(cs_read[z], y, seq); - nt_ref[z+1] = 4; - ++z; - } - } else if (__cigar_op(p->cigar[k]) == FROM_S) y += l; - else x += l; - } - len = z; - } - - cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray); - new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray); - - // update p - p->len = p->full_len = len - 1; - for (i = 0; i < p->len; ++i) { - if ((new_nt_read[i]&0x3f) == 63) { - p->qual[i] = 33; seq[i] = 4; - } else { - p->qual[i] = (new_nt_read[i]&0x3f) + 33; - seq[i] = new_nt_read[i]>>6; - } - } - p->qual[p->len] = seq[p->len] = 0; - if (p->strand) { - memcpy(p->seq, seq, p->len); - seq_reverse(p->len, p->seq, 1); - seq_reverse(p->len, p->qual, 0); - } else { - memcpy(p->rseq, seq, p->len); - seq_reverse(p->len, p->rseq, 1); - } - free(ta); -} diff --git a/main.c b/main.c index 2718732..fc63c2e 100644 --- a/main.c +++ b/main.c @@ -28,7 +28,6 @@ static int usage() fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); - fprintf(stderr, " stdsw standard SW/NW alignment\n"); fprintf(stderr, "\n"); return 1; } @@ -51,11 +50,9 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) ret = 
bwa_aln(argc-1, argv+1); - else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1); - else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1); else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); diff --git a/main.h b/main.h index 1a0292a..7b638ca 100644 --- a/main.h +++ b/main.h @@ -17,8 +17,6 @@ extern "C" { int bwa_sai2sam_se(int argc, char *argv[]); int bwa_sai2sam_pe(int argc, char *argv[]); - int bwa_stdsw(int argc, char *argv[]); - int bwa_bwtsw2(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]); diff --git a/simple_dp.c b/simple_dp.c deleted file mode 100644 index d2b4b71..0000000 --- a/simple_dp.c +++ /dev/null @@ -1,162 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "stdaln.h" -#include "utils.h" - -#include "kseq.h" -KSEQ_DECLARE(gzFile) - -typedef struct { - int l; - unsigned char *s; - char *n; -} seq1_t; - -typedef struct { - int n_seqs, m_seqs; - seq1_t *seqs; -} seqs_t; - -unsigned char aln_rev_table[256] = { - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', - 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', - 'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N', - 'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N', - 
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N' -}; - -static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0; -static AlnParam g_aln_param; - -static void revseq(int len, uint8_t *seq) -{ - int i; - for (i = 0; i < len>>1; ++i) { - uint8_t tmp = aln_rev_table[seq[len-1-i]]; - seq[len-1-i] = aln_rev_table[seq[i]]; - seq[i] = tmp; - } - if (len&1) seq[i] = aln_rev_table[seq[i]]; -} - -static seqs_t *load_seqs(const char *fn) -{ - seqs_t *s; - seq1_t *p; - gzFile fp; - int l; - kseq_t *seq; - - fp = xzopen(fn, "r"); - seq = kseq_init(fp); - s = (seqs_t*)calloc(1, sizeof(seqs_t)); - s->m_seqs = 256; - s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t)); - while ((l = kseq_read(seq)) >= 0) { - if (s->n_seqs == s->m_seqs) { - s->m_seqs <<= 1; - s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t)); - } - p = s->seqs + (s->n_seqs++); - p->l = seq->seq.l; - p->s = (unsigned char*)malloc(p->l + 1); - memcpy(p->s, seq->seq.s, p->l); - p->s[p->l] = 0; - p->n = strdup((const char*)seq->name.s); - } - kseq_destroy(seq); - gzclose(fp); - fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); - return s; -} - -static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand) -{ - int i; - for (i = 0; i < ss->n_seqs; ++i) { - AlnAln *aa; - seq1_t *p = ss->seqs + i; - g_aln_param.band_width = l + p->l; - aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l); - if (aa->score >= g_thres || 
g_is_global) { - printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand, - aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo); - // NB: I put the short sequence as the first sequence in SW, an insertion to - // the reference becomes a deletion from the short sequence. Therefore, I use - // "MDI" here rather than "MID", and print ->out2 first rather than ->out1. - for (i = 0; i != aa->n_cigar; ++i) - printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1); - } - aln_free_AlnAln(aa); - } -} - -static void aln_seqs(const seqs_t *ss, const char *fn) -{ - gzFile fp; - kseq_t *seq; - int l; - - fp = xzopen(fn, "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+'); - if (g_strand&2) { - revseq(l, (uint8_t*)seq->seq.s); - aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-'); - } - } - kseq_destroy(seq); - gzclose(fp); -} - -int bwa_stdsw(int argc, char *argv[]) -{ - int c; - seqs_t *ss; - - while ((c = getopt(argc, argv, "gT:frp")) >= 0) { - switch (c) { - case 'g': g_is_global = 1; break; - case 'T': g_thres = atoi(optarg); break; - case 'f': g_strand |= 1; break; - case 'r': g_strand |= 2; break; - case 'p': g_aa = 1; break; - } - } - if (g_strand == 0) g_strand = 3; - if (g_aa) g_strand = 1; - if (optind + 1 >= argc) { - fprintf(stderr, "\nUsage: bwa stdsw [options] \n\n"); - fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres); - fprintf(stderr, " -p protein alignment (suppressing -r)\n"); - fprintf(stderr, " -f forward strand only\n"); - fprintf(stderr, " -r reverse strand only\n"); - fprintf(stderr, " -g global alignment\n\n"); - fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n"); - fprintf(stderr, " sequences and ONE long sequence. 
It outputs the suboptimal score on the long\n"); - fprintf(stderr, " sequence.\n\n"); - return 1; - } - g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast; - g_aln_param.gap_end = 0; - ss = load_seqs(argv[optind]); - aln_seqs(ss, argv[optind+1]); - return 0; -} From 95d18449b385d75bd8e5ffebfbdb39bfeb526e8e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 10:36:15 -0500 Subject: [PATCH 077/169] merge bseq.{h,c} to utils.{h,c} I do not like many small files. --- Makefile | 2 +- bseq.c | 55 ---------------------------------------------- bseq.h | 11 ---------- bwamem.c | 1 + bwamem.h | 7 +++--- bwtsw2_aux.c | 1 - fastmap.c | 2 +- utils.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++-- utils.h | 7 ++++++ 9 files changed, 73 insertions(+), 75 deletions(-) delete mode 100644 bseq.c delete mode 100644 bseq.h diff --git a/Makefile b/Makefile index 8cf767a..334616c 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ - bseq.o bwaseqio.o bwase.o kstring.o + bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ diff --git a/bseq.c b/bseq.c deleted file mode 100644 index d20b983..0000000 --- a/bseq.c +++ /dev/null @@ -1,55 +0,0 @@ -#include -#include -#include -#include -#include "bseq.h" -#include "kseq.h" -KSEQ_INIT2(, gzFile, gzread) - -static inline void trim_readno(kstring_t *s) -{ - if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) - s->l -= 2, s->s[s->l] = 0; -} - -static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) -{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = strdup(ks->name.s); - s->comment = ks->comment.l? 
strdup(s->comment) : 0; - s->seq = strdup(ks->seq.s); - s->qual = ks->qual.l? strdup(ks->qual.s) : 0; - s->l_seq = strlen(s->seq); -} - -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) -{ - kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; - int size = 0, m, n; - bseq1_t *seqs; - m = n = 0; seqs = 0; - while (kseq_read(ks) >= 0) { - if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads - fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); - break; - } - if (n >= m) { - m = m? m<<1 : 256; - seqs = realloc(seqs, m * sizeof(bseq1_t)); - } - trim_readno(&ks->name); - kseq2bseq1(ks, &seqs[n]); - size += seqs[n++].l_seq; - if (ks2) { - trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n]); - size += seqs[n++].l_seq; - } - if (size >= chunk_size) break; - } - if (size == 0) { // test if the 2nd file is finished - if (ks2 && kseq_read(ks2) >= 0) - fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); - } - *n_ = n; - return seqs; -} diff --git a/bseq.h b/bseq.h deleted file mode 100644 index 978312a..0000000 --- a/bseq.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef BATCHSEQ_H_ -#define BATCHSEQ_H_ - -typedef struct { - int l_seq; - char *name, *comment, *seq, *qual, *sam; -} bseq1_t; - -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - -#endif diff --git a/bwamem.c b/bwamem.c index 7557af6..8d0494d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -10,6 +10,7 @@ #include "bwamem.h" #include "bntseq.h" #include "ksw.h" +#include "kvec.h" #include "ksort.h" #define MAPQ_COEF 40. 
diff --git a/bwamem.h b/bwamem.h index b95c96d..4e2e5ce 100644 --- a/bwamem.h +++ b/bwamem.h @@ -3,8 +3,7 @@ #include "bwt.h" #include "bntseq.h" -#include "bseq.h" -#include "kvec.h" +#include "utils.h" struct __smem_i; typedef struct __smem_i smem_i; @@ -51,8 +50,8 @@ typedef struct { int64_t mb, me; // mb: mate start; -1 if single-end; -2 if mate unmapped } bwahit_t; -typedef kvec_t(mem_chain_t) mem_chain_v; -typedef kvec_t(mem_alnreg_t) mem_alnreg_v; +typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; +typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; extern int mem_verbose; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index a18ffc8..55c7c64 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -13,7 +13,6 @@ #include "bwtsw2.h" #include "stdaln.h" #include "kstring.h" -#include "bseq.h" #include "kseq.h" KSEQ_DECLARE(gzFile) diff --git a/fastmap.c b/fastmap.c index 56674f9..f2677eb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -6,7 +6,7 @@ #include "bwt.h" #include "bwamem.h" #include "kvec.h" -#include "bseq.h" +#include "utils.h" #include "kseq.h" KSEQ_DECLARE(gzFile) diff --git a/utils.c b/utils.c index 41594c3..127c8fe 100644 --- a/utils.c +++ b/utils.c @@ -35,9 +35,8 @@ #include #include "utils.h" -#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) - #include "ksort.h" +#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) KSORT_INIT(128, pair64_t, pair64_lt) KSORT_INIT(64, uint64_t, ks_lt_generic) @@ -139,6 +138,10 @@ int err_fclose(FILE *stream) return ret; } +/********* + * Timer * + *********/ + double cputime() { struct rusage r; @@ -153,3 +156,58 @@ double realtime() gettimeofday(&tp, &tzp); return tp.tv_sec + tp.tv_usec * 1e-6; } + +/************************ + * Batch FASTA/Q reader * + ************************/ + +#include "kseq.h" +KSEQ_INIT2(, gzFile, gzread) + +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, 
s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(s->comment) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? strdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n]); + size += seqs[n++].l_seq; + } + if (size >= chunk_size) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; + return seqs; +} diff --git a/utils.h b/utils.h index 5abab41..6c065c1 100644 --- a/utils.h +++ b/utils.h @@ -52,6 +52,11 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; +typedef struct { + int l_seq; + char *name, *comment, *seq, *qual, *sam; +} bseq1_t; + #ifdef __cplusplus extern "C" { #endif @@ -75,6 +80,8 @@ extern "C" { void ks_introsort_64 (size_t n, uint64_t *a); void ks_introsort_128(size_t n, pair64_t *a); + bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + #ifdef __cplusplus } #endif From cfdc938fc316e0e4f9764636f87a87f349513549 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 10:39:16 -0500 Subject: [PATCH 078/169] to 
exclude "test64" --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 16d123a..57cb318 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.[oa] bwa test +test64 .*.swp From 2fc469d0c9b459c28caf6618520eff948e886a58 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 12:09:36 -0500 Subject: [PATCH 079/169] code backup --- Makefile | 2 +- bwamem.c | 18 +++++++++++------- bwamem_pair.c | 42 +++++++++++++++++++++++++----------------- bwape.c | 13 ------------- utils.h | 13 +++++++++++++ 5 files changed, 50 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 334616c..2c060e9 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ SUBDIRS= . all:$(PROG) bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) diff --git a/bwamem.c b/bwamem.c index 8d0494d..b44e54e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -587,6 +587,15 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) return mapq; } +void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) +{ + h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe; + h->score = a->score; + h->sub = a->sub > a->csub? 
a->sub : a->csub; + h->qual = h->flag = 0; // these are unset + h->mb = h->me = -2; // mate positions are unset +} + void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { int k; @@ -596,13 +605,8 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (a->n > 0) { for (k = 0; k < a->n; ++k) { bwahit_t h; - mem_alnreg_t *p = &a->a[k]; - h.rb = p->rb; h.re = p->re; - h.qb = p->qb; h.qe = p->qe; - h.score = p->score; h.sub = p->sub; - h.flag = 0; - h.qual = approx_mapq_se(opt, p); - h.mb = h.me = -2; + mem_alnreg2hit(&a->a[k], &h); + h.qual = approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->is_hard); diff --git a/bwamem_pair.c b/bwamem_pair.c index 845051c..6b44f78 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -99,40 +99,48 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { - uint64_v v; + pair64_v v; + pair64_t o, subo; // score<<32 | raw_score<<8 | hash int r, i, y[4]; // y[] keeps the last hit kv_init(v); - for (r = 0; r < 2; ++r) { + for (r = 0; r < 2; ++r) { // loop through read number for (i = 0; i < a[r].n; ++i) { - uint64_t key; + pair64_t key; mem_alnreg_t *e = &a[r].a[i]; - key = ((e->rb < bns->l_pac? e->rb<<1 : ((bns->l_pac<<1) - 1 - e->rb)<<1 | 1)<<1 | r) << 30 | e->score; - kv_push(uint64_t, v, key); + key.x = e->rb < bns->l_pac? 
e->rb : (bns->l_pac<<1) - 1 - e->rb; // forward position + key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= bns->l_pac)<<1 | r; + kv_push(pair64_t, v, key); } } - ks_introsort_64(v.n, v.a); + ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - printf("**** %ld\n", v.n); + o.x = o.y = subo.x = subo.y = 0; for (i = 0; i < v.n; ++i) { - printf("%lld\t%c\t%lld\t%lld\n", v.a[i]>>32, "+-"[v.a[i]>>31&1], v.a[i]>>30&1, v.a[i]<<34>>34); - for (r = 0; r < 2; ++r) { - int dir = r<<1 | (v.a[i]>>31&1), which, k; + for (r = 0; r < 2; ++r) { // loop through direction + int dir = r<<1 | (v.a[i].y>>1&1), which, k; if (pes[dir].failed) continue; // invalid orientation - which = r<<1 | ((v.a[i]>>30&1)^1); + which = r<<1 | ((v.a[i].y&1)^1); if (y[which] < 0) continue; // no previous hits for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) - int dist; + int64_t dist; + int raw_score, score; double ns; - if ((v.a[k]>>30&3) != which) continue; - dist = (v.a[i]>>32) - (v.a[k]>>32); - printf("%d\t%d\t%d\n", r, which, dist); + uint64_t x, pair; + if ((v.a[k].y&3) != which) continue; + dist = (int64_t)v.a[i].x - v.a[k].x; if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; + raw_score = (v.a[i].y>>32) + (v.a[i].y>>32); + if (raw_score + 20 * opt->a < (subo.x>>8&0xffffff)) continue; // skip the following if the score is too small ns = (dist - pes[dir].avg) / pes[dir].std; - printf("%f\n", ns); + score = (int)(23. 
* raw_score / (opt->a + opt->b) - 4.343 * log(.5 * erfc(fabs(ns) * M_SQRT1_2)) + .499); + pair = (uint64_t)k<<32 | i; + x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair)&0xff); + if (x > o.x) subo = o, o.x = x, o.y = pair; + else if (x > subo.x) subo.x = x, subo.y = pair; } } - y[v.a[i]>>30&3] = i; + y[v.a[i].y&3] = i; } free(v.a); } diff --git a/bwape.c b/bwape.c index 4201cf2..77ae1fa 100644 --- a/bwape.c +++ b/bwape.c @@ -60,19 +60,6 @@ pe_opt_t *bwa_init_pe_opt() po->ap_prior = 1e-5; return po; } - -static inline uint64_t hash_64(uint64_t key) -{ - key += ~(key << 32); - key ^= (key >> 22); - key += ~(key << 13); - key ^= (key >> 8); - key += (key << 3); - key ^= (key >> 15); - key += ~(key << 27); - key ^= (key >> 31); - return key; -} /* static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); { diff --git a/utils.h b/utils.h index 6c065c1..70f4e11 100644 --- a/utils.h +++ b/utils.h @@ -86,4 +86,17 @@ extern "C" { } #endif +static inline uint64_t hash_64(uint64_t key) +{ + key += ~(key << 32); + key ^= (key >> 22); + key += ~(key << 13); + key ^= (key >> 8); + key += (key << 3); + key ^= (key >> 15); + key += ~(key << 27); + key ^= (key >> 31); + return key; +} + #endif From 22b79b3475700160b557c7db0bc770371e85c21f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 15:34:44 -0500 Subject: [PATCH 080/169] mark primary, instead of dropping secondary --- bwamem.c | 24 +++++++++++++++--------- bwamem.h | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/bwamem.c b/bwamem.c index b44e54e..68078d0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -345,14 +345,18 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a) return m; } -int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function +void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this 
function { // similar to the loop in mem_chain_flt() - int i, j, m, tmp; - if (n <= 1) return n; - for (i = 0; i < n; ++i) a[i].sub = 0; + int i, k, tmp; + kvec_t(int) z; + if (n == 0) return; + kv_init(z); + for (i = 0; i < n; ++i) a[i].sub = a[i].is_primary = 0; tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; - for (i = 1, m = 1; i < n; ++i) { - for (j = 0; j < m; ++j) { + kv_push(int, z, 0); + for (i = 1; i < n; ++i) { + for (k = 0; k < z.n; ++k) { + int j = z.a[k]; int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe; if (e_min > b_max) { // have overlap @@ -364,9 +368,10 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT } } } - if (j == m) a[m++] = a[i]; + if (k == z.n) kv_push(int, z, i); } - return m; + for (k = 0; k < z.n; ++k) a[z.a[k]].is_primary = 1; + free(z.a); } /************************ @@ -601,10 +606,11 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b int k; kstring_t str; str.l = str.m = 0; str.s = 0; - a->n = mem_choose_alnreg_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() if (a->n > 0) { + mem_mark_primary_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() for (k = 0; k < a->n; ++k) { bwahit_t h; + if (!a->a[k].is_primary) continue; mem_alnreg2hit(&a->a[k], &h); h.qual = approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); diff --git a/bwamem.h b/bwamem.h index 4e2e5ce..d511254 100644 --- a/bwamem.h +++ b/bwamem.h @@ -34,7 +34,7 @@ typedef struct { typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain - int sub_n; + int sub_n, is_primary; } mem_alnreg_t; typedef struct { From cd0969332f6804db96357871321263f0196e9e6a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 15:52:23 -0500 Subject: [PATCH 081/169] keep 
track of the "parent" of a secondary --- bwamem.c | 9 +++++---- bwamem.h | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 68078d0..c2b0eed 100644 --- a/bwamem.c +++ b/bwamem.c @@ -351,7 +351,7 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT kvec_t(int) z; if (n == 0) return; kv_init(z); - for (i = 0; i < n; ++i) a[i].sub = a[i].is_primary = 0; + for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1; tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; kv_push(int, z, 0); for (i = 1; i < n; ++i) { @@ -369,8 +369,8 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT } } if (k == z.n) kv_push(int, z, i); + else a[i].secondary = z.a[k]; } - for (k = 0; k < z.n; ++k) a[z.a[k]].is_primary = 1; free(z.a); } @@ -597,7 +597,8 @@ void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe; h->score = a->score; h->sub = a->sub > a->csub? a->sub : a->csub; - h->qual = h->flag = 0; // these are unset + h->qual = 0; // quality unset + h->flag = a->secondary? 
0x100 : 0; // only the "secondary" bit is set h->mb = h->me = -2; // mate positions are unset } @@ -610,7 +611,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b mem_mark_primary_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() for (k = 0; k < a->n; ++k) { bwahit_t h; - if (!a->a[k].is_primary) continue; + if (a->a[k].secondary >= 0) continue; mem_alnreg2hit(&a->a[k], &h); h.qual = approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); diff --git a/bwamem.h b/bwamem.h index d511254..3ac15d0 100644 --- a/bwamem.h +++ b/bwamem.h @@ -34,7 +34,8 @@ typedef struct { typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain - int sub_n, is_primary; + int sub_n; // approximate number of suboptimal hits + int secondary; // non-negative if the hit is secondary } mem_alnreg_t; typedef struct { From 325ba8213b1865ff28d794c6beb17741c7bc75ed Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 15:54:55 -0500 Subject: [PATCH 082/169] move mark primary to worker1() --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index c2b0eed..97ec661 100644 --- a/bwamem.c +++ b/bwamem.c @@ -608,7 +608,6 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kstring_t str; str.l = str.m = 0; str.s = 0; if (a->n > 0) { - mem_mark_primary_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() for (k = 0; k < a->n; ++k) { bwahit_t h; if (a->a[k].secondary >= 0) continue; @@ -658,6 +657,7 @@ static void *worker1(void *data) for (i = w->start; i < w->n; i += w->step) { w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); w->regs[i].n = mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); } return 0; } From 
604e3d8da10d96ccb5495634d9ea03999d58e7fa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 16:15:26 -0500 Subject: [PATCH 083/169] code backup; to upgrade ksw.{c,h} --- bwamem.c | 4 +--- bwamem.h | 2 ++ bwamem_pair.c | 29 ++++++++++++++++++----------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/bwamem.c b/bwamem.c index 97ec661..a639109 100644 --- a/bwamem.c +++ b/bwamem.c @@ -13,8 +13,6 @@ #include "kvec.h" #include "ksort.h" -#define MAPQ_COEF 40. - int mem_verbose = 3; // 1: error only; 2: error+warning; 3: message+error+warning; >=4: debugging void mem_fill_scmat(int a, int b, int8_t mat[25]) @@ -583,7 +581,7 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) double identity; sub = a->csub > sub? a->csub : sub; l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; - mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; + mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; mapq = identity < 0.95? 
(int)(mapq * identity * identity + .499) : mapq; if (a->sub_n) mapq -= (int)(4.343 * log(a->sub_n) + .499); diff --git a/bwamem.h b/bwamem.h index 3ac15d0..ebfb8cd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -5,6 +5,8 @@ #include "bntseq.h" #include "utils.h" +#define MEM_MAPQ_COEF 40.0 + struct __smem_i; typedef struct __smem_i smem_i; diff --git a/bwamem_pair.c b/bwamem_pair.c index 6b44f78..9129663 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -97,11 +97,12 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) +int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { + extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v; pair64_t o, subo; // score<<32 | raw_score<<8 | hash - int r, i, y[4]; // y[] keeps the last hit + int r, i, k, y[4]; // y[] keeps the last hit kv_init(v); for (r = 0; r < 2; ++r) { // loop through read number for (i = 0; i < a[r].n; ++i) { @@ -117,7 +118,7 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con o.x = o.y = subo.x = subo.y = 0; for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction - int dir = r<<1 | (v.a[i].y>>1&1), which, k; + int dir = r<<1 | (v.a[i].y>>1&1), which; if (pes[dir].failed) continue; // invalid orientation which = r<<1 | ((v.a[i].y&1)^1); if (y[which] < 0) continue; // no previous hits @@ -133,7 +134,7 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con raw_score = (v.a[i].y>>32) + (v.a[i].y>>32); if (raw_score + 20 * opt->a < (subo.x>>8&0xffffff)) continue; // skip the following if the score is too small ns = (dist - pes[dir].avg) / pes[dir].std; - score = (int)(23. 
* raw_score / (opt->a + opt->b) - 4.343 * log(.5 * erfc(fabs(ns) * M_SQRT1_2)) + .499); + score = (int)(raw_score - 4.343 / 23. * (opt->a + opt->b) * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair)&0xff); if (x > o.x) subo = o, o.x = x, o.y = pair; @@ -142,7 +143,13 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con } y[v.a[i].y&3] = i; } + if (o.x > 0) { + i = o.y >> 32; k = o.y << 32 >> 32; + mem_alnreg2hit(&a[v.a[i].y&1].a[v.a[i].y<<32>>34], &h[v.a[i].y&1]); + mem_alnreg2hit(&a[v.a[k].y&1].a[v.a[k].y<<32>>34], &h[v.a[k].y&1]); + } free(v.a); + return o.x == 0? -1 : 0; } void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) @@ -150,11 +157,11 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c kstring_t str; bwahit_t h[2]; str.l = str.m = 0; str.s = 0; - mem_pair(opt, bns, pac, pes, s, a, h); - /* - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); - s[0].sam = strdup(str.s); str.l = 0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); - s[1].sam = str.s; - */ + if (mem_pair(opt, bns, pac, pes, s, a, h) == 0) { // successful + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); + s[0].sam = strdup(str.s); str.l = 0; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); + s[1].sam = str.s; + } else { + } } From 28a7d501f2f911866ad925f0636d1e7f07b6667f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 16:35:05 -0500 Subject: [PATCH 084/169] updated to the latest ksw; NOT TESTED YET!!! 
--- bwtsw2_pair.c | 38 +++------ ksw.c | 209 ++++++++++++++++++++++++++++++-------------------- ksw.h | 81 +++++++++++-------- 3 files changed, 184 insertions(+), 144 deletions(-) diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 8a8287b..85ba1eb 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -127,35 +127,18 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b seq[i] = nst_nt4_table[(int)mseq[i]]; } #ifndef _NO_SSE2 - { - ksw_query_t *q; - ksw_aux_t aux[2]; - // forward Smith-Waterman - aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0]; - q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); - ksw_sse2(q, end - beg, ref, &aux[0]); - free(q); - if (aux[0].score < opt->t) { - free(seq); - return; - } - ++aux[0].qe; ++aux[0].te; - // reverse Smith-Waterman - seq_reverse(aux[0].qe, seq, 0); - seq_reverse(aux[0].te, ref, 0); - q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat); - ksw_sse2(q, aux[0].te, ref, &aux[1]); - free(q); - ++aux[1].qe; ++aux[1].te; - // write output - a->G = aux[0].score; - a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2; + { // FIXME!!! The following block has not been tested since the update of the ksw library + int flag = KSW_XSUBO | KSW_XSTOP | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
KSW_XBYTE : 0); + kswr_t aln; + aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0); + a->G = aln.score; + a->G2 = aln.score2; if (a->G2 < opt->t) a->G2 = 0; if (a->G2) a->flag |= BSW2_FLAG_TANDEM; - a->k = beg + (aux[0].te - aux[1].te); - a->len = aux[1].te; - a->beg = aux[0].qe - aux[1].qe; - a->end = aux[0].qe; + a->k = beg + aln.tb; + a->len = aln.te - aln.tb; + a->beg = aln.qb; + a->end = aln.qe; } #else { @@ -168,6 +151,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2); if (a->G < opt->t) a->G = 0; if (a->G2 < opt->t) a->G2 = 0; + if (a->G2) a->flag |= BSW2_FLAG_TANDEM; a->k = beg + path[0].i - 1; a->len = path[1].i - path[0].i + 1; a->beg = path[0].j - 1; diff --git a/ksw.c b/ksw.c index 08cdf56..4599c6b 100644 --- a/ksw.c +++ b/ksw.c @@ -25,14 +25,8 @@ #include #include -#include "ksw.h" - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#ifndef _NO_SSE2 #include +#include "ksw.h" #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x),1) @@ -42,26 +36,35 @@ #define UNLIKELY(x) (x) #endif -/*************** - *** SSE2 SW *** - ***************/ +const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; -struct _ksw_query_t { +struct _kswq_t { int qlen, slen; uint8_t shift, mdiff, max, size; __m128i *qp, *H0, *H1, *E, *Hmax; }; -ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) { - 
ksw_query_t *q; + kswq_t *q; int slen, a, tmp, p; size = size > 1? 2 : 1; p = 8 * (3 - size); // # values per __m128i slen = (qlen + p - 1) / p; // segmented length - q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory - q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory + q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory q->H0 = q->qp + slen * m; q->H1 = q->H0 + slen; q->E = q->H1 + slen; @@ -100,11 +103,12 @@ ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const in return q; } -int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) { - int slen, i, m_b, n_b, te = -1, gmax = 0; + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; + kswr_t r; #define __max_16(ret, xx) do { \ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ @@ -115,10 +119,13 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) / } while (0) // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi8(a->gapo + a->gape); - gape = _mm_set1_epi8(a->gape); + gapoe = _mm_set1_epi8(_gapo + _gape); + gape = _mm_set1_epi8(_gape); shift = _mm_set1_epi8(q->shift); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; @@ -174,11 +181,11 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) / end_loop16: //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); __max_16(imax, max); // imax is the maximum number in max - if (imax >= a->T) { // write the b array; this condition adds branching unfornately + if (imax >= minsc) { // write the b array; this condition adds branching unfornately if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append if (n_b == m_b) { m_b = m_b? m_b<<1 : 8; - b = realloc(b, 8 * m_b); + b = (uint64_t*)realloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -187,34 +194,38 @@ end_loop16: gmax = imax; te = i; // te is the end position on the target for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); - if (gmax + q->shift >= 255) break; + if (gmax + q->shift >= 255 || gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; // swap H0 and H1 } - a->score = gmax; a->te = te; - { // get a->qe, the end of query match; find the 2nd best score + r.score = gmax + q->shift < 255? 
gmax : 255; + r.te = te; + if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score int max = -1, low, high, qlen = slen * 16; uint8_t *t = (uint8_t*)Hmax; - for (i = 0, a->qe = -1; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen; + for (i = 0; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; //printf("%d,%d\n", max, gmax); - i = (a->score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0, a->score2 = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) - a->score2 = b[i]>>32, a->te2 = e; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } } } free(b); - return a->score + q->shift >= 255? 255 : a->score; + return r; } -int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) { - int slen, i, m_b, n_b, te = -1, gmax = 0; + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; + kswr_t r; #define __max_8(ret, xx) do { \ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ @@ -224,10 +235,13 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // } while (0) // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi16(a->gapo + a->gape); - gape = _mm_set1_epi16(a->gape); + gapoe = _mm_set1_epi16(_gapo + _gape); + gape = _mm_set1_epi16(_gape); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; for (i = 0; i < slen; ++i) { @@ -269,11 +283,11 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // } end_loop8: __max_8(imax, max); - if (imax >= a->T) { + if (imax >= minsc) { if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { if (n_b == m_b) { m_b = m_b? m_b<<1 : 8; - b = realloc(b, 8 * m_b); + b = (uint64_t*)realloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -282,34 +296,60 @@ end_loop8: gmax = imax; te = i; for (j = 0; LIKELY(j < slen); ++j) _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; } - a->score = gmax; a->te = te; + r.score = gmax; r.te = te; { int max = -1, low, high, qlen = slen * 8; uint16_t *t = (uint16_t*)Hmax; - for (i = 0, a->qe = -1; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen; - i = (a->score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0, a->score2 = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) - a->score2 = b[i]>>32, a->te2 = e; + for (i = 0, r.qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } } } free(b); - return a->score; + return r; } -int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) +static void revseq(int l, uint8_t *s) { - if (q->size == 1) return 
ksw_sse2_16(q, tlen, target, a); - else return ksw_sse2_8(q, tlen, target, a); + int i, t; + for (i = 0; i < l>>1; ++i) + t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; } -#endif // _NO_SSE2 +kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) +{ + int size; + kswq_t *q; + kswr_t r, rr; + kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int); + + q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); + if (qry && *qry == 0) *qry = q; + func = q->size == 2? ksw_i16 : ksw_u8; + size = q->size; + r = func(q, tlen, target, gapo, gape, xtra); + if (qry == 0) free(q); + if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; + revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end + q = ksw_qinit(size, r.qe + 1, query, m, mat); + rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); + revseq(r.qe + 1, query); revseq(r.te + 1, target); + free(q); + if (r.score == rr.score) + r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; + return r; +} /******************** *** SW extension *** @@ -494,7 +534,7 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, * Main function (not compiled by default) * *******************************************/ -#if defined(_KSW_MAIN) && !defined(_NO_SSE2) +#ifdef _KSW_MAIN #include #include @@ -523,30 +563,33 @@ unsigned char seq_nt4_table[256] = { int main(int argc, char *argv[]) { - int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2; + int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0; int8_t mat[25]; - ksw_aux_t a; + int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; + uint8_t *rseq = 0; gzFile fpt, fpq; kseq_t *kst, *ksq; + // parse command line - a.gapo = 5; a.gape = 2; a.T = 10; - while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) { + while ((c = getopt(argc, 
argv, "a:b:q:r:ft:1")) >= 0) { switch (c) { case 'a': sa = atoi(optarg); break; case 'b': sb = atoi(optarg); break; - case 'q': a.gapo = atoi(optarg); break; - case 'r': a.gape = atoi(optarg); break; - case 't': a.T = atoi(optarg); break; + case 'q': gapo = atoi(optarg); break; + case 'r': gape = atoi(optarg); break; + case 't': minsc = atoi(optarg); break; case 'f': forward_only = 1; break; - case 's': size = atoi(optarg); break; + case '1': xtra |= KSW_XBYTE; break; } } if (optind + 2 > argc) { - fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] \n", size, sa, sb, a.gapo, a.gape); + fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] \n", sa, sb, gapo, gape, minsc); return 1; } + if (minsc > 0xffff) minsc = 0xffff; + if (minsc > 0) xtra |= KSW_XSUBO | minsc; // initialize scoring matrix - for (i = k = 0; i < 5; ++i) { + for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? sa : -sb; mat[k++] = 0; // ambiguous base @@ -557,34 +600,34 @@ int main(int argc, char *argv[]) fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); // all-pair alignment while (kseq_read(ksq) > 0) { - ksw_query_t *q[2]; - for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; - q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); + kswq_t *q[2] = {0, 0}; + kswr_t r; + for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; if (!forward_only) { // reverse - for (i = 0; i < ksq->seq.l/2; ++i) { - int t = ksq->seq.s[i]; - ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i]; - ksq->seq.s[ksq->seq.l-1-i] = t; + if ((int)ksq->seq.m > max_rseq) { + max_rseq = ksq->seq.m; + rseq = (uint8_t*)realloc(rseq, max_rseq); } - for (i = 0; i < ksq->seq.l; ++i) - ksq->seq.s[i] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; - q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); - } else q[1] = 0; + for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) + rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i]; + } gzrewind(fpt); kseq_rewind(kst); while (kseq_read(kst) > 0) { - int s; - for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; - s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a); - printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); - if (q[1]) { - s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a); - printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); + for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; + r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]); + if (r.score >= minsc) + printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2); + if (rseq) { + r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]); + if (r.score >= minsc) + printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2); } } free(q[0]); free(q[1]); } + free(rseq); kseq_destroy(kst); gzclose(fpt); kseq_destroy(ksq); gzclose(fpq); return 0; } -#endif // _KSW_MAIN +#endif diff --git a/ksw.h b/ksw.h index c7eaabb..5162dc0 100644 --- a/ksw.h +++ b/ksw.h @@ -3,51 +3,64 @@ #include -struct _ksw_query_t; -typedef struct _ksw_query_t ksw_query_t; +#define KSW_XBYTE 0x10000 +#define KSW_XSTOP 0x20000 +#define KSW_XSUBO 0x40000 +#define KSW_XSTART 0x80000 + +struct _kswq_t; +typedef struct _kswq_t kswq_t; typedef struct { - // input - unsigned gapo, gape; // the first gap costs gapo+gape - unsigned T; // threshold - // output - int score, te, qe, score2, te2; 
-} ksw_aux_t; + int score; // best score + int te, qe; // target end and query end + int score2, te2; // second best score and ending position on the target + int tb, qb; // target start and query start +} kswr_t; #ifdef __cplusplus extern "C" { #endif /** - * Initialize the query data structure + * Aligning two sequences * - * @param size Number of bytes used to store a score; valid valures are 1 or 2 - * @param qlen Length of the query sequence - * @param query Query sequence - * @param m Size of the alphabet - * @param mat Scoring matrix in a one-dimension array + * @param qlen length of the query sequence (typically Date: Tue, 12 Feb 2013 17:48:46 -0500 Subject: [PATCH 085/169] bugfix: bug in the new ksw.c On my test data, one alignment is different, caused by polyA --- bwtsw2_pair.c | 12 +++++++++--- ksw.c | 6 +++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 85ba1eb..cf29087 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -128,17 +128,23 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b } #ifndef _NO_SSE2 { // FIXME!!! The following block has not been tested since the update of the ksw library - int flag = KSW_XSUBO | KSW_XSTOP | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0); + int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
KSW_XBYTE : 0) | opt->t; kswr_t aln; aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0); a->G = aln.score; a->G2 = aln.score2; + if (a->G < opt->t) a->G = 0; if (a->G2 < opt->t) a->G2 = 0; if (a->G2) a->flag |= BSW2_FLAG_TANDEM; a->k = beg + aln.tb; - a->len = aln.te - aln.tb; + a->len = aln.te - aln.tb + 1; a->beg = aln.qb; - a->end = aln.qe; + a->end = aln.qe + 1; + /* + printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n'); + printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n'); + printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len); + */ } #else { diff --git a/ksw.c b/ksw.c index 4599c6b..8d741a6 100644 --- a/ksw.c +++ b/ksw.c @@ -211,7 +211,7 @@ end_loop16: low = te - i; high = te + i; for (i = 0; i < n_b; ++i) { int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) r.score2 = b[i]>>32, r.te2 = e; } } @@ -311,7 +311,7 @@ end_loop8: low = te - i; high = te + i; for (i = 0; i < n_b; ++i) { int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) r.score2 = b[i]>>32, r.te2 = e; } } @@ -587,7 +587,7 @@ int main(int argc, char *argv[]) return 1; } if (minsc > 0xffff) minsc = 0xffff; - if (minsc > 0) xtra |= KSW_XSUBO | minsc; + xtra |= KSW_XSUBO | minsc; // initialize scoring matrix for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) From 87d619a21f67561ecd4efdde2dc6b648f4e6f0a6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 13 Feb 2013 23:16:16 -0500 Subject: [PATCH 086/169] minor code simplification --- bwamem_pair.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 9129663..9cb41c2 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -97,7 +97,12 @@ void mem_pestat(const 
mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) +void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +{ + int is_rev = a->rb >= l_pac? 1 : 0; +} + +int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v; @@ -108,8 +113,8 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons for (i = 0; i < a[r].n; ++i) { pair64_t key; mem_alnreg_t *e = &a[r].a[i]; - key.x = e->rb < bns->l_pac? e->rb : (bns->l_pac<<1) - 1 - e->rb; // forward position - key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= bns->l_pac)<<1 | r; + key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position + key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; kv_push(pair64_t, v, key); } } @@ -157,7 +162,7 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c kstring_t str; bwahit_t h[2]; str.l = str.m = 0; str.s = 0; - if (mem_pair(opt, bns, pac, pes, s, a, h) == 0) { // successful + if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); s[0].sam = strdup(str.s); str.l = 0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); From 688b524cdfcaa9c04783570ae29f441807cd4d07 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 13 Feb 2013 23:55:56 -0500 Subject: [PATCH 087/169] code backup; tired.. 
--- bwamem_pair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 9cb41c2..08af1d3 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -97,9 +97,16 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], int rn, const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { - int is_rev = a->rb >= l_pac? 1 : 0; + int r; + rn = !!rn; // either 0 or 1 + for (r = 0; r < 4; ++r) { + int is_rev, is_larger; + if (pes[r].failed) continue; + is_rev = r>>1 == (r&1)? 0 : 1; // whether to reverse complement the mate + is_larger = r>>(!rn)&1; // whether the mate has larger coordinate + } } int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) From df1ff2b36e86c7fe1f3a170f7ffa37b243aeeac7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 14 Feb 2013 12:59:32 -0500 Subject: [PATCH 088/169] better and proper way to infer orinentation --- Makefile | 1 + bwamem.h | 9 +++++++++ bwamem_pair.c | 22 ++++++++++++++-------- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 2c060e9..e11a04d 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,7 @@ bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_main.o:bwtsw2.h bwamem.o:bwamem.h +bwamem_pair.o:bwamem.h fastmap.o:bwt.h bwamem.h clean: diff --git a/bwamem.h b/bwamem.h index ebfb8cd..1f9605d 100644 --- a/bwamem.h +++ b/bwamem.h @@ -83,4 +83,13 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } #endif +static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) +{ + int64_t p2; + 
int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); + p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand + *dist = p2 > b1? p2 - b1 : b1 - p2; + return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3); +} + #endif diff --git a/bwamem_pair.c b/bwamem_pair.c index 08af1d3..cc0e8f0 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -1,5 +1,6 @@ #include #include +#include #include #include "kstring.h" #include "bwamem.h" @@ -38,19 +39,15 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * memset(isize, 0, sizeof(kvec_t(int)) * 4); for (i = 0; i < n>>1; ++i) { int dir; - int64_t is, pos[2]; + int64_t is; mem_alnreg_v *r[2]; r[0] = (mem_alnreg_v*)®s[i<<1|0]; r[1] = (mem_alnreg_v*)®s[i<<1|1]; if (r[0]->n == 0 || r[1]->n == 0) continue; if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; - pos[0] = r[0]->a[0].rb < l_pac? r[0]->a[0].rb : (l_pac<<1) - 1 - r[0]->a[0].rb; // forward coordinate - pos[1] = r[1]->a[0].rb < l_pac? r[1]->a[0].rb : (l_pac<<1) - 1 - r[1]->a[0].rb; - if (pos[0] < pos[1]) dir = (r[0]->a[0].rb >= l_pac)<<1 | (r[1]->a[0].rb >= l_pac); - else dir = (r[1]->a[0].rb >= l_pac)<<1 | (r[0]->a[0].rb >= l_pac); - is = abs(pos[0] - pos[1]); - if (is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); + dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); + if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. 
@@ -99,8 +96,17 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], int rn, const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { - int r; + int i, r, skip[4]; rn = !!rn; // either 0 or 1 + for (r = 0; r < 4; ++r) + skip[r] = pes[r].failed? 1 : 0; + for (i = 0; i < ma->n; ++i) { // check which orinentation has been found + int64_t dist; + r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); + if (dist >= pes[r].low && dist <= pes[r].high) + skip[r] = 1; + } + if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return; // consistent pair exist; no need to perform SW for (r = 0; r < 4; ++r) { int is_rev, is_larger; if (pes[r].failed) continue; From 5f8c6efbc3f4677371c5fd13ab837a98aa6aade0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 09:48:44 -0500 Subject: [PATCH 089/169] forbid x-bounary bns_get_seq(); code backup --- bntseq.c | 38 +++++++++++++++++--------------------- bwamem.c | 2 ++ bwamem_pair.c | 37 ++++++++++++++++++++++++++++++++----- 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/bntseq.c b/bntseq.c index 06d82a0..0286c19 100644 --- a/bntseq.c +++ b/bntseq.c @@ -322,29 +322,25 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) return nn; } -static inline void get_seq_core(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, uint8_t *seq) -{ - int64_t k, l = 0; - if (beg >= l_pac) { // reverse strand - int64_t beg_f = (l_pac<<1) - 1 - end; - int64_t end_f = (l_pac<<1) - 1 - beg; - for (k = end_f; k > beg_f; --k) - seq[l++] = 3 - _get_pac(pac, k); - } else { // forward strand - for (k = beg; k < end; ++k) - seq[l++] = _get_pac(pac, k); - } -} - uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) { - uint8_t *seq; + uint8_t *seq = 0; + if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, 
swap if (end > l_pac<<1) end = l_pac<<1; - *len = end - beg; - seq = malloc(end - beg); - if (beg < l_pac && end > l_pac) { - get_seq_core(l_pac, pac, beg, l_pac, seq); - get_seq_core(l_pac, pac, l_pac, end, seq + (l_pac - beg)); - } else get_seq_core(l_pac, pac, beg, end, seq); + if (beg < 0) beg = 0; + if (beg >= l_pac || end <= l_pac) { + int64_t k, l = 0; + *len = end - beg; + seq = malloc(end - beg); + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + seq[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + seq[l++] = _get_pac(pac, k); + } + } else *len = 0; // if bridging the forward-reverse boundary, return nothing return seq; } diff --git a/bwamem.c b/bwamem.c index a639109..320df8d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -395,6 +395,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int mem_alnreg_t best; memset(&best, 0, sizeof(mem_alnreg_t)); + memset(a, 0, sizeof(mem_alnreg_t)); // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -408,6 +409,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); + if (rlen != rmax[1] - rmax[0]) return; for (k = 0; k < c->n;) { s = &c->seeds[k]; diff --git a/bwamem_pair.c b/bwamem_pair.c index cc0e8f0..dd9b3cd 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -97,7 +97,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], int rn, const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { int i, r, skip[4]; - rn = !!rn; // either 0 or 1 + rn = !!rn; // either 0 or 1; $rn is the read number of $a for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 
1 : 0; for (i = 0; i < ma->n; ++i) { // check which orinentation has been found @@ -109,9 +109,28 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return; // consistent pair exist; no need to perform SW for (r = 0; r < 4; ++r) { int is_rev, is_larger; - if (pes[r].failed) continue; - is_rev = r>>1 == (r&1)? 0 : 1; // whether to reverse complement the mate - is_larger = r>>(!rn)&1; // whether the mate has larger coordinate + uint8_t *seq, *rev = 0, *ref; + int64_t rb, re, len; + if (skip[r]) continue; + is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate + is_larger = r>>rn&1; // whether the mate has larger coordinate + if (is_rev) { + rev = malloc(l_ms); // this is the reverse complement of $ms + for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? ms[i] : 4; + seq = rev; + } else seq = (uint8_t*)ms; + if (!is_rev) { + rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high; + re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length + } else { + rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands + re = is_larger? 
a->rb + pes[r].high: a->rb - pes[r].low; + } + ref = bns_get_seq(l_pac, pac, rb, re, &len); + if (len == re - rb) { + } + if (rev == 0) free(rev); + free(ref); } } @@ -174,8 +193,16 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c { kstring_t str; bwahit_t h[2]; + mem_alnreg_t a0[2]; str.l = str.m = 0; str.s = 0; - if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful + // perform SW for the best alignment + a0[0].score = a0[1].score = -1; + if (a[0].n) a0[0] = a[0].a[0]; + if (a[1].n) a0[1] = a[1].a[0]; + if (a0[0].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, 0, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); + if (a0[1].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, 1, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); + // pairing single-end hits + if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); s[0].sam = strdup(str.s); str.l = 0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); From fe2236f6feca5b5230e6de75934371bc244434e5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 10:09:30 -0500 Subject: [PATCH 090/169] code backup --- bwamem_pair.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index dd9b3cd..3979341 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -6,6 +6,7 @@ #include "bwamem.h" #include "kvec.h" #include "utils.h" +#include "ksw.h" #define MIN_RATIO 0.8 #define MIN_DIR_CNT 10 @@ -126,8 +127,28 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands re = is_larger? 
a->rb + pes[r].high: a->rb - pes[r].low; } + if (rb < 0) rb = 0; + if (re > l_pac) re = l_pac; ref = bns_get_seq(l_pac, pac, rb, re, &len); - if (len == re - rb) { + if (len == re - rb) { // no funny things happening + kswr_t aln; + mem_alnreg_t b; + int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len; + aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); + memset(&b, 0, sizeof(mem_alnreg_t)); + b.qb = aln.qb; b.qe = aln.qe + 1; + b.rb = rb + aln.tb; + b.re = rb + aln.te + 1; + b.score = aln.score; + b.csub = aln.score2; + b.secondary = -1; + kv_push(mem_alnreg_t, *ma, b); // make room for a new element + // move b s.t. ma is sorted + for (i = 0; i < ma->n - 1; ++i) // find the insertion point + if (ma->a[i].score < b.score) break; + tmp = i; + for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; + ma->a[i] = b; } if (rev == 0) free(rev); free(ref); From 8ee464478aa5dd5f89c186b9824f28151a6a574d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 10:48:50 -0500 Subject: [PATCH 091/169] matesw working; for testing only --- bwamem_pair.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 3979341..05f0547 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -95,18 +95,19 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], int rn, const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { int i, r, skip[4]; - rn = !!rn; // either 0 or 1; $rn is the read number of $a for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 
1 : 0; +#if 0 for (i = 0; i < ma->n; ++i) { // check which orinentation has been found int64_t dist; r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); if (dist >= pes[r].low && dist <= pes[r].high) skip[r] = 1; } +#endif if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return; // consistent pair exist; no need to perform SW for (r = 0; r < 4; ++r) { int is_rev, is_larger; @@ -114,10 +115,10 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m int64_t rb, re, len; if (skip[r]) continue; is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate - is_larger = r>>rn&1; // whether the mate has larger coordinate + is_larger = !(r>>1); // whether the mate has larger coordinate if (is_rev) { rev = malloc(l_ms); // this is the reverse complement of $ms - for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? ms[i] : 4; + for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4; seq = rev; } else seq = (uint8_t*)ms; if (!is_rev) { @@ -128,7 +129,7 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m re = is_larger? 
a->rb + pes[r].high: a->rb - pes[r].low; } if (rb < 0) rb = 0; - if (re > l_pac) re = l_pac; + if (re > l_pac<<1) re = l_pac<<1; ref = bns_get_seq(l_pac, pac, rb, re, &len); if (len == re - rb) { // no funny things happening kswr_t aln; @@ -137,11 +138,17 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); b.qb = aln.qb; b.qe = aln.qe + 1; - b.rb = rb + aln.tb; - b.re = rb + aln.te + 1; + if (is_rev) { + b.rb = (l_pac<<1) - (rb + aln.te + 1); + b.re = (l_pac<<1) - (rb + aln.tb); + } else { + b.rb = rb + aln.tb; + b.re = rb + aln.te + 1; + } b.score = aln.score; b.csub = aln.score2; b.secondary = -1; + printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); kv_push(mem_alnreg_t, *ma, b); // make room for a new element // move b s.t. 
ma is sorted for (i = 0; i < ma->n - 1; ++i) // find the insertion point @@ -220,8 +227,8 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c a0[0].score = a0[1].score = -1; if (a[0].n) a0[0] = a[0].a[0]; if (a[1].n) a0[1] = a[1].a[0]; - if (a0[0].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, 0, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); - if (a0[1].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, 1, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); + if (a0[0].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); + if (a0[1].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); // pairing single-end hits if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); From ea9fc7df48e9219613d3549884e9d597079cc6d4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 11:03:27 -0500 Subject: [PATCH 092/169] keep the number of SW performed --- bwamem.c | 6 ++++-- bwamem_pair.c | 29 +++++++++++++---------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/bwamem.c b/bwamem.c index 320df8d..ae4992f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -664,7 +664,7 @@ static void *worker1(void *data) static void *worker2(void *data) { - extern void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]); + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; int i; if (!w->opt->is_pe) { @@ -673,10 +673,12 @@ static void *worker2(void *data) free(w->regs[i].a); } } else { + int n = 0; for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, &w->seqs[i<<1], &w->regs[i<<1]); + n += 
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } + fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n); } return 0; } diff --git a/bwamem_pair.c b/bwamem_pair.c index 05f0547..df27ef1 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -95,20 +95,18 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { - int i, r, skip[4]; + int i, r, skip[4], n = 0; for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 1 : 0; -#if 0 for (i = 0; i < ma->n; ++i) { // check which orinentation has been found int64_t dist; r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); if (dist >= pes[r].low && dist <= pes[r].high) skip[r] = 1; } -#endif - if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return; // consistent pair exist; no need to perform SW + if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW for (r = 0; r < 4; ++r) { int is_rev, is_larger; uint8_t *seq, *rev = 0, *ref; @@ -138,17 +136,12 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); b.qb = aln.qb; b.qe = aln.qe + 1; - if (is_rev) { - b.rb = (l_pac<<1) - (rb + aln.te + 1); - b.re = (l_pac<<1) - (rb + aln.tb); - } else { - b.rb = rb + aln.tb; - b.re = rb + aln.te + 1; - } + b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; + b.re = is_rev? 
(l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; b.score = aln.score; b.csub = aln.score2; b.secondary = -1; - printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); +// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); kv_push(mem_alnreg_t, *ma, b); // make room for a new element // move b s.t. ma is sorted for (i = 0; i < ma->n - 1; ++i) // find the insertion point @@ -156,10 +149,12 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m tmp = i; for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; ma->a[i] = b; + ++n; } if (rev == 0) free(rev); free(ref); } + return n; } int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) @@ -217,8 +212,9 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ return o.x == 0? 
-1 : 0; } -void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) { + int n = 0; kstring_t str; bwahit_t h[2]; mem_alnreg_t a0[2]; @@ -227,8 +223,8 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c a0[0].score = a0[1].score = -1; if (a[0].n) a0[0] = a[0].a[0]; if (a[1].n) a0[1] = a[1].a[0]; - if (a0[0].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); - if (a0[1].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); + if (a0[0].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); + if (a0[1].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); // pairing single-end hits if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); @@ -237,4 +233,5 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c s[1].sam = str.s; } else { } + return n; } From f0a6285abad5010e1916f2109db73ac44371954a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 11:52:04 -0500 Subject: [PATCH 093/169] perform mate-SW for some suboptimal alignments --- bwamem_pair.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index df27ef1..fe6f697 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -135,20 +135,22 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? 
KSW_XBYTE : 0) | opt->min_seed_len; aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); - b.qb = aln.qb; b.qe = aln.qe + 1; - b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; - b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; - b.score = aln.score; - b.csub = aln.score2; - b.secondary = -1; -// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); - kv_push(mem_alnreg_t, *ma, b); // make room for a new element - // move b s.t. ma is sorted - for (i = 0; i < ma->n - 1; ++i) // find the insertion point - if (ma->a[i].score < b.score) break; - tmp = i; - for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; - ma->a[i] = b; + if (aln.score >= opt->min_seed_len) { + b.qb = aln.qb; b.qe = aln.qe + 1; + b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; + b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; + b.score = aln.score; + b.csub = aln.score2; + b.secondary = -1; +// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); + kv_push(mem_alnreg_t, *ma, b); // make room for a new element + // move b s.t. 
ma is sorted + for (i = 0; i < ma->n - 1; ++i) // find the insertion point + if (ma->a[i].score < b.score) break; + tmp = i; + for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; + ma->a[i] = b; + } ++n; } if (rev == 0) free(rev); @@ -214,17 +216,22 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) { - int n = 0; + int n = 0, i, j; kstring_t str; bwahit_t h[2]; - mem_alnreg_t a0[2]; + mem_alnreg_t b[2][2]; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment - a0[0].score = a0[1].score = -1; - if (a[0].n) a0[0] = a[0].a[0]; - if (a[1].n) a0[1] = a[1].a[0]; - if (a0[0].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); - if (a0[1].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); + for (i = 0; i < 2; ++i) + for (j = 0; j < 2; ++j) b[i][j].score = -1; + for (i = 0; i < 2; ++i) { + for (j = 0; j < a[i].n && j < 2; ++j) b[i][j] = a[i].a[j]; + if (b[i][0].score > 0 && b[i][1].score > 0 && b[i][1].score < b[i][0].score * 0.8) + b[i][1].score = -1; + } + for (i = 0; i < 2; ++i) + for (j = 0; j < 2; ++j) + if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); // pairing single-end hits if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); From 66585b7982ea5820511ee1701234c50a94d98067 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 18 Feb 2013 16:33:06 -0500 Subject: [PATCH 094/169] code backup --- bwamem.c | 27 ++++++++++++++------------- bwamem.h | 8 ++++++-- bwamem_pair.c | 32 ++++++++++++++++++-------------- fastmap.c | 6 ++++-- 4 files changed, 42 insertions(+), 31 deletions(-) diff --git a/bwamem.c 
b/bwamem.c index ae4992f..397422f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -31,6 +31,7 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; + o->flag = 0; o->min_seed_len = 19; o->split_width = 10; o->max_occ = 10000; @@ -41,7 +42,6 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->pe_dir = 0<<1|1; - o->is_pe = 0; mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -598,11 +598,11 @@ void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) h->score = a->score; h->sub = a->sub > a->csub? a->sub : a->csub; h->qual = 0; // quality unset - h->flag = a->secondary? 0x100 : 0; // only the "secondary" bit is set + h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set h->mb = h->me = -2; // mate positions are unset } -void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) +void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag) { int k; kstring_t str; @@ -612,10 +612,11 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b bwahit_t h; if (a->a[k].secondary >= 0) continue; mem_alnreg2hit(&a->a[k], &h); + h.flag |= extra_flag; h.qual = approx_mapq_se(opt, &a->a[k]); - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP); } - } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->is_hard); + } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP); s->sam = str.s; } @@ -657,25 +658,25 @@ static void *worker1(void *data) for (i = w->start; i < w->n; i += w->step) { w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); w->regs[i].n = mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); - 
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); } return 0; } static void *worker2(void *data) { - extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]); + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; int i; - if (!w->opt->is_pe) { + if (!(w->opt->flag&MEM_F_PE)) { for (i = 0; i < w->n; i += w->step) { - mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); + mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0); free(w->regs[i].a); } } else { int n = 0; for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet - n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, &w->seqs[i<<1], &w->regs[i<<1]); + n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n); @@ -702,21 +703,21 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { worker1(w); - if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes); worker2(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); - if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } #else worker1(w); - if 
(opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes); worker2(w); #endif for (i = 0; i < n; ++i) { diff --git a/bwamem.h b/bwamem.h index 1f9605d..5fa49e4 100644 --- a/bwamem.h +++ b/bwamem.h @@ -15,13 +15,17 @@ typedef struct { int32_t qbeg, len; } mem_seed_t; +#define MEM_F_HARDCLIP 0x1 +#define MEM_F_PE 0x2 +#define MEM_F_NOPAIRING 0x4 + typedef struct { int a, b, q, r, w; + int flag; int split_width; int min_seed_len, max_occ, max_chain_gap; int n_threads, chunk_size; - int pe_dir, is_pe; - int is_hard; // if to use hard clip + int pe_dir; float mask_level, chain_drop_ratio; int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset diff --git a/bwamem_pair.c b/bwamem_pair.c index fe6f697..9d4d590 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -159,11 +159,11 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } -int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) +uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, uint64_t *sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v; - pair64_t o, subo; // score<<32 | raw_score<<8 | hash + pair64_t o, subo; // .x: score<<32 | raw_score<<8 | hash; .y: pair int r, i, k, y[4]; // y[] keeps the last hit kv_init(v); for (r = 0; r < 2; ++r) { // loop through read number @@ -198,7 +198,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ ns = (dist - pes[dir].avg) / pes[dir].std; score = (int)(raw_score - 4.343 / 23. 
* (opt->a + opt->b) * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; - x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair)&0xff); + x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair ^ id<<8)&0xff); if (x > o.x) subo = o, o.x = x, o.y = pair; else if (x > subo.x) subo.x = x, subo.y = pair; } @@ -207,19 +207,24 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ } if (o.x > 0) { i = o.y >> 32; k = o.y << 32 >> 32; - mem_alnreg2hit(&a[v.a[i].y&1].a[v.a[i].y<<32>>34], &h[v.a[i].y&1]); - mem_alnreg2hit(&a[v.a[k].y&1].a[v.a[k].y<<32>>34], &h[v.a[k].y&1]); + z[v.a[i].y&1] = v.a[i].y<<32>>34; + z[v.a[k].y&1] = v.a[k].y<<32>>34; } free(v.a); - return o.x == 0? -1 : 0; + *sub = subo.x; + return o.x; } -int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { - int n = 0, i, j; + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); + extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag); + int n = 0, i, j, z[2]; kstring_t str; bwahit_t h[2]; mem_alnreg_t b[2][2]; + uint64_t o, subo; + str.l = str.m = 0; str.s = 0; // perform SW for the best alignment for (i = 0; i < 2; ++i) @@ -233,12 +238,11 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co for (j = 0; j < 2; ++j) if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); // pairing single-end hits - if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); - s[0].sam = strdup(str.s); str.l = 0; - 
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); - s[1].sam = str.s; - } else { + o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z); + if (0&&o) { // with proper pairing + } else { // no proper pairing + mem_mark_primary_se(opt, a[0].n, a[0].a); mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41); + mem_mark_primary_se(opt, a[1].n, a[1].a); mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81); } return n; } diff --git a/fastmap.c b/fastmap.c index f2677eb..a2d7d94 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,8 +24,10 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:c:v:s:")) >= 0) { + while ((c = getopt(argc, argv, "PHk:c:v:s:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; + else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') mem_verbose = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); @@ -59,7 +61,7 @@ int main_mem(int argc, char *argv[]) if (optind + 2 < argc) { fp2 = gzopen(argv[optind + 2], "r"); ks2 = kseq_init(fp2); - opt->is_pe = 1; + opt->flag |= MEM_F_PE; } while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { mem_process_seqs(opt, bwt, bns, pac, n, seqs); From 688872fb1bf1796102b3517d55ad48122b81e9ff Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 19 Feb 2013 00:50:39 -0500 Subject: [PATCH 095/169] code backup --- bwamem.c | 4 ++-- bwamem.h | 1 + bwamem_pair.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/bwamem.c b/bwamem.c index 397422f..3d1b9c5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -577,7 +577,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons * Integrated interface * ************************/ -static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) 
+int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) { int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; double identity; @@ -613,7 +613,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (a->a[k].secondary >= 0) continue; mem_alnreg2hit(&a->a[k], &h); h.flag |= extra_flag; - h.qual = approx_mapq_se(opt, &a->a[k]); + h.qual = mem_approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP); diff --git a/bwamem.h b/bwamem.h index 5fa49e4..d6e9f01 100644 --- a/bwamem.h +++ b/bwamem.h @@ -6,6 +6,7 @@ #include "utils.h" #define MEM_MAPQ_COEF 40.0 +#define MEM_MAPQ_MAX 60 struct __smem_i; typedef struct __smem_i smem_i; diff --git a/bwamem_pair.c b/bwamem_pair.c index 9d4d590..92b8842 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -159,6 +159,12 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } +static inline double approx_match(const mem_opt_t *opt, const mem_alnreg_v *a) +{ + int l = a->qe - a->qb < a->re - a->rb? 
a->qe - a->qb : a->re - a->rb; + return l - (double)(l * opt->a - a->score) / (opt->a + opt->b); +} + uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, uint64_t *sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); @@ -219,9 +225,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co { extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag); + extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); + int n = 0, i, j, z[2]; kstring_t str; - bwahit_t h[2]; mem_alnreg_t b[2][2]; uint64_t o, subo; @@ -237,12 +244,40 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co for (i = 0; i < 2; ++i) for (j = 0; j < 2; ++j) if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + mem_mark_primary_se(opt, a[0].n, a[0].a); + mem_mark_primary_se(opt, a[1].n, a[1].a); // pairing single-end hits o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z); - if (0&&o) { // with proper pairing - } else { // no proper pairing - mem_mark_primary_se(opt, a[0].n, a[0].a); mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41); - mem_mark_primary_se(opt, a[1].n, a[1].a); mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81); - } + if (o && !(opt->flag&MEM_F_NOPAIRING)) { // with proper pairing + int is_multi[2], q_se[2], q_pe, is_tandem[2]; + // check if an end has multiple hits even after mate-SW + for (i = 0; i < 2; ++i) { + for (j = 1; j < a[i].n; ++j) + if (a[i].a[j].secondary < 0) break; + is_multi[i] = j < a[i].n? 
1 : 0; + } + if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score + // compute mapQ for the best SE hit + for (i = 0; i < 2; ++i) { + q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); + is_tandem[i] = (a[i].a[0].csub > a[i].a[0].sub); + } + q_pe = (int)(MEM_MAPQ_COEF * (1. - (double)(subo>>32) / (o>>32)) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499); + // the following assumes no split hits + if (z[0] == 0 && z[1] == 0) { // the best hit + q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; + q_se[0] = is_tabdem[0]? q_se[0] : q_pe; + q_se[1] = is_tabdem[1]? q_se[1] : q_pe; + } else { + double m[2]; + m[0] = approx_match(opt, a[0].a[0]) + approx_match(opt, a[1].a[0]); + m[1] = approx_match(opt, a[0].a[z[0]]) + approx_match(opt, a[1].a[z[1]]); + } + } else goto no_pairing; + return n; + +no_pairing: + mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41); + mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81); return n; } From a7d574d125bd99bfb299bf7686a1306215c03ab3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Feb 2013 01:11:38 -0500 Subject: [PATCH 096/169] backup comments --- bwamem.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 3d1b9c5..0c9c7b9 100644 --- a/bwamem.c +++ b/bwamem.c @@ -26,11 +26,32 @@ void mem_fill_scmat(int a, int b, int8_t mat[25]) for (j = 0; j < 5; ++j) mat[k++] = 0; } +/* Theory on probability and scoring *ungapped* alignment + * + * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution + * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate + * + * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. 
Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x + * + * If the matching score is x and mismatch penalty is -y, we can compute error rate e: + * e = .75 * exp[-log(4) * y/x] + * + * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)} + * = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l) + * + * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale: + * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x) + * + * + * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1) + * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4) + */ + mem_opt_t *mem_opt_init() { mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); - o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->flag = 0; o->min_seed_len = 19; o->split_width = 10; From 5626fe29b7b07cf9efc4ee625d75ceca8f319de3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Feb 2013 19:11:44 -0500 Subject: [PATCH 097/169] Well, at least output sth --- bwamem.c | 3 +++ bwamem.h | 1 + bwamem_pair.c | 52 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/bwamem.c b/bwamem.c index 0c9c7b9..f9415b2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -45,6 +45,8 @@ void mem_fill_scmat(int a, int b, int8_t mat[25]) * * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1) * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4) + * + * When there are gaps, l should be the length of alignment matches (i.e. 
the M operator in CIGAR) */ mem_opt_t *mem_opt_init() @@ -63,6 +65,7 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->pe_dir = 0<<1|1; + o->pen_unpaired = 50; mem_fill_scmat(o->a, o->b, o->mat); return o; } diff --git a/bwamem.h b/bwamem.h index d6e9f01..43a5401 100644 --- a/bwamem.h +++ b/bwamem.h @@ -28,6 +28,7 @@ typedef struct { int n_threads, chunk_size; int pe_dir; float mask_level, chain_drop_ratio; + int pen_unpaired; // phred-scaled penalty for unpaired reads int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; diff --git a/bwamem_pair.c b/bwamem_pair.c index 92b8842..dc46f44 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -159,13 +159,13 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } -static inline double approx_match(const mem_opt_t *opt, const mem_alnreg_v *a) +static inline double aln_q(const mem_opt_t *opt, const mem_alnreg_t *a) { int l = a->qe - a->qb < a->re - a->rb? a->qe - a->qb : a->re - a->rb; - return l - (double)(l * opt->a - a->score) / (opt->a + opt->b); + return (int)(6.02 * (l - (double)a->score / opt->a) + .499); } -uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, uint64_t *sub, int z[2]) +int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v; @@ -177,7 +177,7 @@ uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const pair64_t key; mem_alnreg_t *e = &a[r].a[i]; key.x = e->rb < l_pac? 
e->rb : (l_pac<<1) - 1 - e->rb; // forward position - key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; + key.y = (uint64_t)aln_q(opt, e) << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; kv_push(pair64_t, v, key); } } @@ -192,19 +192,17 @@ uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const if (y[which] < 0) continue; // no previous hits for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) int64_t dist; - int raw_score, score; + int q; double ns; uint64_t x, pair; if ((v.a[k].y&3) != which) continue; dist = (int64_t)v.a[i].x - v.a[k].x; if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; - raw_score = (v.a[i].y>>32) + (v.a[i].y>>32); - if (raw_score + 20 * opt->a < (subo.x>>8&0xffffff)) continue; // skip the following if the score is too small ns = (dist - pes[dir].avg) / pes[dir].std; - score = (int)(raw_score - 4.343 / 23. * (opt->a + opt->b) * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); + q = (int)((v.a[i].y>>32) + (v.a[i].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; - x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair ^ id<<8)&0xff); + x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); if (x > o.x) subo = o, o.x = x, o.y = pair; else if (x > subo.x) subo.x = x, subo.y = pair; } @@ -217,8 +215,8 @@ uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const z[v.a[k].y&1] = v.a[k].y<<32>>34; } free(v.a); - *sub = subo.x; - return o.x; + *sub = subo.x>>32; + return o.x>>32; } int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) @@ -226,11 +224,12 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); extern void mem_sam_se(const 
mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); + extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); + extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); - int n = 0, i, j, z[2]; + int n = 0, i, j, z[2], o, subo; kstring_t str; mem_alnreg_t b[2][2]; - uint64_t o, subo; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment @@ -249,7 +248,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // pairing single-end hits o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z); if (o && !(opt->flag&MEM_F_NOPAIRING)) { // with proper pairing - int is_multi[2], q_se[2], q_pe, is_tandem[2]; + int is_multi[2], q_se[2], q_pe, is_tandem[2], extra_flag = 1, un; + bwahit_t h[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < a[i].n; ++j) @@ -262,17 +262,29 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); is_tandem[i] = (a[i].a[0].csub > a[i].a[0].sub); } - q_pe = (int)(MEM_MAPQ_COEF * (1. - (double)(subo>>32) / (o>>32)) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499); + un = aln_q(opt, &a[0].a[0]) + aln_q(opt, &a[1].a[0]) + opt->pen_unpaired; + subo = subo < un? subo : un; + q_pe = subo - o; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; - q_se[0] = is_tabdem[0]? q_se[0] : q_pe; - q_se[1] = is_tabdem[1]? q_se[1] : q_pe; + q_se[0] = is_tandem[0]? q_se[0] : q_pe; + q_se[1] = is_tandem[1]? 
q_se[1] : q_pe; + extra_flag |= 2; } else { - double m[2]; - m[0] = approx_match(opt, a[0].a[0]) + approx_match(opt, a[1].a[0]); - m[1] = approx_match(opt, a[0].a[z[0]]) + approx_match(opt, a[1].a[z[1]]); + if (o > un) { // then move the pair + q_se[0] = z[0] == 0? q_se[0] : 0; + q_se[1] = z[1] == 0? q_se[1] : 0; + if (q_se[0] == 0) q_se[0] = q_se[1]; + if (q_se[1] == 0) q_se[1] = q_se[0]; + } else { // the unpaired alignment is much better + z[0] = z[1] = 0; + } } + mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; + mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP); s[0].sam = strdup(str.s); str.l = 0; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP); s[1].sam = str.s; } else goto no_pairing; return n; From ea8f4f4d34b05be0c3a821c9eede57fae39c5477 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Feb 2013 20:26:57 -0500 Subject: [PATCH 098/169] clean bill from valgrind --- bwamem.c | 5 ++++- bwamem_pair.c | 2 +- ksw.c | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index f9415b2..f0e1a29 100644 --- a/bwamem.c +++ b/bwamem.c @@ -363,7 +363,10 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a) a[i].qe = a[i].qb; } for (i = 1, m = 1; i < n; ++i) // exclude identical hits - if (a[i].qe > a[i].qb) a[m++] = a[i]; + if (a[i].qe > a[i].qb) { + if (m != i) a[m++] = a[i]; + else ++m; + } return m; } diff --git a/bwamem_pair.c b/bwamem_pair.c index dc46f44..092eee2 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -153,7 +153,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me } ++n; } - if (rev == 0) free(rev); + if (rev) free(rev); free(ref); } return n; diff --git a/ksw.c b/ksw.c index 8d741a6..742fec9 100644 --- a/ksw.c +++ b/ksw.c @@ -447,7 +447,7 @@ 
static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { if (*n_cigar == *m_cigar) { *m_cigar = *m_cigar? (*m_cigar)<<1 : 4; - cigar = realloc(cigar, (*m_cigar) << 4); + cigar = realloc(cigar, (*m_cigar) << 2); } cigar[(*n_cigar)++] = len<<4 | op; } else cigar[(*n_cigar)-1] += len<<4; @@ -520,8 +520,8 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; } - if (i >= 0) push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); - if (k >= 0) push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); + if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); + if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; *n_cigar_ = n_cigar, *cigar_ = cigar; From 41624fb347fcf6dfeb850b920e79dfa4a5781871 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Feb 2013 20:43:22 -0500 Subject: [PATCH 099/169] bugfix: choosing the worse instead of the best --- bwamem_pair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 092eee2..1302414 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -203,8 +203,8 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ q = (int)((v.a[i].y>>32) + (v.a[i].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); - if (x > o.x) subo = o, o.x = x, o.y = pair; - else if (x > subo.x) subo.x = x, subo.y = pair; + if (x < o.x) subo = o, o.x = x, o.y = pair; + else if (x < subo.x) subo.x = x, subo.y = pair; } } y[v.a[i].y&3] = i; From a9cae8c9af8a3e8b9983f146b14b7672a832f463 Mon Sep 17 00:00:00 2001 From: 
Heng Li Date: Thu, 21 Feb 2013 10:39:17 -0500 Subject: [PATCH 100/169] minor changes --- bwamem_pair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 1302414..7dc67fe 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -268,13 +268,16 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; + if (q_pe > 60) q_pe = 60; q_se[0] = is_tandem[0]? q_se[0] : q_pe; q_se[1] = is_tandem[1]? q_se[1] : q_pe; extra_flag |= 2; } else { if (o > un) { // then move the pair - q_se[0] = z[0] == 0? q_se[0] : 0; - q_se[1] = z[1] == 0? q_se[1] : 0; + int tmp[2]; + tmp[0] = q_se[0]; tmp[1] = q_se[1]; + q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; + q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? tmp[0] : q_pe; if (q_se[0] == 0) q_se[0] = q_se[1]; if (q_se[1] == 0) q_se[1] = q_se[0]; } else { // the unpaired alignment is much better From 84a328764a4f48a9b8686353c947e4e9edabc41c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 11:42:30 -0500 Subject: [PATCH 101/169] bugfix: mis-chaining caused by integer overflow I really need to rewrite kbtree some time. 
--- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index f0e1a29..2a9de82 100644 --- a/bwamem.c +++ b/bwamem.c @@ -159,7 +159,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) #include "kbtree.h" -#define chain_cmp(a, b) ((a).pos - (b).pos) +#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) KBTREE_INIT(chn, mem_chain_t, chain_cmp) static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p) From f8829318cf4dee5d05b538bac58f66562e25f3a2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 12:25:20 -0500 Subject: [PATCH 102/169] weakened the chain filter --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 2a9de82..8122f70 100644 --- a/bwamem.c +++ b/bwamem.c @@ -316,7 +316,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (a[j].p2 == 0) a[j].p2 = a[i].p; - if (a[i].w < a[j].w * opt->chain_drop_ratio) + if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) break; } } From 54da54ffd4aaf39f88c6051025561c7cf3d44b76 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 12:52:00 -0500 Subject: [PATCH 103/169] extend more seeds (and thus slower...) 
--- bwamem.c | 4 +++- bwamem.h | 4 +++- fastmap.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 8122f70..9bb5ad0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -62,6 +62,7 @@ mem_opt_t *mem_opt_init() o->max_ins = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; + o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; o->pe_dir = 0<<1|1; @@ -186,7 +187,8 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; - while ((a = smem_next(itr, opt->min_seed_len<<1, opt->split_width)) != 0) { // to find all SMEM and some internal MEM + int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM int i; for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start bwtintv_t *p = &a->a[i]; diff --git a/bwamem.h b/bwamem.h index 43a5401..6b191ae 100644 --- a/bwamem.h +++ b/bwamem.h @@ -27,7 +27,9 @@ typedef struct { int min_seed_len, max_occ, max_chain_gap; int n_threads, chunk_size; int pe_dir; - float mask_level, chain_drop_ratio; + float mask_level; + float chain_drop_ratio; + float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor int pen_unpaired; // phred-scaled penalty for unpaired reads int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset diff --git a/fastmap.c b/fastmap.c index a2d7d94..91a4ecb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,12 +24,13 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "PHk:c:v:s:")) >= 0) { + while ((c = getopt(argc, argv, "PHk:c:v:s:r:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= 
MEM_F_HARDCLIP; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') mem_verbose = atoi(optarg); + else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { @@ -38,6 +39,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); fprintf(stderr, "\n"); free(opt); From cfbc4c89e32a74a47cd25c695058beb31ed517f4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 14:34:10 -0500 Subject: [PATCH 104/169] perform extension when there are, say, 20bp tandem --- bwamem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 9bb5ad0..ec6aeff 100644 --- a/bwamem.c +++ b/bwamem.c @@ -463,15 +463,14 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; + if (a->score >= best.score) csub = best.score, best = *a; if (mem_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); - // check how many seeds have been covered + // jump to the next seed that: 1) has no overlap with the previous seed; 2) is not fully contained in the alignment for (i = k + 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; - if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) - break; + 
if ((t-1)->rbeg + (t-1)->len >= t->rbeg || (t-1)->qbeg + (t-1)->len >= t->qbeg) break; + if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) break; } - if (a->score >= best.score) csub = best.score, best = *a; - if (i >= c->n) break; // all seeds are included; no need to proceed k = i; } if (a->score < best.score) *a = best; From a578688fa80212b800f6f82842269095dba22f4e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 14:58:51 -0500 Subject: [PATCH 105/169] generate multiple alignments from one chain --- bwamem.c | 46 ++++++++++++++++++++-------------------------- bwamem.h | 2 +- kvec.h | 12 ++++++------ 3 files changed, 27 insertions(+), 33 deletions(-) diff --git a/bwamem.c b/bwamem.c index ec6aeff..2df9c53 100644 --- a/bwamem.c +++ b/bwamem.c @@ -415,16 +415,14 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l > 1? l : 1; } -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. 
When that happens, we should use a SW or extend more seeds - int i, k, csub = 0; + int i, k; int64_t rlen, rmax[2], tmp, max = 0, max_i = 0; const mem_seed_t *s; uint8_t *rseq = 0; - mem_alnreg_t best; - memset(&best, 0, sizeof(mem_alnreg_t)); - memset(a, 0, sizeof(mem_alnreg_t)); + av->n = 0; // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -441,6 +439,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (rlen != rmax[1] - rmax[0]) return; for (k = 0; k < c->n;) { + mem_alnreg_t *a; + a = kv_pushp(mem_alnreg_t, *av); s = &c->seeds[k]; memset(a, 0, sizeof(mem_alnreg_t)); if (s->qbeg) { // left extension @@ -463,9 +463,14 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (a->score >= best.score) csub = best.score, best = *a; if (mem_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); - // jump to the next seed that: 1) has no overlap with the previous seed; 2) is not fully contained in the alignment + // compute seedcov + for (i = 0, a->seedcov = 0; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained + a->seedcov += t->len; // this is not very accurate, but for approx. 
mapQ, this is good enough + } + // jump to the next seed that: 1) has no overlap with the previous seed, or 2) is not fully contained in the alignment for (i = k + 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; if ((t-1)->rbeg + (t-1)->len >= t->rbeg || (t-1)->qbeg + (t-1)->len >= t->qbeg) break; @@ -473,18 +478,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } k = i; } - if (a->score < best.score) *a = best; - a->csub = csub; free(rseq); - - // compute seedcov - if (c->n > 1) { - for (i = 0, a->seedcov = 0; i < c->n; ++i) { - s = &c->seeds[i]; - if (s->qbeg >= a->qb && s->qbeg + s->len <= a->qe && s->rbeg >= a->rb && s->rbeg + s->len <= a->re) // seed fully contained - a->seedcov += s->len; // this is not very accurate, but for approx. mapQ, this is good enough - } - } else a->seedcov = c->seeds[0].len; } /***************************** @@ -650,21 +644,23 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) { - int i; + int i, j; mem_chain_v chn; - mem_alnreg_v regs; + mem_alnreg_v regs, tmp; for (i = 0; i < s->l_seq; ++i) s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (mem_verbose >= 4) mem_print_chain(bns, &chn); - regs.n = regs.m = chn.n; - regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); + kv_init(regs); kv_init(tmp); for (i = 0; i < chn.n; ++i) { - mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], ®s.a[i]); + mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], &tmp); + for (j = 0; j < tmp.n; ++j) + kv_push(mem_alnreg_t, regs, tmp.a[j]); free(chn.a[i].seeds); } free(chn.a); + regs.n = mem_sort_and_dedup(regs.n, regs.a); return regs; } @@ -683,10 +679,8 @@ static void *worker1(void *data) { worker_t *w = (worker_t*)data; 
int i; - for (i = w->start; i < w->n; i += w->step) { + for (i = w->start; i < w->n; i += w->step) w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); - w->regs[i].n = mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); - } return 0; } diff --git a/bwamem.h b/bwamem.h index 6b191ae..f20663e 100644 --- a/bwamem.h +++ b/bwamem.h @@ -80,7 +80,7 @@ void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains); -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a); +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *a); uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); diff --git a/kvec.h b/kvec.h index 57204d6..9c9ca6e 100644 --- a/kvec.h +++ b/kvec.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, by Attractive Chaos + Copyright (c) 2008, by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -76,15 +76,15 @@ int main() { (v).a[(v).n++] = (x); \ } while (0) -#define kv_pushp(type, v) (((v).n == (v).m)? \ +#define kv_pushp(type, v) ((((v).n == (v).m)? \ ((v).m = ((v).m? (v).m<<1 : 2), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : 0), ((v).a + ((v).n++)) + : 0), &(v).a[(v).n++]) -#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ +#define kv_a(type, v, i) (((v).m <= (size_t)(i)? 
\ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : (v).n <= (size_t)(i)? (v).n = (i) \ - : 0), (v).a[(i)] + : (v).n <= (size_t)(i)? (v).n = (i) + 1 \ + : 0), (v).a[(i)]) #endif From d4cf6d97a66c407d060735282cb884d7571ea6be Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 15:04:31 -0500 Subject: [PATCH 106/169] bugfix: memory leak --- bwamem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 2df9c53..412235a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -470,10 +470,10 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough } - // jump to the next seed that: 1) has no overlap with the previous seed, or 2) is not fully contained in the alignment + // jump to the next seed that: 1) has no >7bp overlap with the previous seed, or 2) is not fully contained in the alignment for (i = k + 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; - if ((t-1)->rbeg + (t-1)->len >= t->rbeg || (t-1)->qbeg + (t-1)->len >= t->qbeg) break; + if ((t-1)->rbeg + (t-1)->len >= t->rbeg + 7 || (t-1)->qbeg + (t-1)->len >= t->qbeg + 7) break; if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) break; } k = i; @@ -659,7 +659,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn kv_push(mem_alnreg_t, regs, tmp.a[j]); free(chn.a[i].seeds); } - free(chn.a); + free(chn.a); free(tmp.a); regs.n = mem_sort_and_dedup(regs.n, regs.a); return regs; } From 58e4cc207fefe85ec8ac32f86f2d75885563b1f3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 21:26:01 -0500 Subject: [PATCH 107/169] bugfix: 1) fill seedcov; 2) pairing not working --- bwamem_pair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 
deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 7dc67fe..3db6882 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -142,6 +142,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me b.score = aln.score; b.csub = aln.score2; b.secondary = -1; + b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1; // printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); kv_push(mem_alnreg_t, *ma, b); // make room for a new element // move b s.t. ma is sorted @@ -183,7 +184,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - o.x = o.y = subo.x = subo.y = 0; + o.x = subo.x = (uint64_t)-1; o.y = subo.y = 0; for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction int dir = r<<1 | (v.a[i].y>>1&1), which; @@ -245,9 +246,9 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); mem_mark_primary_se(opt, a[0].n, a[0].a); mem_mark_primary_se(opt, a[1].n, a[1].a); + if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits - o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z); - if (o && !(opt->flag&MEM_F_NOPAIRING)) { // with proper pairing + if ((o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { int is_multi[2], q_se[2], q_pe, is_tandem[2], extra_flag = 1, un; bwahit_t h[2]; // check if an end has multiple hits even after mate-SW From 81fe6f8e382bf038dd402673c8a6ec5746b77f9d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 10:57:07 -0500 Subject: [PATCH 108/169] bugfix: a typo leading to wrong pairing --- bwamem_pair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/bwamem_pair.c b/bwamem_pair.c index 3db6882..cab7c2f 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -201,7 +201,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; ns = (dist - pes[dir].avg) / pes[dir].std; - q = (int)((v.a[i].y>>32) + (v.a[i].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); + q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); if (x < o.x) subo = o, o.x = x, o.y = pair; From d5820177c63497aae8f1f83ffc0230924de47f85 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 11:02:14 -0500 Subject: [PATCH 109/169] bugfix: wrong mate-sw qry coor for rev --- bwamem_pair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index cab7c2f..d926857 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -136,7 +136,8 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len) { - b.qb = aln.qb; b.qe = aln.qe + 1; + b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; + b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; b.score = aln.score; From dfc63acc11dd86da41523dd190816260889fe396 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 11:06:36 -0500 Subject: [PATCH 110/169] bugfix: another ">" vs. "<" bug That hurts, as I am going to reverse all these again! 
--- bwamem_pair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index d926857..6b9c490 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -275,7 +275,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = is_tandem[1]? q_se[1] : q_pe; extra_flag |= 2; } else { - if (o > un) { // then move the pair + if (o < un) { // then move the pair int tmp[2]; tmp[0] = q_se[0]; tmp[1] = q_se[1]; q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; From ed08d08f364be17dde6c6a7cdda264181e825a07 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 11:17:31 -0500 Subject: [PATCH 111/169] fixed bugs caused by integer overflow --- bwamem_pair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 6b9c490..7340b05 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -185,7 +185,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - o.x = subo.x = (uint64_t)-1; o.y = subo.y = 0; + o.x = subo.x = o.x = subo.x = 0x7fffffffULL<<32; o.y = subo.y = 0; for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction int dir = r<<1 | (v.a[i].y>>1&1), which; @@ -267,6 +267,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co un = aln_q(opt, &a[0].a[0]) + aln_q(opt, &a[1].a[0]) + opt->pen_unpaired; subo = subo < un? subo : un; q_pe = subo - o; + if (q_pe > 60) q_pe = 60; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; @@ -282,6 +283,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? 
tmp[0] : q_pe; if (q_se[0] == 0) q_se[0] = q_se[1]; if (q_se[1] == 0) q_se[1] = q_se[0]; + a[0].a[z[0]].secondary = a[1].a[z[1]].secondary = -2; } else { // the unpaired alignment is much better z[0] = z[1] = 0; } From c5ce72f5936828513c6f87f04723dfe5a22c56c4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 12:10:20 -0500 Subject: [PATCH 112/169] scoring pairs by score, not by errors This is important for bwa-mem which does local alignment. A short exact match is worse than a long inexact match. Also fixed a bug in approximating mapping quality. --- bwamem.c | 2 +- bwamem_pair.c | 35 +++++++++++++++++------------------ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/bwamem.c b/bwamem.c index 412235a..431590f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -66,7 +66,7 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->pe_dir = 0<<1|1; - o->pen_unpaired = 50; + o->pen_unpaired = 9; mem_fill_scmat(o->a, o->b, o->mat); return o; } diff --git a/bwamem_pair.c b/bwamem_pair.c index 7340b05..9fa3505 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -161,12 +161,6 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } -static inline double aln_q(const mem_opt_t *opt, const mem_alnreg_t *a) -{ - int l = a->qe - a->qb < a->re - a->rb? a->qe - a->qb : a->re - a->rb; - return (int)(6.02 * (l - (double)a->score / opt->a) + .499); -} - int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); @@ -179,13 +173,14 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ pair64_t key; mem_alnreg_t *e = &a[r].a[i]; key.x = e->rb < l_pac? 
e->rb : (l_pac<<1) - 1 - e->rb; // forward position - key.y = (uint64_t)aln_q(opt, e) << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; + key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; kv_push(pair64_t, v, key); } } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - o.x = subo.x = o.x = subo.x = 0x7fffffffULL<<32; o.y = subo.y = 0; + o.x = subo.x = o.y = subo.y = 0; + //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction int dir = r<<1 | (v.a[i].y>>1&1), which; @@ -199,14 +194,17 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ uint64_t x, pair; if ((v.a[k].y&3) != which) continue; dist = (int64_t)v.a[i].x - v.a[k].x; + //printf("%d: %lld\n", k, dist); if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; ns = (dist - pes[dir].avg) / pes[dir].std; - q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); + q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4) + if (q < 0) q = 0; pair = (uint64_t)k<<32 | i; x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); - if (x < o.x) subo = o, o.x = x, o.y = pair; - else if (x < subo.x) subo.x = x, subo.y = pair; + //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); + if (x > o.x) subo = o, o.x = x, o.y = pair; + else if (x > subo.x) subo.x = x, subo.y = pair; } } y[v.a[i].y&3] = i; @@ -264,9 +262,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); is_tandem[i] = (a[i].a[0].csub > a[i].a[0].sub); } - un = aln_q(opt, &a[0].a[0]) + aln_q(opt, &a[1].a[0]) + opt->pen_unpaired; - subo = subo < un? 
subo : un; - q_pe = subo - o; + un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + if (un < 0) un = 0; + subo = subo > un? subo : un; + q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; if (q_pe > 60) q_pe = 60; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit @@ -276,14 +275,14 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = is_tandem[1]? q_se[1] : q_pe; extra_flag |= 2; } else { - if (o < un) { // then move the pair + if (o > un) { // then move the pair int tmp[2]; tmp[0] = q_se[0]; tmp[1] = q_se[1]; q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? tmp[0] : q_pe; - if (q_se[0] == 0) q_se[0] = q_se[1]; - if (q_se[1] == 0) q_se[1] = q_se[0]; - a[0].a[z[0]].secondary = a[1].a[z[1]].secondary = -2; + for (i = 0; i < 2; ++i) + if (a[i].a[z[i]].secondary >= 0) + a[i].a[z[i]].sub = a[i].a[a[i].a[z[i]].secondary].score, a[i].a[z[i]].secondary = -2; } else { // the unpaired alignment is much better z[0] = z[1] = 0; } From c0093264de2e8979f5b98318c57ae567ba3db0bf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 12:34:46 -0500 Subject: [PATCH 113/169] wrong logic: paired mapQ should 60) q_pe = 60; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit - q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; + q_pe = q_pe < q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; if (q_pe > 60) q_pe = 60; q_se[0] = is_tandem[0]? q_se[0] : q_pe; q_se[1] = is_tandem[1]? 
q_se[1] : q_pe; From 6a16edc15effa0ba813984066e5d2539c227e0a3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 12:47:26 -0500 Subject: [PATCH 114/169] tuning PE mapQ --- bwamem_pair.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 5dd5927..4c0b908 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -248,7 +248,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits if ((o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { - int is_multi[2], q_se[2], q_pe, is_tandem[2], extra_flag = 1, un; + int is_multi[2], q_se[2], q_pe, extra_flag = 1; bwahit_t h[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { @@ -258,31 +258,26 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score // compute mapQ for the best SE hit - for (i = 0; i < 2; ++i) { + for (i = 0; i < 2; ++i) q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); - is_tandem[i] = (a[i].a[0].csub > a[i].a[0].sub); - } - un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; - if (un < 0) un = 0; - subo = subo > un? subo : un; q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; if (q_pe > 60) q_pe = 60; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit - q_pe = q_pe < q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; - if (q_pe > 60) q_pe = 60; - q_se[0] = is_tandem[0]? q_se[0] : q_pe; - q_se[1] = is_tandem[1]? q_se[1] : q_pe; + q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe; + q_se[1] = q_se[1] > q_pe? 
q_se[1] : q_pe; extra_flag |= 2; } else { - if (o > un) { // then move the pair + if (o > a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired) { // then move the pair int tmp[2]; + q_pe = q_pe > 7? q_pe - 7 : 0; tmp[0] = q_se[0]; tmp[1] = q_se[1]; q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? tmp[0] : q_pe; for (i = 0; i < 2; ++i) if (a[i].a[z[i]].secondary >= 0) a[i].a[z[i]].sub = a[i].a[a[i].a[z[i]].secondary].score, a[i].a[z[i]].secondary = -2; + extra_flag |= 2; } else { // the unpaired alignment is much better z[0] = z[1] = 0; } From 38fc5c88223cd700e8b5fb3370baa2709acff88c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 12:54:42 -0500 Subject: [PATCH 115/169] reduce mapQ when a read is moved --- bwamem_pair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 4c0b908..f5ab495 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -268,9 +268,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe; extra_flag |= 2; } else { - if (o > a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired) { // then move the pair - int tmp[2]; - q_pe = q_pe > 7? q_pe - 7 : 0; + int un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + if (o > un) { // then move the pair + int tmp[2], q_un = (o - un) * 6; + q_pe = q_pe < q_un? q_pe : q_un; tmp[0] = q_se[0]; tmp[1] = q_se[1]; q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? 
tmp[0] : q_pe; From 6c1a1137539ca806bebe147c9be0effdf5f4ba4c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 13:26:23 -0500 Subject: [PATCH 116/169] mate-SW for all high-scoring hits --- bwamem_pair.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index f5ab495..21d31ad 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -229,20 +229,19 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co int n = 0, i, j, z[2], o, subo; kstring_t str; - mem_alnreg_t b[2][2]; + mem_alnreg_v b[2]; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment + kv_init(b[0]); kv_init(b[1]); for (i = 0; i < 2; ++i) - for (j = 0; j < 2; ++j) b[i][j].score = -1; - for (i = 0; i < 2; ++i) { - for (j = 0; j < a[i].n && j < 2; ++j) b[i][j] = a[i].a[j]; - if (b[i][0].score > 0 && b[i][1].score > 0 && b[i][1].score < b[i][0].score * 0.8) - b[i][1].score = -1; - } + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); for (i = 0; i < 2; ++i) - for (j = 0; j < 2; ++j) - if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + for (j = 0; j < b[i].n; ++j) + n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + free(b[0].a); free(b[1].a); mem_mark_primary_se(opt, a[0].n, a[0].a); mem_mark_primary_se(opt, a[1].n, a[1].a); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; From ba15b787cb59be38a82cc3314569e65aec2061cf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 14:47:57 -0500 Subject: [PATCH 117/169] rework PE mapq; don't know if better --- bwamem.c | 1 + bwamem_pair.c | 45 ++++++++++++++++++++++----------------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/bwamem.c b/bwamem.c index 431590f..931e685 100644 --- a/bwamem.c +++ b/bwamem.c @@ -604,6 +604,7 @@ int 
mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; double identity; sub = a->csub > sub? a->csub : sub; + if (sub >= a->score) return 0; l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; diff --git a/bwamem_pair.c b/bwamem_pair.c index 21d31ad..46d1dde 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -14,7 +14,6 @@ #define OUTLIER_BOUND 2.0 #define MAPPING_BOUND 3.0 #define MAX_STDDEV 4.0 -#define EXT_STDDEV 4.0 void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); @@ -247,7 +246,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits if ((o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { - int is_multi[2], q_se[2], q_pe, extra_flag = 1; + int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; bwahit_t h[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { @@ -257,30 +256,30 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score // compute mapQ for the best SE hit - for (i = 0; i < 2; ++i) - q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); - q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; + score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; + subo = subo > score_un? 
subo : score_un; + q_pe = (o - subo) * 6; if (q_pe > 60) q_pe = 60; // the following assumes no split hits - if (z[0] == 0 && z[1] == 0) { // the best hit - q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe; - q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe; - extra_flag |= 2; - } else { - int un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; - if (o > un) { // then move the pair - int tmp[2], q_un = (o - un) * 6; - q_pe = q_pe < q_un? q_pe : q_un; - tmp[0] = q_se[0]; tmp[1] = q_se[1]; - q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; - q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? tmp[0] : q_pe; - for (i = 0; i < 2; ++i) - if (a[i].a[z[i]].secondary >= 0) - a[i].a[z[i]].sub = a[i].a[a[i].a[z[i]].secondary].score, a[i].a[z[i]].secondary = -2; - extra_flag |= 2; - } else { // the unpaired alignment is much better - z[0] = z[1] = 0; + if (o > score_un) { // paired alignment is preferred + mem_alnreg_t *c[2]; + c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]]; + for (i = 0; i < 2; ++i) { + if (c[i]->secondary >= 0) + c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; + q_se[i] = mem_approx_mapq_se(opt, c[i]); } + q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40; + q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40; + extra_flag |= 2; + // cap at the tandem repeat score + q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6; + q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? 
q_se[1] : (c[1]->score - c[1]->csub) * 6; + } else { // the unpaired alignment is preferred + z[0] = z[1] = 0; + q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); + q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); } mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; From 17c123d65a4ac81752a86c049f578f18166ebf38 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 16:38:48 -0500 Subject: [PATCH 118/169] print paired-end SAM --- bwamem.c | 92 ++++++++++++++++++++++++++++++--------------------- bwamem.h | 1 - bwamem_pair.c | 22 ++++++------ 3 files changed, 66 insertions(+), 49 deletions(-) diff --git a/bwamem.c b/bwamem.c index 931e685..b52d20b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -519,21 +519,37 @@ ret_gen_cigar: return cigar; } -void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard) + +void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { - int score, n_cigar, is_rev = 0, nn, rid, mid, is_unmapped = 0; +#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) + int score, n_cigar, is_rev = 0, nn, rid, mid, copy_mate = 0; uint32_t *cigar = 0; int64_t pos; + bwahit_t ptmp, *p = &ptmp; - kputs(s->name, str); - if (p && p->rb >= 0 && p->re < bns->l_pac<<1) { - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + if (!p_) { // in this case, generate an unmapped alignment + memset(&ptmp, 0, sizeof(bwahit_t)); + ptmp.rb = ptmp.re = -1; + } else ptmp = *p_; + p->flag |= m? 1 : 0; // is paired in sequencing + p->flag |= !is_mapped(p)? 4 : 0; // is mapped + p->flag |= m && !is_mapped(m)? 
8 : 0; // is mate mapped + if (m && !is_mapped(p) && is_mapped(m)) { + p->rb = m->rb; p->re = m->re; p->qb = 0; p->qe = s->l_seq; + copy_mate = 1; + } + p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand + p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand + kputs(s->name, str); kputc('\t', str); + if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate + if (!copy_mate) { + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) + } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - p->flag |= is_rev? 16 : 0; // reverse - p->flag |= p->mb >= 0? 1 : 0; // paired in sequencing - p->flag |= n_cigar == 0? 8 : 0; // FIXME: check why this may happen (this has already happened) - kputc('\t', str); kputw(p->flag, str); kputc('\t', str); + kputw(p->flag, str); kputc('\t', str); kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); kputw(p->qual, str); kputc('\t', str); if (n_cigar) { @@ -546,29 +562,29 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); } } else kputc('*', str); - if (p->mb >= 0 && p->mb < bns->l_pac<<1) { // then print mate pos and isize - pos = bns_depos(bns, p->mb < bns->l_pac? p->mb : p->me - 1, &is_rev); - nn = bns_cnt_ambi(bns, pos, p->me - p->mb, &mid); - kputc('\t', str); - if (mid == rid) kputc('=', str); - else kputs(bns->anns[mid].name, str); - kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str); - kputc('\t', str); - if (mid != rid) { - int64_t p0 = p->rb < bns->l_pac? 
p->rb : (bns->l_pac<<1) - 1 - p->rb; - int64_t p1 = p->mb < bns->l_pac? p->mb : (bns->l_pac<<1) - 1 - p->mb; - kputw(abs(p0 - p1), str); - } - kputc('\t', str); - } else kputsn("\t*\t0\t0\t", 7, str); - } else { // unaligned - is_unmapped = 1; - kputw(p? p->flag : 0, str); - kputs("\t*\t0\t0\t*\t*\t0\t0\t", str); + } else { // no coordinate + kputw(p->flag, str); + kputs("\t*\t0\t0\t*", str); + rid = -1; } - if (!is_rev) { // print SEQ and QUAL, the forward strand + if (m && is_mapped(m)) { // then print mate pos and isize + pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev); + nn = bns_cnt_ambi(bns, pos, m->re - m->rb, &mid); + kputc('\t', str); + if (mid == rid) kputc('=', str); + else kputs(bns->anns[mid].name, str); + kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str); + kputc('\t', str); + if (mid == rid) { + int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; + int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; + kputw(p0 - p1, str); + } else kputw(0, str); + kputc('\t', str); + } else kputsn("\t*\t0\t0\t", 7, str); + if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand int i, qb = 0, qe = s->l_seq; - if (!is_unmapped && is_hard) qb = p->qb, qe = p->qe; + if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; ks_resize(str, str->l + (qe - qb) + 1); for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; kputc('\t', str); @@ -579,7 +595,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } else kputc('*', str); } else { // the reverse strand int i, qb = 0, qe = s->l_seq; - if (!is_unmapped && is_hard) qb = p->qb, qe = p->qe; + if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; ks_resize(str, str->l + (qe - qb) + 1); for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; kputc('\t', str); @@ -589,10 +605,11 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons str->s[str->l] = 0; } else 
kputc('*', str); } - if (!is_unmapped && p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } - if (!is_unmapped && p->sub >= 0) { kputsn("\tss:i:", 6, str); kputw(p->sub, str); } + if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } kputc('\n', str); free(cigar); +#undef is_mapped } /************************ @@ -622,10 +639,9 @@ void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) h->sub = a->sub > a->csub? a->sub : a->csub; h->qual = 0; // quality unset h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set - h->mb = h->me = -2; // mate positions are unset } -void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag) +void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m) { int k; kstring_t str; @@ -637,9 +653,9 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b mem_alnreg2hit(&a->a[k], &h); h.flag |= extra_flag; h.qual = mem_approx_mapq_se(opt, &a->a[k]); - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } - } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP); + } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); s->sam = str.s; } @@ -693,7 +709,7 @@ static void *worker2(void *data) if (!(w->opt->flag&MEM_F_PE)) { for (i = 0; i < w->n; i += w->step) { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); - mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0); + mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } } else { diff --git a/bwamem.h b/bwamem.h 
index f20663e..4319911 100644 --- a/bwamem.h +++ b/bwamem.h @@ -58,7 +58,6 @@ typedef struct { int qb, qe, flag, qual; // optional info int score, sub; - int64_t mb, me; // mb: mate start; -1 if single-end; -2 if mate unmapped } bwahit_t; typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; diff --git a/bwamem_pair.c b/bwamem_pair.c index 46d1dde..3dce119 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -15,8 +15,6 @@ #define MAPPING_BOUND 3.0 #define MAX_STDDEV 4.0 -void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); - static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) { int j; @@ -221,14 +219,15 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); - extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag); + extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); - extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); + extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m); int n = 0, i, j, z[2], o, subo; kstring_t str; mem_alnreg_v b[2]; + bwahit_t h[2]; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment @@ -245,9 +244,8 @@ int 
mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co mem_mark_primary_se(opt, a[1].n, a[1].a); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits - if ((o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { + if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; - bwahit_t h[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < a[i].n; ++j) @@ -283,13 +281,17 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP); s[0].sam = strdup(str.s); str.l = 0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP); s[1].sam = str.s; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s; } else goto no_pairing; return n; no_pairing: - mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41); - mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81); + for (i = 0; i < 2; ++i) { + if (a[i].n) mem_alnreg2hit(&a[i].a[0], &h[i]); + else h[i].rb = h[i].re = -1; + } + mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); + mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); return n; } From f122fad5625b133640f1f5e1844002744e537724 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 17:09:40 -0500 Subject: [PATCH 119/169] minor code clean up bwtio.c is merged to bwt.c --- Makefile | 7 +++-- bwase.c | 11 
-------- bwt.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwtio.c | 77 ------------------------------------------------------- fastmap.c | 4 ++- utils.c | 4 +++ 6 files changed, 86 insertions(+), 93 deletions(-) delete mode 100644 bwtio.c diff --git a/Makefile b/Makefile index e11a04d..f14d906 100644 --- a/Makefile +++ b/Makefile @@ -3,10 +3,9 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ - bwaseqio.o bwase.o kstring.o -AOBJS= QSufSort.o bwt_gen.o \ - is.o bwtmisc.o bwtindex.o ksw.o bwape.o \ +LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o +AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ + is.o bwtmisc.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa diff --git a/bwase.c b/bwase.c index 8fa79ac..1f36aaa 100644 --- a/bwase.c +++ b/bwase.c @@ -489,17 +489,6 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } } -bntseq_t *bwa_open_nt(const char *prefix) -{ - bntseq_t *ntbns; - char *str; - str = (char*)calloc(strlen(prefix) + 10, 1); - strcat(strcpy(str, prefix), ".nt"); - ntbns = bns_restore(str); - free(str); - return ntbns; -} - void bwa_print_sam_SQ(const bntseq_t *bns) { int i; diff --git a/bwt.c b/bwt.c index 2903daa..7b37fe5 100644 --- a/bwt.c +++ b/bwt.c @@ -338,3 +338,79 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } + +/************************* + * Read/write BWT and SA * + *************************/ + +void bwt_dump_bwt(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fwrite(bwt->bwt, 4, 
bwt->bwt_size, fp); + fclose(fp); +} + +void bwt_dump_sa(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); + fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fclose(fp); +} + +void bwt_restore_sa(const char *fn, bwt_t *bwt) +{ + char skipped[256]; + FILE *fp; + bwtint_t primary; + + fp = xopen(fn, "rb"); + fread(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); + fread(skipped, sizeof(bwtint_t), 4, fp); // skip + fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + fread(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); + + bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa[0] = -1; + + fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fclose(fp); +} + +bwt_t *bwt_restore_bwt(const char *fn) +{ + bwt_t *bwt; + FILE *fp; + + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + fp = xopen(fn, "rb"); + fseek(fp, 0, SEEK_END); + bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; + bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); + fseek(fp, 0, SEEK_SET); + fread(&bwt->primary, sizeof(bwtint_t), 1, fp); + fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fread(bwt->bwt, 4, bwt->bwt_size, fp); + bwt->seq_len = bwt->L2[4]; + fclose(fp); + bwt_gen_cnt_table(bwt); + + return bwt; +} + +void bwt_destroy(bwt_t *bwt) +{ + if (bwt == 0) return; + free(bwt->sa); free(bwt->bwt); + free(bwt); +} diff --git a/bwtio.c b/bwtio.c deleted file mode 100644 index 7508609..0000000 --- a/bwtio.c +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include "bwt.h" -#include "utils.h" - -void bwt_dump_bwt(const char *fn, const bwt_t *bwt) -{ - 
FILE *fp; - fp = xopen(fn, "wb"); - fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fwrite(bwt->bwt, 4, bwt->bwt_size, fp); - fclose(fp); -} - -void bwt_dump_sa(const char *fn, const bwt_t *bwt) -{ - FILE *fp; - fp = xopen(fn, "wb"); - fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); - fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); - fclose(fp); -} - -void bwt_restore_sa(const char *fn, bwt_t *bwt) -{ - char skipped[256]; - FILE *fp; - bwtint_t primary; - - fp = xopen(fn, "rb"); - fread(&primary, sizeof(bwtint_t), 1, fp); - xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); - fread(skipped, sizeof(bwtint_t), 4, fp); // skip - fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - fread(&primary, sizeof(bwtint_t), 1, fp); - xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); - - bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; - bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); - bwt->sa[0] = -1; - - fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); - fclose(fp); -} - -bwt_t *bwt_restore_bwt(const char *fn) -{ - bwt_t *bwt; - FILE *fp; - - bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); - fp = xopen(fn, "rb"); - fseek(fp, 0, SEEK_END); - bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; - bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); - fseek(fp, 0, SEEK_SET); - fread(&bwt->primary, sizeof(bwtint_t), 1, fp); - fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fread(bwt->bwt, 4, bwt->bwt_size, fp); - bwt->seq_len = bwt->L2[4]; - fclose(fp); - bwt_gen_cnt_table(bwt); - - return bwt; -} - -void bwt_destroy(bwt_t *bwt) -{ - if (bwt == 0) return; - free(bwt->sa); free(bwt->bwt); - free(bwt); -} diff --git a/fastmap.c b/fastmap.c index 91a4ecb..d52a315 100644 --- a/fastmap.c +++ b/fastmap.c @@ 
-17,7 +17,7 @@ int main_mem(int argc, char *argv[]) mem_opt_t *opt; bwt_t *bwt; bntseq_t *bns; - int c, n; + int c, n, l; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; uint8_t *pac = 0; @@ -57,6 +57,8 @@ int main_mem(int argc, char *argv[]) pac = calloc(bns->l_pac/4+1, 1); fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); } + for (l = 0; l < bns->n_seqs; ++l) + printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); ks = kseq_init(fp); diff --git a/utils.c b/utils.c index 127c8fe..1cebaab 100644 --- a/utils.c +++ b/utils.c @@ -40,6 +40,10 @@ KSORT_INIT(128, pair64_t, pair64_lt) KSORT_INIT(64, uint64_t, ks_lt_generic) +/******************** + * System utilities * + ********************/ + FILE *err_xopen_core(const char *func, const char *fn, const char *mode) { FILE *fp = 0; From 545fb87feb5c4ff87b3e2c449f9571581abe89b8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 17:15:57 -0500 Subject: [PATCH 120/169] removed another part related to color-space --- bwtmisc.c | 51 --------------------------------------------------- main.c | 2 -- main.h | 1 - 3 files changed, 54 deletions(-) diff --git a/bwtmisc.c b/bwtmisc.c index c35d684..de96dc2 100644 --- a/bwtmisc.c +++ b/bwtmisc.c @@ -157,57 +157,6 @@ int bwa_bwtupdate(int argc, char *argv[]) return 0; } -const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; - -/* this function is not memory efficient, but this will make life easier - Ideally we should also change .amb files as one 'N' in the nucleotide - sequence leads to two ambiguous colors. I may do this later... 
*/ -uint8_t *bwa_pac2cspac_core(const bntseq_t *bns) -{ - uint8_t *pac, *cspac; - bwtint_t i; - int c1, c2; - pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); - cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); - rewind(bns->fp_pac); - c1 = pac[0]>>6; cspac[0] = c1<<6; - for (i = 1; i < bns->l_pac; ++i) { - c2 = pac[i>>2] >> (~i&3)*2 & 3; - cspac[i>>2] |= nst_color_space_table[(1< \n"); - return 1; - } - bns = bns_restore(argv[1]); - cspac = bwa_pac2cspac_core(bns); - bns_dump(bns, argv[2]); - // now write cspac - str = (char*)calloc(strlen(argv[2]) + 5, 1); - strcat(strcpy(str, argv[2]), ".pac"); - fp = xopen(str, "wb"); - fwrite(cspac, 1, bns->l_pac/4 + 1, fp); - ct = bns->l_pac % 4; - fwrite(&ct, 1, 1, fp); - fclose(fp); - bns_destroy(bns); - free(cspac); - return 0; -} - int bwa_bwt2sa(int argc, char *argv[]) { bwt_t *bwt; diff --git a/main.c b/main.c index fc63c2e..dbe9dd0 100644 --- a/main.c +++ b/main.c @@ -27,7 +27,6 @@ static int usage() fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); - fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); fprintf(stderr, "\n"); return 1; } @@ -52,7 +51,6 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); - else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1); else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); diff --git a/main.h b/main.h index 7b638ca..3e70362 100644 --- a/main.h +++ b/main.h @@ -6,7 +6,6 @@ extern "C" { 
#endif int bwa_fa2pac(int argc, char *argv[]); - int bwa_pac2cspac(int argc, char *argv[]); int bwa_pac2bwt(int argc, char *argv[]); int bwa_bwtupdate(int argc, char *argv[]); int bwa_bwt2sa(int argc, char *argv[]); From 6230f86799f30b836495beea1d81ed2d384c61b4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 17:23:34 -0500 Subject: [PATCH 121/169] merged bwtmisc.c to bwtindex.c bwtmisc.c implements routines related to indexing --- Makefile | 2 +- bwtindex.c | 149 +++++++++++++++++++++++++++++++++++++++++++- bwtmisc.c | 179 ----------------------------------------------------- 3 files changed, 147 insertions(+), 183 deletions(-) delete mode 100644 bwtmisc.c diff --git a/Makefile b/Makefile index f14d906..bfed694 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtmisc.o bwtindex.o bwape.o \ + is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa diff --git a/bwtindex.c b/bwtindex.c index c01fa95..298153d 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -36,11 +36,154 @@ #include "main.h" #include "utils.h" -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is); -void bwa_pac_rev_core(const char *fn, const char *fn_rev); +#ifdef _DIVBWT +#include "divsufsort.h" +#endif -int bwa_index(int argc, char *argv[]) +int is_bwt(ubyte_t *T, int n); + +int64_t bwa_seq_len(const char *fn_pac) { + FILE *fp; + int64_t pac_len; + ubyte_t c; + fp = xopen(fn_pac, "rb"); + fseek(fp, -1, SEEK_END); + pac_len = ftell(fp); + fread(&c, 1, 1, fp); + fclose(fp); + return (pac_len - 1) * 4 + (int)c; +} + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) +{ + bwt_t *bwt; + ubyte_t *buf, *buf2; + int i, pac_size; + FILE *fp; + + // initialization + bwt = (bwt_t*)calloc(1, 
sizeof(bwt_t)); + bwt->seq_len = bwa_seq_len(fn_pac); + bwt->bwt_size = (bwt->seq_len + 15) >> 4; + fp = xopen(fn_pac, "rb"); + + // prepare sequence + pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); + buf2 = (ubyte_t*)calloc(pac_size, 1); + fread(buf2, 1, pac_size, fp); + fclose(fp); + memset(bwt->L2, 0, 5 * 4); + buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); + for (i = 0; i < bwt->seq_len; ++i) { + buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; + ++bwt->L2[1+buf[i]]; + } + for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; + free(buf2); + + // Burrows-Wheeler Transform + if (use_is) { + bwt->primary = is_bwt(buf, bwt->seq_len); + } else { +#ifdef _DIVBWT + bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); +#else + err_fatal_simple("libdivsufsort is not compiled in."); +#endif + } + bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); + for (i = 0; i < bwt->seq_len; ++i) + bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); + free(buf); + return bwt; +} + +int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required! 
+{ + bwt_t *bwt; + int c, use_is = 1; + while ((c = getopt(argc, argv, "d")) >= 0) { + switch (c) { + case 'd': use_is = 0; break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); + return 1; + } + bwt = bwt_pac2bwt(argv[optind], use_is); + bwt_dump_bwt(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +void bwt_bwtupdate_core(bwt_t *bwt) +{ + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size + buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + for (i = k = 0; i < bwt->seq_len; ++i) { + if (i % OCC_INTERVAL == 0) { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) + } + if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 + ++c[bwt_B00(bwt, i)]; + } + // the last element + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); + // update bwt + free(bwt->bwt); bwt->bwt = buf; +} + +int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command +{ + bwt_t *bwt; + if (argc < 2) { + fprintf(stderr, "Usage: bwa bwtupdate \n"); + return 1; + } + bwt = bwt_restore_bwt(argv[1]); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(argv[1], bwt); + bwt_destroy(bwt); + return 0; +} + +int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command +{ + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_sa(bwt, sa_intv); + bwt_dump_sa(argv[optind+1], bwt); + 
bwt_destroy(bwt); + return 0; +} + +int bwa_index(int argc, char *argv[]) // the "index" command +{ + extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); + char *prefix = 0, *str, *str2, *str3; int c, algo_type = 0, is_64 = 0; clock_t t; diff --git a/bwtmisc.c b/bwtmisc.c deleted file mode 100644 index de96dc2..0000000 --- a/bwtmisc.c +++ /dev/null @@ -1,179 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -/* Contact: Heng Li */ - -#include -#include -#include -#include -#include "bntseq.h" -#include "utils.h" -#include "main.h" -#include "bwt.h" - -#ifdef _DIVBWT -#include "divsufsort.h" -#endif - -int is_bwt(ubyte_t *T, int n); - -int64_t bwa_seq_len(const char *fn_pac) -{ - FILE *fp; - int64_t pac_len; - ubyte_t c; - fp = xopen(fn_pac, "rb"); - fseek(fp, -1, SEEK_END); - pac_len = ftell(fp); - fread(&c, 1, 1, fp); - fclose(fp); - return (pac_len - 1) * 4 + (int)c; -} - -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) -{ - bwt_t *bwt; - ubyte_t *buf, *buf2; - int i, pac_size; - FILE *fp; - - // initialization - bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); - bwt->seq_len = bwa_seq_len(fn_pac); - bwt->bwt_size = (bwt->seq_len + 15) >> 4; - fp = xopen(fn_pac, "rb"); - - // prepare sequence - pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); - buf2 = (ubyte_t*)calloc(pac_size, 1); - fread(buf2, 1, pac_size, fp); - fclose(fp); - memset(bwt->L2, 0, 5 * 4); - buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); - for (i = 0; i < bwt->seq_len; ++i) { - buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; - ++bwt->L2[1+buf[i]]; - } - for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; - free(buf2); - - // Burrows-Wheeler Transform - if (use_is) { - bwt->primary = is_bwt(buf, bwt->seq_len); - } else { -#ifdef _DIVBWT - bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); -#else - err_fatal_simple("libdivsufsort is not compiled in."); -#endif - } - bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); - for (i = 0; i < bwt->seq_len; ++i) - bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); - free(buf); - return bwt; -} - -int bwa_pac2bwt(int argc, char *argv[]) -{ - bwt_t *bwt; - int c, use_is = 1; - while ((c = getopt(argc, argv, "d")) >= 0) { - switch (c) { - case 'd': use_is = 0; break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); - return 1; - } - bwt = bwt_pac2bwt(argv[optind], use_is); - 
bwt_dump_bwt(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -} - -#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) - -void bwt_bwtupdate_core(bwt_t *bwt) -{ - bwtint_t i, k, c[4], n_occ; - uint32_t *buf; - - n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; - bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size - buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt - c[0] = c[1] = c[2] = c[3] = 0; - for (i = k = 0; i < bwt->seq_len; ++i) { - if (i % OCC_INTERVAL == 0) { - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) - } - if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 - ++c[bwt_B00(bwt, i)]; - } - // the last element - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); - // update bwt - free(bwt->bwt); bwt->bwt = buf; -} - -int bwa_bwtupdate(int argc, char *argv[]) -{ - bwt_t *bwt; - if (argc < 2) { - fprintf(stderr, "Usage: bwa bwtupdate \n"); - return 1; - } - bwt = bwt_restore_bwt(argv[1]); - bwt_bwtupdate_core(bwt); - bwt_dump_bwt(argv[1], bwt); - bwt_destroy(bwt); - return 0; -} - -int bwa_bwt2sa(int argc, char *argv[]) -{ - bwt_t *bwt; - int c, sa_intv = 32; - while ((c = getopt(argc, argv, "i:")) >= 0) { - switch (c) { - case 'i': sa_intv = atoi(optarg); break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); - return 1; - } - bwt = bwt_restore_bwt(argv[optind]); - bwt_cal_sa(bwt, sa_intv); - bwt_dump_sa(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -} From 904c3205c0a7c205da3ccf5ca6fae3860858c80d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 13:26:50 -0500 Subject: [PATCH 122/169] removed a few unused variables These variables have been assigned but never actually used. Reported by gcc-4.7. Lower version cannot give such warnings. 
--- QSufSort.c | 3 --- bwamem.c | 10 +++++----- bwase.c | 8 ++++---- bwtsw2_aux.c | 4 ++-- stdaln.c | 6 ++---- 5 files changed, 13 insertions(+), 18 deletions(-) diff --git a/QSufSort.c b/QSufSort.c index e437ac3..36c5a51 100644 --- a/QSufSort.c +++ b/QSufSort.c @@ -59,12 +59,9 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin qsint_t i, j; qsint_t s, negatedSortedGroupLength; qsint_t numSymbolAggregated; - qsint_t maxNumInputSymbol; qsint_t numSortedPos = 1; qsint_t newAlphabetSize; - maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; - if (!skipTransform) { /* bucketing possible*/ newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, diff --git a/bwamem.c b/bwamem.c index b52d20b..5bd495c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -418,7 +418,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds int i, k; - int64_t rlen, rmax[2], tmp, max = 0, max_i = 0; + int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; @@ -432,7 +432,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); rmax[0] = rmax[0] < b? rmax[0] : b; rmax[1] = rmax[1] > e? 
rmax[1] : e; - if (t->len > max) max = t->len, max_i = i; + if (t->len > max) max = t->len; } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); @@ -523,7 +523,7 @@ ret_gen_cigar: void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) - int score, n_cigar, is_rev = 0, nn, rid, mid, copy_mate = 0; + int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0; uint32_t *cigar = 0; int64_t pos; bwahit_t ptmp, *p = &ptmp; @@ -548,7 +548,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); - nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); + bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); kputw(p->flag, str); kputc('\t', str); kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); kputw(p->qual, str); kputc('\t', str); @@ -569,7 +569,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (m && is_mapped(m)) { // then print mate pos and isize pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev); - nn = bns_cnt_ambi(bns, pos, m->re - m->rb, &mid); + bns_cnt_ambi(bns, pos, m->re - m->rb, &mid); kputc('\t', str); if (mid == rid) kputc('=', str); else kputs(bns->anns[mid].name, str); diff --git a/bwase.c b/bwase.c index 1f36aaa..017322b 100644 --- a/bwase.c +++ b/bwase.c @@ -71,8 +71,8 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma } rest -= q->l - q->k + 1; } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. 
- int j, i, k; - for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) { + int j, i; + for (j = rest, i = q->l - q->k + 1; j > 0; --j) { double p = 1.0, x = drand48(); while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; @@ -412,11 +412,11 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in // print mate coordinate if (mate && mate->type != BWA_TYPE_NO_MATCH) { - int m_seqid, m_is_N; + int m_seqid; long long isize; am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality // redundant calculation here, but should not matter too much - m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); + bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0; if (p->type == BWA_TYPE_NO_MATCH) isize = 0; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 55c7c64..c727984 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -186,14 +186,14 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 bsw2aux_t *q = b->aux + i; uint8_t *query; bwtint_t k; - int score, path_len, beg, end; + int path_len, beg, end; if (p->l) continue; beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 
1 : 0] + beg; for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; - score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); + aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); #if 0 if (name && score != p->G) { // debugging only diff --git a/stdaln.c b/stdaln.c index eb41882..cd064cf 100644 --- a/stdaln.c +++ b/stdaln.c @@ -542,13 +542,12 @@ int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, int start, end, max_score; int thres, *suba, *ss; - int gap_open, gap_ext, b; + int gap_open, gap_ext; int *score_matrix, N_MATRIX_ROW; /* initialize some align-related parameters. just for compatibility */ gap_open = ap->gap_open; gap_ext = ap->gap_ext; - b = ap->band_width; score_matrix = ap->matrix; N_MATRIX_ROW = ap->row; thres = _thres > 0? _thres : -_thres; @@ -862,7 +861,7 @@ uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, path_t *path, int *path_len, int G0, uint8_t *_mem) { - int q, r, qr, tmp_len; + int q, r, qr; int32_t **s_array, *score_array; int is_overflow, of_base; uint32_t *eh; @@ -889,7 +888,6 @@ int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2 s_array[i] = (int32_t*)_p, _p += 4 * len1; /* initialization */ aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - tmp_len = len1 + 1; start = 1; end = 2; end_i = end_j = 0; score = 0; From dd85c528d6d34ee594218e98f787a358e1e72de2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 13:59:18 -0500 Subject: [PATCH 123/169] an alternative bwt_invPsi() implementation Cleaner, but not necessarily faster. 
--- bwt.h | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/bwt.h b/bwt.h index 2aab9d1..e08872a 100644 --- a/bwt.h +++ b/bwt.h @@ -74,13 +74,6 @@ typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; * called bwt_B0 instead of bwt_B */ #define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3) -// inverse Psi function -#define bwt_invPsi(bwt, k) \ - (((k) == (bwt)->primary)? 0 : \ - ((k) < (bwt)->primary)? \ - (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ - : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) - #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0) #ifdef __cplusplus @@ -129,4 +122,21 @@ extern "C" { } #endif +// inverse Psi function +#if 0 +#define bwt_invPsi(bwt, k) \ + (((k) == (bwt)->primary)? 0 : \ + ((k) < (bwt)->primary)? \ + (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ + : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) +#else +static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) +{ + register int64_t x = k - (k > bwt->primary); + x = bwt_B0(bwt, x); + x = bwt->L2[x] + bwt_occ(bwt, k, x); + return k == bwt->primary? 0 : x; +} +#endif + #endif From a19ab654dfee2963420e1e00db4a82e51b6a18e7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 14:21:19 -0500 Subject: [PATCH 124/169] no effective change --- bwt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt.h b/bwt.h index e08872a..e06329a 100644 --- a/bwt.h +++ b/bwt.h @@ -132,7 +132,7 @@ extern "C" { #else static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) { - register int64_t x = k - (k > bwt->primary); + bwtint_t x = k - (k > bwt->primary); x = bwt_B0(bwt, x); x = bwt->L2[x] + bwt_occ(bwt, k, x); return k == bwt->primary? 
0 : x; From d460f2ec9e86cf66e475f2ffcbbe81b84210eeef Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 14:48:54 -0500 Subject: [PATCH 125/169] bugfix in multi-threaded bwa-mem --- bwamem.c | 4 ++-- fastmap.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 5bd495c..5202fb4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -707,14 +707,14 @@ static void *worker2(void *data) worker_t *w = (worker_t*)data; int i; if (!(w->opt->flag&MEM_F_PE)) { - for (i = 0; i < w->n; i += w->step) { + for (i = w->start; i < w->n; i += w->step) { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } } else { int n = 0; - for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet + for (i = w->start; i < w->n>>1; i += w->step) { // not implemented yet n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } diff --git a/fastmap.c b/fastmap.c index d52a315..1d6ed04 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,8 +24,9 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "PHk:c:v:s:r:")) >= 0) { + while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'c') opt->max_occ = atoi(optarg); @@ -37,6 +38,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] \n\n"); fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); From 3c330d50494a10afdc016b9717918319108a3128 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 15:12:26 -0500 Subject: [PATCH 126/169] for another round of code cleanup --- fastmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index 1d6ed04..c97d566 100644 --- a/fastmap.c +++ b/fastmap.c @@ -69,7 +69,7 @@ int main_mem(int argc, char *argv[]) ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } - while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { mem_process_seqs(opt, bwt, bns, pac, n, seqs); free(seqs); } From e613195e172cea20b903ae848fc4bbf238e0a4c9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 15:30:46 -0500 Subject: [PATCH 127/169] moved some common code to bwa.{c,h} --- Makefile | 14 ++++---- bwa.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++ bwa.h | 23 +++++++++++++ bwamem.c | 35 ------------------- bwamem.h | 2 +- bwtsw2_aux.c | 1 + utils.c | 58 ++----------------------------- utils.h | 7 ---- 8 files changed, 132 insertions(+), 104 deletions(-) create mode 100644 bwa.c create mode 100644 bwa.h diff --git a/Makefile b/Makefile 
index bfed694..2029dc1 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o +LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ @@ -28,14 +28,16 @@ bwa:libbwa.a $(AOBJS) main.o libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) +QSufSort.o:QSufSort.h +bwt_gen.o:QSufSort.h + +ksw.o:ksw.h +utils.o:utils.h ksort.h kseq.h +bntseq.o:bntseq.h +bwt.o:bwt.h utils.h bwa.o:bwa.h -QSufSort.o:QSufSort.h - -bwt.o:bwt.h -bwtio.o:bwt.h bwtaln.o:bwt.h bwtaln.h kseq.h -bntseq.o:bntseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h diff --git a/bwa.c b/bwa.c new file mode 100644 index 0000000..eca721e --- /dev/null +++ b/bwa.c @@ -0,0 +1,96 @@ +#include +#include +#include "bntseq.h" +#include "bwa.h" +#include "ksw.h" + +/************************ + * Batch FASTA/Q reader * + ************************/ + +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(s->comment) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? 
strdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n]); + size += seqs[n++].l_seq; + } + if (size >= chunk_size) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; + return seqs; +} + +// Generate CIGAR when the alignment end points are known +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +{ + uint32_t *cigar = 0; + uint8_t tmp, *rseq; + int i, w; + int64_t rlen; + *n_cigar = 0; + if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range + if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + for (i = 0; i < rlen>>1; ++i) + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; + } + //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); 
for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + // set the band-width + w = (int)((double)(l_query * mat[0] - q) / r + 1.); + w = w < 1? w : 1; + w = w < w_? w : w_; + w += abs(rlen - l_query); + // NW alignment + *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + if (rb >= l_pac) // reverse back query + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + +ret_gen_cigar: + free(rseq); + return cigar; +} + diff --git a/bwa.h b/bwa.h new file mode 100644 index 0000000..022b784 --- /dev/null +++ b/bwa.h @@ -0,0 +1,23 @@ +#ifndef BWA_H_ +#define BWA_H_ + +#include + +typedef struct { + int l_seq; + char *name, *comment, *seq, *qual, *sam; +} bseq1_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + + uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwamem.c b/bwamem.c index 5202fb4..43f9f2f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -485,41 +485,6 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int * Basic hit->SAM conversion * *****************************/ -uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) -{ - uint32_t *cigar = 0; - uint8_t tmp, *rseq; - int i, w; - int64_t rlen; - *n_cigar = 0; - if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand - rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); - if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range - if (rb >= l_pac) { // then reverse both query and rseq; this is to 
ensure indels to be placed at the leftmost position - for (i = 0; i < l_query>>1; ++i) - tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; - for (i = 0; i < rlen>>1; ++i) - tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; - } - //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); - // set the band-width - w = (int)((double)(l_query * mat[0] - q) / r + 1.); - w = w < 1? w : 1; - w = w < w_? w : w_; - w += abs(rlen - l_query); - // NW alignment - *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); - if (rb >= l_pac) // reverse back query - for (i = 0; i < l_query>>1; ++i) - tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; - -ret_gen_cigar: - free(rseq); - return cigar; -} - - void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) diff --git a/bwamem.h b/bwamem.h index 4319911..ce27c6e 100644 --- a/bwamem.h +++ b/bwamem.h @@ -3,7 +3,7 @@ #include "bwt.h" #include "bntseq.h" -#include "utils.h" +#include "bwa.h" #define MEM_MAPQ_COEF 40.0 #define MEM_MAPQ_MAX 60 diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index c727984..bc12d20 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -13,6 +13,7 @@ #include "bwtsw2.h" #include "stdaln.h" #include "kstring.h" +#include "bwa.h" #include "kseq.h" KSEQ_DECLARE(gzFile) diff --git a/utils.c b/utils.c index 1cebaab..20b09ee 100644 --- a/utils.c +++ b/utils.c @@ -40,6 +40,9 @@ KSORT_INIT(128, pair64_t, pair64_lt) KSORT_INIT(64, uint64_t, ks_lt_generic) +#include "kseq.h" +KSEQ_INIT2(, gzFile, gzread) + /******************** * System utilities * ********************/ @@ -160,58 
+163,3 @@ double realtime() gettimeofday(&tp, &tzp); return tp.tv_sec + tp.tv_usec * 1e-6; } - -/************************ - * Batch FASTA/Q reader * - ************************/ - -#include "kseq.h" -KSEQ_INIT2(, gzFile, gzread) - -static inline void trim_readno(kstring_t *s) -{ - if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) - s->l -= 2, s->s[s->l] = 0; -} - -static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) -{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = strdup(ks->name.s); - s->comment = ks->comment.l? strdup(s->comment) : 0; - s->seq = strdup(ks->seq.s); - s->qual = ks->qual.l? strdup(ks->qual.s) : 0; - s->l_seq = strlen(s->seq); -} - -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) -{ - kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; - int size = 0, m, n; - bseq1_t *seqs; - m = n = 0; seqs = 0; - while (kseq_read(ks) >= 0) { - if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads - fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); - break; - } - if (n >= m) { - m = m? 
m<<1 : 256; - seqs = realloc(seqs, m * sizeof(bseq1_t)); - } - trim_readno(&ks->name); - kseq2bseq1(ks, &seqs[n]); - size += seqs[n++].l_seq; - if (ks2) { - trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n]); - size += seqs[n++].l_seq; - } - if (size >= chunk_size) break; - } - if (size == 0) { // test if the 2nd file is finished - if (ks2 && kseq_read(ks2) >= 0) - fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); - } - *n_ = n; - return seqs; -} diff --git a/utils.h b/utils.h index 70f4e11..a3db251 100644 --- a/utils.h +++ b/utils.h @@ -52,11 +52,6 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; -typedef struct { - int l_seq; - char *name, *comment, *seq, *qual, *sam; -} bseq1_t; - #ifdef __cplusplus extern "C" { #endif @@ -80,8 +75,6 @@ extern "C" { void ks_introsort_64 (size_t n, uint64_t *a); void ks_introsort_128(size_t n, pair64_t *a); - bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - #ifdef __cplusplus } #endif From 67543f19a1415c8e9a55f981085a9971771c3cfc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 15:55:55 -0500 Subject: [PATCH 128/169] code refactoring --- bwa.c | 37 +++++++++++++++++++++++++++++++++++++ bwa.h | 16 ++++++++++++++++ bwamem.c | 1 - fastmap.c | 54 +++++++++++++++--------------------------------------- 4 files changed, 68 insertions(+), 40 deletions(-) diff --git a/bwa.c b/bwa.c index eca721e..e8735b5 100644 --- a/bwa.c +++ b/bwa.c @@ -94,3 +94,40 @@ ret_gen_cigar: return cigar; } +/********************* + * Full index reader * + *********************/ + +bwaidx_t *bwa_idx_load(const char *prefix, int which) +{ + bwaidx_t *idx; + idx = calloc(1, sizeof(bwaidx_t)); + if (which & BWA_IDX_BWT) { + char *tmp; + tmp = calloc(strlen(prefix) + 5, 1); + strcat(strcpy(tmp, prefix), ".bwt"); // FM-index + idx->bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) + 
bwt_restore_sa(tmp, idx->bwt); + free(tmp); + } + if (which & BWA_IDX_BNS) { + idx->bns = bns_restore(prefix); + if (which & BWA_IDX_PAC) { + idx->pac = calloc(idx->bns->l_pac/4+1, 1); + fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence + } + fclose(idx->bns->fp_pac); + idx->bns->fp_pac = 0; + } + return idx; +} + +void bwa_idx_destroy(bwaidx_t *idx) +{ + if (idx == 0) return; + if (idx->bwt) bwt_destroy(idx->bwt); + if (idx->bns) bns_destroy(idx->bns); + if (idx->pac) free(idx->pac); + free(idx); +} diff --git a/bwa.h b/bwa.h index 022b784..ad528c9 100644 --- a/bwa.h +++ b/bwa.h @@ -2,6 +2,19 @@ #define BWA_H_ #include +#include "bntseq.h" +#include "bwt.h" + +#define BWA_IDX_BWT 0x1 +#define BWA_IDX_BNS 0x2 +#define BWA_IDX_PAC 0x4 +#define BWA_IDX_ALL 0x7 + +typedef struct { + bwt_t *bwt; // FM-index + bntseq_t *bns; // information on the reference sequences + uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base +} bwaidx_t; typedef struct { int l_seq; @@ -16,6 +29,9 @@ extern "C" { uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + bwaidx_t *bwa_idx_load(const char *prefix, int which); + void bwa_idx_destroy(bwaidx_t *idx); + #ifdef __cplusplus } #endif diff --git a/bwamem.c b/bwamem.c index 43f9f2f..6b219cf 100644 --- a/bwamem.c +++ b/bwamem.c @@ -112,7 +112,6 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } - const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) { int i, max, max_i, ori_start; diff --git a/fastmap.c b/fastmap.c index c97d566..2800821 100644 --- a/fastmap.c +++ b/fastmap.c @@ -2,8 +2,7 @@ #include #include #include -#include "bntseq.h" -#include "bwt.h" +#include "bwa.h" #include "bwamem.h" #include "kvec.h" #include "utils.h" @@ -15,13 +14,11 @@ extern unsigned char 
nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - bwt_t *bwt; - bntseq_t *bns; int c, n, l; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; - uint8_t *pac = 0; bseq1_t *seqs; + bwaidx_t *idx; opt = mem_opt_init(); while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) { @@ -48,19 +45,9 @@ int main_mem(int argc, char *argv[]) return 1; } mem_fill_scmat(opt->a, opt->b, opt->mat); - { // load the packed sequences, BWT and SA - char *tmp = calloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(tmp, argv[optind]), ".bwt"); - bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, argv[optind]), ".sa"); - bwt_restore_sa(tmp, bwt); - free(tmp); - bns = bns_restore(argv[optind]); - pac = calloc(bns->l_pac/4+1, 1); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); - } - for (l = 0; l < bns->n_seqs; ++l) - printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); + idx = bwa_idx_load(argv[optind], BWA_IDX_ALL); + for (l = 0; l < idx->bns->n_seqs; ++l) + printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len); fp = strcmp(argv[optind + 1], "-")? 
gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); ks = kseq_init(fp); @@ -70,13 +57,12 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { - mem_process_seqs(opt, bwt, bns, pac, n, seqs); + mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs); free(seqs); } - free(opt); free(pac); - bns_destroy(bns); - bwt_destroy(bwt); + free(opt); + bwa_idx_destroy(idx); kseq_destroy(ks); gzclose(fp); if (ks2) { @@ -92,10 +78,9 @@ int main_fastmap(int argc, char *argv[]) kseq_t *seq; bwtint_t k; gzFile fp; - bwt_t *bwt; - bntseq_t *bns; smem_i *itr; const bwtintv_v *a; + bwaidx_t *idx; while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) { switch (c) { @@ -112,16 +97,8 @@ int main_fastmap(int argc, char *argv[]) fp = gzopen(argv[optind + 1], "r"); seq = kseq_init(fp); - { // load the packed sequences, BWT and SA - char *tmp = calloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(tmp, argv[optind]), ".bwt"); - bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, argv[optind]), ".sa"); - bwt_restore_sa(tmp, bwt); - free(tmp); - bns = bns_restore(argv[optind]); - } - itr = smem_itr_init(bwt); + idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS); + itr = smem_itr_init(idx->bwt); while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { @@ -141,10 +118,10 @@ int main_fastmap(int argc, char *argv[]) bwtint_t pos; int len, is_rev, ref_id; len = (uint32_t)p->info - (p->info>>32); - pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); + pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev); if (is_rev) pos -= len - 1; - bns_cnt_ambi(bns, pos, len, &ref_id); - printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + bns_cnt_ambi(idx->bns, pos, len, &ref_id); + printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1); } } else fputs("\t*", 
stdout); putchar('\n'); @@ -154,8 +131,7 @@ int main_fastmap(int argc, char *argv[]) } smem_itr_destroy(itr); - bns_destroy(bns); - bwt_destroy(bwt); + bwa_idx_destroy(idx); kseq_destroy(seq); gzclose(fp); return 0; From cfa7165036a83fffe973dbc25a4ed9406287bf97 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 16:10:48 -0500 Subject: [PATCH 129/169] cleanup index loading code --- bwa.c | 67 ++++++++++++++++++++++++++++++++++++++++++--------- bwa.h | 6 ++++- bwape.c | 4 +-- bwase.c | 4 +-- bwtaln.c | 29 ++-------------------- bwtsw2_main.c | 23 +++++------------- 6 files changed, 72 insertions(+), 61 deletions(-) diff --git a/bwa.c b/bwa.c index e8735b5..fac0db7 100644 --- a/bwa.c +++ b/bwa.c @@ -4,6 +4,8 @@ #include "bwa.h" #include "ksw.h" +int bwa_verbose = 3; + /************************ * Batch FASTA/Q reader * ************************/ @@ -98,28 +100,69 @@ ret_gen_cigar: * Full index reader * *********************/ -bwaidx_t *bwa_idx_load(const char *prefix, int which) +char *bwa_idx_infer_prefix(const char *hint) +{ + char *prefix; + int l_hint; + FILE *fp; + l_hint = strlen(hint); + prefix = malloc(l_hint + 3 + 4 + 1); + strcpy(prefix, hint); + strcpy(prefix + l_hint, ".64.bwt"); + if ((fp = fopen(prefix, "rb")) != 0) { + fclose(fp); + prefix[l_hint + 3] = 0; + return prefix; + } else { + strcpy(prefix + l_hint, ".bwt"); + if ((fp = fopen(prefix, "rb")) == 0) { + free(prefix); + return 0; + } else { + fclose(fp); + prefix[l_hint] = 0; + return prefix; + } + } +} + +bwt_t *bwa_idx_load_bwt(const char *hint) +{ + char *tmp, *prefix; + bwt_t *bwt; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) { + if (bwa_verbose >= 1) + fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } + tmp = calloc(strlen(prefix) + 5, 1); + strcat(strcpy(tmp, prefix), ".bwt"); // FM-index + bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) + bwt_restore_sa(tmp, bwt); + free(tmp); 
free(prefix); + return bwt; +} + +bwaidx_t *bwa_idx_load(const char *hint, int which) { bwaidx_t *idx; + char *prefix; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) return 0; idx = calloc(1, sizeof(bwaidx_t)); - if (which & BWA_IDX_BWT) { - char *tmp; - tmp = calloc(strlen(prefix) + 5, 1); - strcat(strcpy(tmp, prefix), ".bwt"); // FM-index - idx->bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) - bwt_restore_sa(tmp, idx->bwt); - free(tmp); - } + if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BNS) { idx->bns = bns_restore(prefix); if (which & BWA_IDX_PAC) { idx->pac = calloc(idx->bns->l_pac/4+1, 1); fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence + fclose(idx->bns->fp_pac); + idx->bns->fp_pac = 0; } - fclose(idx->bns->fp_pac); - idx->bns->fp_pac = 0; } + free(prefix); return idx; } diff --git a/bwa.h b/bwa.h index ad528c9..b5eda13 100644 --- a/bwa.h +++ b/bwa.h @@ -21,6 +21,8 @@ typedef struct { char *name, *comment, *seq, *qual, *sam; } bseq1_t; +extern int bwa_verbose; + #ifdef __cplusplus extern "C" { #endif @@ -29,7 +31,9 @@ extern "C" { uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); - bwaidx_t *bwa_idx_load(const char *prefix, int which); + char *bwa_idx_infer_prefix(const char *hint); + bwt_t *bwa_idx_load_bwt(const char *hint); + bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); #ifdef __cplusplus diff --git a/bwape.c b/bwape.c index 77ae1fa..87393b1 100644 --- a/bwape.c +++ b/bwape.c @@ -10,6 +10,7 @@ #include "utils.h" #include "stdaln.h" #include "bwase.h" +#include "bwa.h" typedef struct { int n; @@ -716,7 +717,6 @@ int bwa_sai2sam_pe(int argc, char *argv[]) { extern char *bwa_rg_line, *bwa_rg_id; extern int bwa_set_rg(const char *s); - extern 
char *bwa_infer_prefix(const char *hint); int c; pe_opt_t *popt; char *prefix; @@ -762,7 +762,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) fprintf(stderr, "\n"); return 1; } - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); free(bwa_rg_line); free(bwa_rg_id); return 0; diff --git a/bwase.c b/bwase.c index 017322b..8f50c7a 100644 --- a/bwase.c +++ b/bwase.c @@ -10,6 +10,7 @@ #include "bntseq.h" #include "utils.h" #include "kstring.h" +#include "bwa.h" int g_log_n[256]; char *bwa_rg_line, *bwa_rg_id; @@ -606,7 +607,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f int bwa_sai2sam_se(int argc, char *argv[]) { - extern char *bwa_infer_prefix(const char *hint); int c, n_occ = 3; char *prefix; while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { @@ -628,7 +628,7 @@ int bwa_sai2sam_se(int argc, char *argv[]) fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); return 1; } - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); free(bwa_rg_line); free(bwa_rg_id); return 0; diff --git a/bwtaln.c b/bwtaln.c index 84be510..96d4026 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -11,6 +11,7 @@ #include "bwtaln.h" #include "bwtgap.h" #include "utils.h" +#include "bwa.h" #ifdef HAVE_PTHREAD #include @@ -219,32 +220,6 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) bwa_seq_close(ks); } -char *bwa_infer_prefix(const char *hint) -{ - char *prefix; - int l_hint; - FILE *fp; - l_hint = strlen(hint); - prefix = malloc(l_hint + 3 + 4 + 1); - strcpy(prefix, hint); - strcpy(prefix + l_hint, ".64.bwt"); - if ((fp = fopen(prefix, "rb")) != 0) { - fclose(fp); - prefix[l_hint + 3] = 0; - return prefix; - } else { - strcpy(prefix + l_hint, ".bwt"); - if ((fp = 
fopen(prefix, "rb")) == 0) { - free(prefix); - return 0; - } else { - fclose(fp); - prefix[l_hint] = 0; - return prefix; - } - } -} - int bwa_aln(int argc, char *argv[]) { int c, opte = -1; @@ -328,7 +303,7 @@ int bwa_aln(int argc, char *argv[]) k = l; } } - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); free(opt); return 0; diff --git a/bwtsw2_main.c b/bwtsw2_main.c index e3f57f8..ab126f2 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -6,14 +6,12 @@ #include "bwt.h" #include "bwtsw2.h" #include "utils.h" +#include "bwa.h" int bwa_bwtsw2(int argc, char *argv[]) { - extern char *bwa_infer_prefix(const char *hint); bsw2opt_t *opt; - bwt_t *target; - char buf[1024], *prefix; - bntseq_t *bns; + bwaidx_t *idx; int c; opt = bsw2_init_opt(); @@ -81,19 +79,10 @@ int bwa_bwtsw2(int argc, char *argv[]) opt->t *= opt->a; opt->coef *= opt->a; - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { - fprintf(stderr, "[%s] fail to locate the index\n", __func__); - return 0; - } - strcpy(buf, prefix); target = bwt_restore_bwt(strcat(buf, ".bwt")); - strcpy(buf, prefix); bwt_restore_sa(strcat(buf, ".sa"), target); - bns = bns_restore(prefix); - - bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0); - - bns_destroy(bns); - bwt_destroy(target); - free(opt); free(prefix); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 0; + bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? 
argv[optind+2] : 0); + bwa_idx_destroy(idx); + free(opt); return 0; } From ee4540c3948db2f357301fbf9f7e44f41a80bcfc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 16:41:44 -0500 Subject: [PATCH 130/169] support read group in bwa-mem --- bwa.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- bwa.h | 4 ++++ bwamem.c | 1 + bwape.c | 19 +++++----------- bwase.c | 59 +++++-------------------------------------------- fastmap.c | 15 ++++++++----- 6 files changed, 88 insertions(+), 76 deletions(-) diff --git a/bwa.c b/bwa.c index fac0db7..f5e8692 100644 --- a/bwa.c +++ b/bwa.c @@ -1,10 +1,13 @@ +#include #include #include #include "bntseq.h" #include "bwa.h" #include "ksw.h" +#include "utils.h" int bwa_verbose = 3; +char bwa_rg_id[256]; /************************ * Batch FASTA/Q reader * @@ -132,8 +135,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint) bwt_t *bwt; prefix = bwa_idx_infer_prefix(hint); if (prefix == 0) { - if (bwa_verbose >= 1) - fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); return 0; } tmp = calloc(strlen(prefix) + 5, 1); @@ -150,7 +152,10 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) bwaidx_t *idx; char *prefix; prefix = bwa_idx_infer_prefix(hint); - if (prefix == 0) return 0; + if (prefix == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } idx = calloc(1, sizeof(bwaidx_t)); if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BNS) { @@ -174,3 +179,58 @@ void bwa_idx_destroy(bwaidx_t *idx) if (idx->pac) free(idx->pac); free(idx); } + +/*********************** + * SAM header routines * + ***********************/ + +void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) +{ + int i; + for (i = 0; i < bns->n_seqs; ++i) + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + if (rg_line) 
err_printf("%s\n", rg_line); +} + +static char *bwa_escape(char *s) +{ + char *p, *q; + for (p = q = s; *p; ++p) { + if (*p == '\\') { + ++p; + if (*p == 't') *q++ = '\t'; + else if (*p == 'n') *q++ = '\n'; + else if (*p == 'r') *q++ = '\r'; + else if (*p == '\\') *q++ = '\\'; + } else *q++ = *p; + } + *q = '\0'; + return s; +} + +char *bwa_set_rg(const char *s) +{ + char *p, *q, *r, *rg_line = 0; + memset(bwa_rg_id, 0, 256); + if (strstr(s, "@RG") != s) return 0; + rg_line = strdup(s); + bwa_escape(rg_line); + if ((p = strstr(rg_line, "\tID:")) == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID in the @RG line\n", __func__); + goto err_set_rg; + } + p += 4; + for (q = p; *q && *q != '\t' && *q != '\n'; ++q); + if (q - p + 1 > 256) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] RG:ID is longer than 255 characters\n", __func__); + goto err_set_rg; + } + for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) + *r++ = *q; + return rg_line; + +err_set_rg: + free(rg_line); + return 0; +} + diff --git a/bwa.h b/bwa.h index b5eda13..208db6a 100644 --- a/bwa.h +++ b/bwa.h @@ -22,6 +22,7 @@ typedef struct { } bseq1_t; extern int bwa_verbose; +extern char bwa_rg_id[256]; #ifdef __cplusplus extern "C" { @@ -36,6 +37,9 @@ extern "C" { bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); + void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); + char *bwa_set_rg(const char *s); + #ifdef __cplusplus } #endif diff --git a/bwamem.c b/bwamem.c index 6b219cf..ce55cad 100644 --- a/bwamem.c +++ b/bwamem.c @@ -571,6 +571,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } + if (bwa_rg_id) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } kputc('\n', str); free(cigar); #undef is_mapped diff --git a/bwape.c b/bwape.c index 87393b1..0b2b8d6 
100644 --- a/bwape.c +++ b/bwape.c @@ -611,7 +611,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, return pacseq; } -void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) +void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, j, n_seqs, tot_seqs = 0; @@ -654,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f } // core loop - bwa_print_sam_SQ(bns); + bwa_print_sam_hdr(bns, rg_line); bwa_print_sam_PG(); while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; @@ -715,20 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f int bwa_sai2sam_pe(int argc, char *argv[]) { - extern char *bwa_rg_line, *bwa_rg_id; - extern int bwa_set_rg(const char *s); int c; pe_opt_t *popt; - char *prefix; + char *prefix, *rg_line = 0; popt = bwa_init_pe_opt(); while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { switch (c) { case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'a': popt->max_isize = atoi(optarg); break; case 'o': popt->max_occ = atoi(optarg); break; @@ -764,11 +759,9 @@ int bwa_sai2sam_pe(int argc, char *argv[]) } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - free(bwa_rg_line); free(bwa_rg_id); return 0; } - bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt); - free(bwa_rg_line); free(bwa_rg_id); free(prefix); - free(popt); + bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); + free(prefix); free(popt); return 0; } diff --git a/bwase.c b/bwase.c index 
8f50c7a..27da794 100644 --- a/bwase.c +++ b/bwase.c @@ -13,7 +13,6 @@ #include "bwa.h" int g_log_n[256]; -char *bwa_rg_line, *bwa_rg_id; void bwa_print_sam_PG(); @@ -490,56 +489,13 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } } -void bwa_print_sam_SQ(const bntseq_t *bns) -{ - int i; - for (i = 0; i < bns->n_seqs; ++i) - err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); - if (bwa_rg_line) err_printf("%s\n", bwa_rg_line); -} - void bwase_initialize() { int i; for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); } -char *bwa_escape(char *s) -{ - char *p, *q; - for (p = q = s; *p; ++p) { - if (*p == '\\') { - ++p; - if (*p == 't') *q++ = '\t'; - else if (*p == 'n') *q++ = '\n'; - else if (*p == 'r') *q++ = '\r'; - else if (*p == '\\') *q++ = '\\'; - } else *q++ = *p; - } - *q = '\0'; - return s; -} - -int bwa_set_rg(const char *s) -{ - char *p, *q, *r; - if (strstr(s, "@RG") != s) return -1; - if (bwa_rg_line) free(bwa_rg_line); - if (bwa_rg_id) free(bwa_rg_id); - bwa_rg_line = strdup(s); - bwa_rg_id = 0; - bwa_escape(bwa_rg_line); - p = strstr(bwa_rg_line, "\tID:"); - if (p == 0) return -1; - p += 4; - for (q = p; *q && *q != '\t' && *q != '\n'; ++q); - bwa_rg_id = calloc(q - p + 1, 1); - for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) - *r++ = *q; - return 0; -} - -void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) +void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, n_seqs, tot_seqs = 0, m_aln; @@ -559,7 +515,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f m_aln = 0; fread(&opt, sizeof(gap_opt_t), 1, fp_sa); - bwa_print_sam_SQ(bns); + bwa_print_sam_hdr(bns, rg_line); //bwa_print_sam_PG(); // set ks ks = bwa_open_reads(opt.mode, fn_fa); @@ -608,15 +564,12 @@ 
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f int bwa_sai2sam_se(int argc, char *argv[]) { int c, n_occ = 3; - char *prefix; + char *prefix, *rg_line = 0; while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { switch (c) { case 'h': break; case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'n': n_occ = atoi(optarg); break; case 'f': xreopen(optarg, "w", stdout); break; @@ -630,10 +583,8 @@ int bwa_sai2sam_se(int argc, char *argv[]) } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - free(bwa_rg_line); free(bwa_rg_id); return 0; } - bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ); - free(bwa_rg_line); free(bwa_rg_id); + bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); return 0; } diff --git a/fastmap.c b/fastmap.c index 2800821..adbe04c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -14,14 +14,15 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - int c, n, l; + int c, n; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; bseq1_t *seqs; bwaidx_t *idx; + char *rg_line = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) { + while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; @@ -29,7 +30,9 @@ int main_mem(int argc, char *argv[]) else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') mem_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); - else if (c == 's') opt->split_width = atoi(optarg); + else if (c == 'R') { + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak + } else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { fprintf(stderr, "\n"); @@ -39,15 +42,15 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); + fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n"); fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); fprintf(stderr, "\n"); free(opt); return 1; } mem_fill_scmat(opt->a, opt->b, opt->mat); - idx = bwa_idx_load(argv[optind], BWA_IDX_ALL); - for (l = 0; l < idx->bns->n_seqs; ++l) - printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + bwa_print_sam_hdr(idx->bns, rg_line); fp = strcmp(argv[optind + 1], "-")? 
gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); ks = kseq_init(fp); From 33236de32e132440f7f4202bbaee031464adb69a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 16:44:02 -0500 Subject: [PATCH 131/169] a bit more error message --- bwa.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bwa.c b/bwa.c index f5e8692..aff9aff 100644 --- a/bwa.c +++ b/bwa.c @@ -212,17 +212,20 @@ char *bwa_set_rg(const char *s) { char *p, *q, *r, *rg_line = 0; memset(bwa_rg_id, 0, 256); - if (strstr(s, "@RG") != s) return 0; + if (strstr(s, "@RG") != s) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__); + goto err_set_rg; + } rg_line = strdup(s); bwa_escape(rg_line); if ((p = strstr(rg_line, "\tID:")) == 0) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID in the @RG line\n", __func__); + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__); goto err_set_rg; } p += 4; for (q = p; *q && *q != '\t' && *q != '\n'; ++q); if (q - p + 1 > 256) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] RG:ID is longer than 255 characters\n", __func__); + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__); goto err_set_rg; } for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) From b4c38bcc1c8e54657a32537a4aa0e4b4a808f725 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 16:57:34 -0500 Subject: [PATCH 132/169] append fasta/q comment --- bwa.c | 2 +- bwamem.c | 7 +++---- bwamem_pair.c | 2 +- fastmap.c | 18 ++++++++++++++---- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/bwa.c b/bwa.c index aff9aff..c8400b1 100644 --- a/bwa.c +++ b/bwa.c @@ -25,7 +25,7 @@ static inline void trim_readno(kstring_t *s) static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) { // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice 
s->name = strdup(ks->name.s); - s->comment = ks->comment.l? strdup(s->comment) : 0; + s->comment = ks->comment.l? strdup(ks->comment.s) : 0; s->seq = strdup(ks->seq.s); s->qual = ks->qual.l? strdup(ks->qual.s) : 0; s->l_seq = strlen(s->seq); diff --git a/bwamem.c b/bwamem.c index ce55cad..553fe1c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -13,8 +13,6 @@ #include "kvec.h" #include "ksort.h" -int mem_verbose = 3; // 1: error only; 2: error+warning; 3: message+error+warning; >=4: debugging - void mem_fill_scmat(int a, int b, int8_t mat[25]) { int i, j, k; @@ -462,7 +460,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (mem_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; @@ -572,6 +570,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } + if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); free(cigar); #undef is_mapped @@ -633,7 +632,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); - if (mem_verbose >= 4) mem_print_chain(bns, &chn); + if (bwa_verbose >= 4) 
mem_print_chain(bns, &chn); kv_init(regs); kv_init(tmp); for (i = 0; i < chn.n; ++i) { mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], &tmp); diff --git a/bwamem_pair.c b/bwamem_pair.c index 3dce119..57a128a 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -47,7 +47,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } - if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); + if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. mem_pestat_t *r = &pes[d]; uint64_v *q = &isize[d]; diff --git a/fastmap.c b/fastmap.c index adbe04c..437192a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -14,7 +14,7 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - int c, n; + int i, c, n, copy_comment = 0; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; bseq1_t *seqs; @@ -22,14 +22,15 @@ int main_mem(int argc, char *argv[]) char *rg_line = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "CPHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'c') opt->max_occ = atoi(optarg); - else if (c == 'v') mem_verbose = atoi(optarg); + else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); + else if (c == 'C') copy_comment = 1; else if (c == 'R') { if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 's') opt->split_width = atoi(optarg); @@ -43,7 +44,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n"); - fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); + fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); + fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, "\n"); free(opt); return 1; @@ -60,6 +62,14 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { + int64_t size = 0; + if (!copy_comment) + for (i = 0; i < n; ++i) { + free(seqs[i].comment); seqs[i].comment = 0; + } + for (i = 0; i < n; ++i) size += seqs[i].l_seq; + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs); free(seqs); } From 6e7903e9f33e15890832e354759d658000a0fe22 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 17:09:23 -0500 Subject: [PATCH 133/169] added kopen support --- Makefile | 2 +- fastmap.c | 16 ++- kopen.c | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 355 insertions(+), 6 deletions(-) create mode 100644 kopen.c diff --git a/Makefile 
b/Makefile index 2029dc1..f1da07e 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o +LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ diff --git a/fastmap.c b/fastmap.c index 437192a..90307b3 100644 --- a/fastmap.c +++ b/fastmap.c @@ -11,15 +11,19 @@ KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; +void *kopen(const char *fn, int *_fd); +int kclose(void *a); + int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - int i, c, n, copy_comment = 0; + int fd, fd2, i, c, n, copy_comment = 0; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; bseq1_t *seqs; bwaidx_t *idx; char *rg_line = 0; + void *ko = 0, *ko2 = 0; opt = mem_opt_init(); while ((c = getopt(argc, argv, "CPHk:c:v:s:r:t:R:")) >= 0) { @@ -54,10 +58,12 @@ int main_mem(int argc, char *argv[]) if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak bwa_print_sam_hdr(idx->bns, rg_line); - fp = strcmp(argv[optind + 1], "-")? 
gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); + ko = kopen(argv[optind + 1], &fd); + fp = gzdopen(fd, "r"); ks = kseq_init(fp); if (optind + 2 < argc) { - fp2 = gzopen(argv[optind + 2], "r"); + ko2 = kopen(argv[optind + 2], &fd2); + fp2 = gzdopen(fd2, "r"); ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } @@ -77,10 +83,10 @@ int main_mem(int argc, char *argv[]) free(opt); bwa_idx_destroy(idx); kseq_destroy(ks); - gzclose(fp); + gzclose(fp); kclose(ko); if (ks2) { kseq_destroy(ks2); - gzclose(fp2); + gzclose(fp2); kclose(ko2); } return 0; } diff --git a/kopen.c b/kopen.c new file mode 100644 index 0000000..f72735c --- /dev/null +++ b/kopen.c @@ -0,0 +1,343 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#include +#include +#endif + +#ifdef _WIN32 +#define _KO_NO_NET +#endif + +#ifndef _KO_NO_NET +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); + if (ret == -1) perror("select"); + return ret; +} + +static int socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + int on = 1, fd; + struct linger lng = { 0, 0 }; + struct addrinfo hints, *res = 0; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + if (connect(fd, 
res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + return fd; +#undef __err_connect +} + +static int http_open(const char *fn) +{ + char *p, *proxy, *q, *http_host, *host, *port, *path, *buf; + int fd, ret, l; + + /* parse URL; adapted from khttp_parse_url() in knetfile.c */ + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + http_host = calloc(l + 1, 1); + strncpy(http_host, fn + 7, l); + http_host[l] = 0; + for (q = http_host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + // get http_proxy + proxy = getenv("http_proxy"); + // set host, port and path + if (proxy == 0) { + host = strdup(http_host); // when there is no proxy, server name is identical to http_host name. + port = strdup(*q? q : "80"); + path = strdup(*p? p : "/"); + } else { + host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); + for (q = host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + port = strdup(*q? q : "80"); + path = strdup(fn); + } + + /* connect; adapted from khttp_connect() in knetfile.c */ + l = 0; + fd = socket_connect(host, port); + buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. 
+ l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", path, http_host); + l += sprintf(buf + l, "\r\n"); + write(fd, buf, l); + l = 0; + while (read(fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + buf[l] = 0; + if (l < 14) { // prematured header + close(fd); + fd = -1; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret != 200) { + close(fd); + fd = -1; + } + free(buf); free(http_host); free(host); free(port); free(path); + return fd; +} + +typedef struct { + int max_response, ctrl_fd; + char *response; +} ftpaux_t; + +static int kftp_get_response(ftpaux_t *aux) +{ + unsigned char c; + int n = 0; + char *p; + if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0; + while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + if (n >= aux->max_response) { + aux->max_response = aux->max_response? aux->max_response<<1 : 256; + aux->response = realloc(aux->response, aux->max_response); + } + aux->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(aux->response[0]) && isdigit(aux->response[1]) && isdigit(aux->response[2]) + && aux->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + aux->response[n-2] = 0; + return strtol(aux->response, &p, 0); +} + +static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get) +{ + if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + write(aux->ctrl_fd, cmd, strlen(cmd)); + return is_get? 
kftp_get_response(aux) : 0; +} + +static int ftp_open(const char *fn) +{ + char *p, *host = 0, *port = 0, *retr = 0; + char host2[80], port2[10]; + int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4]; + ftpaux_t aux; + + /* parse URL */ + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + port = strdup("21"); + host = calloc(l + 1, 1); + strncpy(host, fn + 6, l); + retr = calloc(strlen(p) + 8, 1); + sprintf(retr, "RETR %s\r\n", p); + + /* connect to ctrl */ + memset(&aux, 0, sizeof(ftpaux_t)); + aux.ctrl_fd = socket_connect(host, port); + if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */ + + /* connect to the data stream */ + kftp_get_response(&aux); + kftp_send_cmd(&aux, "USER anonymous\r\n", 1); + kftp_send_cmd(&aux, "PASS kopen@\r\n", 1); + kftp_send_cmd(&aux, "TYPE I\r\n", 1); + kftp_send_cmd(&aux, "PASV\r\n", 1); + for (p = aux.response; *p && *p != '('; ++p); + if (*p != '(') goto ftp_open_end; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(pasv_ip, v, 4 * sizeof(int)); + pasv_port = (v[4]<<8&0xff00) + v[5]; + kftp_send_cmd(&aux, retr, 0); + sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]); + sprintf(port2, "%d", pasv_port); + fd = socket_connect(host2, port2); + if (fd == -1) goto ftp_open_end; + ret = kftp_get_response(&aux); + if (ret != 150) { + close(fd); + fd = -1; + } + close(aux.ctrl_fd); + +ftp_open_end: + free(host); free(port); free(retr); free(aux.response); + return fd; +} +#endif /* !defined(_KO_NO_NET) */ + +static char **cmd2argv(const char *cmd) +{ + int i, beg, end, argc; + char **argv, *p, *q, *str; + end = strlen(cmd); + for (i = end - 1; i >= 0; --i) + if (!isspace(cmd[i])) break; + end = i + 1; + for (beg = 0; beg < end; ++beg) + if (!isspace(cmd[beg])) break; + if (beg == end) return 0; + for (i = beg + 1, argc = 0; i < end; ++i) + if (isspace(cmd[i]) && 
!isspace(cmd[i-1])) + ++argc; + argv = (char**)calloc(argc + 2, sizeof(void*)); + argv[0] = str = (char*)calloc(end - beg + 1, 1); + strncpy(argv[0], cmd + beg, end - beg); + for (i = argc = 1, q = p = str; i < end - beg; ++i) + if (isspace(str[i])) str[i] = 0; + else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i]; + return argv; +} + +#define KO_STDIN 1 +#define KO_FILE 2 +#define KO_PIPE 3 +#define KO_HTTP 4 +#define KO_FTP 5 + +typedef struct { + int type, fd; + pid_t pid; +} koaux_t; + +void *kopen(const char *fn, int *_fd) +{ + koaux_t *aux = 0; + *_fd = -1; + if (strstr(fn, "http://") == fn) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_HTTP; + aux->fd = http_open(fn); + } else if (strstr(fn, "ftp://") == fn) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_FTP; + aux->fd = ftp_open(fn); + } else if (strcmp(fn, "-") == 0) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_STDIN; + aux->fd = STDIN_FILENO; + } else { + const char *p, *q; + for (p = fn; *p; ++p) + if (!isspace(*p)) break; + if (*p == '<') { // pipe open + int need_shell, pfd[2]; + pid_t pid; + // a simple check to see if we need to invoke a shell; not always working + for (q = p + 1; *q; ++q) + if (ispunct(*q) && *q != '.' 
&& *q != '_' && *q != '-' && *q != ':') + break; + need_shell = (*q != 0); + pipe(pfd); + pid = vfork(); + if (pid == -1) { /* vfork() error */ + close(pfd[0]); close(pfd[1]); + return 0; + } + if (pid == 0) { /* the child process */ + char **argv; /* FIXME: I do not know if this will lead to a memory leak */ + close(pfd[0]); + dup2(pfd[1], STDOUT_FILENO); + close(pfd[1]); + if (!need_shell) { + argv = cmd2argv(p + 1); + execvp(argv[0], argv); + free(argv[0]); free(argv); + } else execl("/bin/sh", "sh", "-c", p + 1, NULL); + exit(1); + } else { /* parent process */ + close(pfd[1]); + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_PIPE; + aux->fd = pfd[0]; + aux->pid = pid; + } + } else { +#ifdef _WIN32 + *_fd = open(fn, O_RDONLY | O_BINARY); +#else + *_fd = open(fn, O_RDONLY); +#endif + if (*_fd) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_FILE; + aux->fd = *_fd; + } + } + } + *_fd = aux->fd; + return aux; +} + +int kclose(void *a) +{ + koaux_t *aux = (koaux_t*)a; + if (aux->type == KO_PIPE) { + int status; + pid_t pid; + pid = waitpid(aux->pid, &status, WNOHANG); + if (pid != aux->pid) kill(aux->pid, 15); + } + return 0; +} + +#ifdef _KO_MAIN +#define BUF_SIZE 0x10000 +int main(int argc, char *argv[]) +{ + void *x; + int l, fd; + unsigned char buf[BUF_SIZE]; + FILE *fp; + if (argc == 1) { + fprintf(stderr, "Usage: kopen \n"); + return 1; + } + x = kopen(argv[1], &fd); + fp = fdopen(fd, "r"); + if (fp == 0) { + fprintf(stderr, "ERROR: fail to open the input\n"); + return 1; + } + do { + if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0) + fwrite(buf, 1, l, stdout); + } while (l == BUF_SIZE); + fclose(fp); + kclose(x); + return 0; +} +#endif From cda85be059ad845edaf89d020c0d5edd35f04187 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 17:15:07 -0500 Subject: [PATCH 134/169] fixed a couple bugs identified by gcc Recent gcc is better. 
--- bwamem.c | 2 +- bwase.c | 4 ++-- kopen.c | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 553fe1c..7daa5d2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -569,7 +569,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } - if (bwa_rg_id) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); free(cigar); diff --git a/bwase.c b/bwase.c index 27da794..2dd783b 100644 --- a/bwase.c +++ b/bwase.c @@ -434,7 +434,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in err_printf("%s", p->qual); } else err_printf("*"); - if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); if (p->type != BWA_TYPE_NO_MATCH) { @@ -482,7 +482,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality err_printf("%s", p->qual); } else err_printf("*"); - if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); putchar('\n'); diff --git a/kopen.c b/kopen.c index f72735c..8c191bc 100644 --- a/kopen.c +++ b/kopen.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifndef _WIN32 #include From ee59a131094ec3d0576bd72ed6421fc15b655397 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 12:17:29 -0500 Subject: [PATCH 135/169] simplified bwamem.h Hide mem_seed_t and 
mem_chain_t. Don't expose unnecessary routines. --- bwamem.c | 17 +++++++++++++---- bwamem.h | 44 ++++++++------------------------------------ bwamem_pair.c | 9 +++++++++ 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7daa5d2..b6741d2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -155,6 +155,19 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) * Chaining while finding SMEMs * ********************************/ +typedef struct { + int64_t rbeg; + int32_t qbeg, len; +} mem_seed_t; + +typedef struct { + int n, m; + int64_t pos; + mem_seed_t *seeds; +} mem_chain_t; + +typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; + #include "kbtree.h" #define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) @@ -398,10 +411,6 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT free(z.a); } -/************************ - * Pick paired-end hits * - ************************/ - /**************************************** * Construct the alignment from a chain * ****************************************/ diff --git a/bwamem.h b/bwamem.h index ce27c6e..27a3dc1 100644 --- a/bwamem.h +++ b/bwamem.h @@ -11,11 +11,6 @@ struct __smem_i; typedef struct __smem_i smem_i; -typedef struct { - int64_t rbeg; - int32_t qbeg, len; -} mem_seed_t; - #define MEM_F_HARDCLIP 0x1 #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 @@ -35,12 +30,6 @@ typedef struct { int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; -typedef struct { - int n, m; - int64_t pos; - mem_seed_t *seeds; -} mem_chain_t; - typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain @@ -60,43 +49,26 @@ typedef struct { int score, sub; } bwahit_t; -typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; -extern int mem_verbose; - #ifdef __cplusplus extern "C" { #endif 
-smem_i *smem_itr_init(const bwt_t *bwt); -void smem_itr_destroy(smem_i *itr); -void smem_set_query(smem_i *itr, int len, const uint8_t *query); -const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width); + smem_i *smem_itr_init(const bwt_t *bwt); + void smem_itr_destroy(smem_i *itr); + void smem_set_query(smem_i *itr, int len, const uint8_t *query); + const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width); -mem_opt_t *mem_opt_init(void); -void mem_fill_scmat(int a, int b, int8_t mat[25]); + mem_opt_t *mem_opt_init(void); + void mem_fill_scmat(int a, int b, int8_t mat[25]); -mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); -int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains); -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *a); -uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); -int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); - -void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); + void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); #ifdef __cplusplus } #endif -static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) -{ - int64_t p2; - int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); - p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand - *dist = p2 > b1? p2 - b1 : b1 - p2; - return (r1 == r2? 0 : 1) ^ (p2 > b1? 
0 : 3); -} - #endif diff --git a/bwamem_pair.c b/bwamem_pair.c index 57a128a..51f51c9 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -15,6 +15,15 @@ #define MAPPING_BOUND 3.0 #define MAX_STDDEV 4.0 +static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) +{ + int64_t p2; + int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); + p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand + *dist = p2 > b1? p2 - b1 : b1 - p2; + return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3); +} + static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) { int j; From 6bdccf2a8acf8c8b4d6c397ab0c6e9b7f9906ae7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 13:09:29 -0500 Subject: [PATCH 136/169] added a bit documentation --- bwa.h | 1 + bwamem.c | 16 +++++++------- bwamem.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++----- fastmap.c | 2 +- 4 files changed, 68 insertions(+), 14 deletions(-) diff --git a/bwa.h b/bwa.h index 208db6a..d4ca807 100644 --- a/bwa.h +++ b/bwa.h @@ -34,6 +34,7 @@ extern "C" { char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); + bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); diff --git a/bwamem.c b/bwamem.c index b6741d2..4fffe38 100644 --- a/bwamem.c +++ b/bwamem.c @@ -6,6 +6,7 @@ #ifdef HAVE_PTHREAD #include #endif + #include "kstring.h" #include "bwamem.h" #include "bntseq.h" @@ -632,19 +633,19 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } -static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { int i, j; mem_chain_v chn; mem_alnreg_v regs, tmp; - for (i = 0; i < s->l_seq; ++i) - s->seq[i] = nst_nt4_table[(int)s->seq[i]]; - chn = mem_chain(opt, bwt, 
s->l_seq, (uint8_t*)s->seq); + for (i = 0; i < l_seq; ++i) + seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; + chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); kv_init(regs); kv_init(tmp); for (i = 0; i < chn.n; ++i) { - mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], &tmp); + mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, &chn.a[i], &tmp); for (j = 0; j < tmp.n; ++j) kv_push(mem_alnreg_t, regs, tmp.a[j]); free(chn.a[i].seeds); @@ -670,7 +671,7 @@ static void *worker1(void *data) worker_t *w = (worker_t*)data; int i; for (i = w->start; i < w->n; i += w->step) - w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); return 0; } @@ -696,7 +697,7 @@ static void *worker2(void *data) return 0; } -int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) +void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) { int i; worker_t *w; @@ -737,5 +738,4 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); } free(regs); free(w); - return 0; } diff --git a/bwamem.h b/bwamem.h index 27a3dc1..fa55b44 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,10 +31,14 @@ typedef struct { } mem_opt_t; typedef struct { - int64_t rb, re; - int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain - int sub_n; // approximate number of suboptimal hits - int secondary; // non-negative if the hit is secondary + int64_t rb, re; // [rb,re): reference sequence in the alignment + int qb, qe; // [qb,qe): query sequence in the alignment + int score; // best SW 
score + int sub; // 2nd best SW score + int csub; // SW score of a tandem hit + int sub_n; // approximate number of suboptimal hits + int seedcov; // length of regions coverged by seeds + int secondary; // index of the parent hit shadowing the current hit; <0 if primary } mem_alnreg_t; typedef struct { @@ -63,8 +67,57 @@ extern "C" { mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); - int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); + /** + * Align a batch of sequences and generate the alignments in the SAM format + * + * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam. + * Note that $seqs[i].sam may consist of several SAM lines if the + * corresponding sequence has multiple primary hits. + * + * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query + * sequences must be interleaved: $n must be an even number and the 2i-th + * sequence and the (2i+1)-th sequence constitute a read pair. In this + * mode, there should be enough (typically >50) unique pairs for the + * routine to infer the orientation and insert size. + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param n number of query sequences + * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call + */ + void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); + /** + * Find the aligned regions for one query sequence + * + * Note that this routine does not generate CIGAR. CIGAR should be + * generated later by bwa_gen_cigar() defined in bwa.c. 
+ * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param l_seq length of query sequence + * @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied + * + * @return list of aligned regions. + */ + mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + + /** + * Infer the insert size distribution from interleaved alignment regions + * + * This function can be called after mem_align1(), as long as paired-end + * reads are properly interleaved. + * + * @param opt alignment parameters + * @param l_pac length of concatenated reference sequence + * @param n number of query sequences; must be an even number + * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair + * @param pes inferred insert size distribution (output) + */ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); #ifdef __cplusplus diff --git a/fastmap.c b/fastmap.c index 90307b3..819e301 100644 --- a/fastmap.c +++ b/fastmap.c @@ -67,7 +67,7 @@ int main_mem(int argc, char *argv[]) ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } - while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { + while ((seqs = bseq_read(opt->chunk_size * (ko2? 
2 : 1), &n, ks, ks2)) != 0) { int64_t size = 0; if (!copy_comment) for (i = 0; i < n; ++i) { From 85775c338432818d1b05805a0357b194e634cb2c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 13:23:43 -0500 Subject: [PATCH 137/169] output multiple hits --- bwamem.c | 10 ++++++---- bwamem.h | 1 + fastmap.c | 6 +++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 4fffe38..ae1886a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -554,7 +554,9 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } else kputw(0, str); kputc('\t', str); } else kputsn("\t*\t0\t0\t", 7, str); - if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand + if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL + kputsn("*\t*", 3, str); + } else if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand int i, qb = 0, qe = s->l_seq; if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; ks_resize(str, str->l + (qe - qb) + 1); @@ -610,7 +612,7 @@ void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) { h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe; h->score = a->score; - h->sub = a->sub > a->csub? a->sub : a->csub; + h->sub = a->secondary >= 0? -1 : a->sub > a->csub? a->sub : a->csub; h->qual = 0; // quality unset h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set } @@ -623,10 +625,10 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (a->n > 0) { for (k = 0; k < a->n; ++k) { bwahit_t h; - if (a->a[k].secondary >= 0) continue; + if (a->a[k].secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; mem_alnreg2hit(&a->a[k], &h); h.flag |= extra_flag; - h.qual = mem_approx_mapq_se(opt, &a->a[k]); + h.qual = a->a[k].secondary >= 0? 
0 : mem_approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); diff --git a/bwamem.h b/bwamem.h index fa55b44..5cf3ac5 100644 --- a/bwamem.h +++ b/bwamem.h @@ -14,6 +14,7 @@ typedef struct __smem_i smem_i; #define MEM_F_HARDCLIP 0x1 #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 +#define MEM_F_ALL 0x8 typedef struct { int a, b, q, r, w; diff --git a/fastmap.c b/fastmap.c index 819e301..49c46fd 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,11 +26,12 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "CPHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "aCPHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; + else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); @@ -49,6 +50,9 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n"); fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); + fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -P perform mate SW only but skip pairing\n"); + fprintf(stderr, " -H hard clipping\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, "\n"); free(opt); From 0b4a40dc25f3191bb1af289af03f568fb6563de3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 
24 Feb 2013 13:34:20 -0500 Subject: [PATCH 138/169] updated revision number; to merge into master --- fastmap.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastmap.c b/fastmap.c index 49c46fd..a5cafd7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -48,7 +48,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); - fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n"); + fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -P perform mate SW only but skip pairing\n"); diff --git a/main.c b/main.c index dbe9dd0..1e12cfd 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r132" +#define PACKAGE_VERSION "0.6.2-r270-beta" #endif static int usage() From 29e41b592c471e0dc09cbcbce32294af10cfd3aa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 23:00:51 -0500 Subject: [PATCH 139/169] bugfix: isize is off by 1 --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index ae1886a..fe0ecbe 100644 --- a/bwamem.c +++ b/bwamem.c @@ -550,7 +550,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons if (mid == rid) { int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; - kputw(p0 - p1, str); + kputw(p0 - p1 + (p0 > p1? 
1 : -1), str); } else kputw(0, str); kputc('\t', str); } else kputsn("\t*\t0\t0\t", 7, str); From 570e082b38a4b8d124afd0b690c2dc4b6a5c0fdd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 23:45:40 -0500 Subject: [PATCH 140/169] change CC back to gcc --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f1da07e..de45ff1 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CC= clang +CC= gcc CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar From 4dc982a3c72d69835682d6f930b66333c622f367 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 00:13:32 -0500 Subject: [PATCH 141/169] support interleaved fastq --- fastmap.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fastmap.c b/fastmap.c index a5cafd7..4cf92b2 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,12 +26,13 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "aCPHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "paCPHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'a') opt->flag |= MEM_F_ALL; + else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); @@ -42,7 +43,7 @@ int main_mem(int argc, char *argv[]) } if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] \n\n"); + fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); @@ -51,6 +52,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -P perform mate SW only but skip pairing\n"); fprintf(stderr, " -H hard clipping\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); @@ -58,6 +60,7 @@ int main_mem(int argc, char *argv[]) free(opt); return 1; } + mem_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak bwa_print_sam_hdr(idx->bns, rg_line); @@ -66,10 +69,15 @@ int main_mem(int argc, char *argv[]) fp = gzdopen(fd, "r"); ks = kseq_init(fp); if (optind + 2 < argc) { - ko2 = kopen(argv[optind + 2], &fd2); - fp2 = gzdopen(fd2, "r"); - ks2 = kseq_init(fp2); - opt->flag |= MEM_F_PE; + if (opt->flag&MEM_F_PE) { + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__); + } 
else { + ko2 = kopen(argv[optind + 2], &fd2); + fp2 = gzdopen(fd2, "r"); + ks2 = kseq_init(fp2); + opt->flag |= MEM_F_PE; + } } while ((seqs = bseq_read(opt->chunk_size * (ko2? 2 : 1), &n, ks, ks2)) != 0) { int64_t size = 0; From 514563bd0adfa752e081e3c0e4c6d13277962731 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 10:54:12 -0500 Subject: [PATCH 142/169] no poor hits with -a; reduce mapq for 2nd primary --- bwamem.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bwamem.c b/bwamem.c index fe0ecbe..edabd38 100644 --- a/bwamem.c +++ b/bwamem.c @@ -623,12 +623,16 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kstring_t str; str.l = str.m = 0; str.s = 0; if (a->n > 0) { + int mapq0 = -1; for (k = 0; k < a->n; ++k) { bwahit_t h; if (a->a[k].secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; + if (a->a[k].secondary >= 0 && a->a[k].score < a->a[a->a[k].secondary].score * .5) continue; mem_alnreg2hit(&a->a[k], &h); h.flag |= extra_flag; h.qual = a->a[k].secondary >= 0? 0 : mem_approx_mapq_se(opt, &a->a[k]); + if (k == 0) mapq0 = h.qual; + else if (h.qual > mapq0) h.qual = mapq0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); From 5ead86acd35a7703b36794fbf04973e391014ea7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 11:18:35 -0500 Subject: [PATCH 143/169] optionally mark split hit as secondary --- bwamem.c | 14 +++++++++----- bwamem.h | 1 + fastmap.c | 33 +++++++++++++++++++-------------- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/bwamem.c b/bwamem.c index edabd38..2d326e2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -515,13 +515,15 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons p->flag |= m && m->rb >= bns->l_pac? 
0x20 : 0; // is mate on reverse strand kputs(s->name, str); kputc('\t', str); if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate + int sam_flag = p->flag&0xff; // the flag that will be output to SAM; it is not always the same as p->flag + if (sam_flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - kputw(p->flag, str); kputc('\t', str); + kputw(sam_flag, str); kputc('\t', str); kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); kputw(p->qual, str); kputc('\t', str); if (n_cigar) { @@ -626,11 +628,13 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b int mapq0 = -1; for (k = 0; k < a->n; ++k) { bwahit_t h; - if (a->a[k].secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; - if (a->a[k].secondary >= 0 && a->a[k].score < a->a[a->a[k].secondary].score * .5) continue; - mem_alnreg2hit(&a->a[k], &h); + mem_alnreg_t *p = &a->a[k]; + if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; + if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; + mem_alnreg2hit(p, &h); h.flag |= extra_flag; - h.qual = a->a[k].secondary >= 0? 0 : mem_approx_mapq_se(opt, &a->a[k]); + if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard) + h.qual = p->secondary >= 0?
0 : mem_approx_mapq_se(opt, p); if (k == 0) mapq0 = h.qual; else if (h.qual > mapq0) h.qual = mapq0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); diff --git a/bwamem.h b/bwamem.h index 5cf3ac5..6ab2b01 100644 --- a/bwamem.h +++ b/bwamem.h @@ -15,6 +15,7 @@ typedef struct __smem_i smem_i; #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 +#define MEM_F_NO_MULTI 0x16 typedef struct { int a, b, q, r, w; diff --git a/fastmap.c b/fastmap.c index 4cf92b2..72aea0b 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paCPHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE; + else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); @@ -43,19 +44,23 @@ int main_mem(int argc, char *argv[]) } if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); - fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); - fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); - fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); - 
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); - fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); - fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); - fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); - fprintf(stderr, " -P perform mate SW only but skip pairing\n"); - fprintf(stderr, " -H hard clipping\n"); - fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); + fprintf(stderr, "Algorithm options:\n\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); + fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); + fprintf(stderr, " -P skip pairing; perform mate SW only\n"); + fprintf(stderr, "\nInput/output options:\n\n"); + fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); + fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, " -H hard clipping\n"); + fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); fprintf(stderr, "\n"); free(opt); return 1; From 5092211d75f5088824b79ca292620799be951529 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 11:24:21 -0500 Subject: [PATCH 144/169] controllable 
scoring matrix --- fastmap.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index 72aea0b..77b3d75 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,8 +26,12 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 'A') opt->a = atoi(optarg); + else if (c == 'B') opt->b = atoi(optarg); + else if (c == 'O') opt->q = atoi(optarg); + else if (c == 'E') opt->r = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; @@ -52,6 +56,10 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -P skip pairing; perform mate SW only\n"); + fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); + fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); + fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); + fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); From e9e5ee6a3d1a0d185b74a250c7353b79646a5436 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 11:34:06 -0500 Subject: [PATCH 145/169] r277: updated the revision number --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 
1e12cfd..74980c9 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r270-beta" +#define PACKAGE_VERSION "0.6.2-r277-beta" #endif static int usage() From 9957e04590da85875f6671ca317926a5e329b971 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 11:56:02 -0500 Subject: [PATCH 146/169] r278: don't perform too many mate-sw --- bwamem.c | 24 ++++++++++++------------ bwamem.h | 28 ++++++++++++++++------------ bwamem_pair.c | 2 +- fastmap.c | 4 +++- main.c | 2 +- 5 files changed, 33 insertions(+), 27 deletions(-) diff --git a/bwamem.c b/bwamem.c index 2d326e2..88086ee 100644 --- a/bwamem.c +++ b/bwamem.c @@ -14,17 +14,6 @@ #include "kvec.h" #include "ksort.h" -void mem_fill_scmat(int a, int b, int8_t mat[25]) -{ - int i, j, k; - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; -} - /* Theory on probability and scoring *ungapped* alignment * * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution @@ -64,12 +53,23 @@ mem_opt_t *mem_opt_init() o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; - o->pe_dir = 0<<1|1; o->pen_unpaired = 9; + o->max_matesw = 100; mem_fill_scmat(o->a, o->b, o->mat); return o; } +void mem_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 
a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} + /*************************** * SMEM iterator interface * ***************************/ diff --git a/bwamem.h b/bwamem.h index 6ab2b01..d99a9da 100644 --- a/bwamem.h +++ b/bwamem.h @@ -18,18 +18,22 @@ typedef struct __smem_i smem_i; #define MEM_F_NO_MULTI 0x16 typedef struct { - int a, b, q, r, w; - int flag; - int split_width; - int min_seed_len, max_occ, max_chain_gap; - int n_threads, chunk_size; - int pe_dir; - float mask_level; - float chain_drop_ratio; - float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor - int pen_unpaired; // phred-scaled penalty for unpaired reads - int max_ins; // maximum insert size - int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset + int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r + int w; // band width + int flag; // see MEM_F_* macros + int min_seed_len; // minimum seed length + float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor + int split_width; // split into a seed if its occurrence is smaller than this value + int max_occ; // skip a seed if its occurrence is larger than this value + int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed + int n_threads; // number of threads + int chunk_size; // process chunk_size-bp sequences in a batch + float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits + float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain + int pen_unpaired; // phred-scaled penalty for unpaired reads + int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value + int max_matesw; // perform maximally max_matesw rounds of
mate-SW for each end + int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; typedef struct { diff --git a/bwamem_pair.c b/bwamem_pair.c index 51f51c9..3ef71ea 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -246,7 +246,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) kv_push(mem_alnreg_t, b[i], a[i].a[j]); for (i = 0; i < 2; ++i) - for (j = 0; j < b[i].n; ++j) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); mem_mark_primary_se(opt, a[0].n, a[0].a); diff --git a/fastmap.c b/fastmap.c index 77b3d75..b4f8ea8 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,8 +26,9 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); @@ -52,6 +53,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "Algorithm options:\n\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); diff --git a/main.c b/main.c index 74980c9..9ef33fa 100644 --- a/main.c +++ b/main.c @@ 
-4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r277-beta" +#define PACKAGE_VERSION "0.6.2-r278-beta" #endif static int usage() From 20aa848b3c4b48382dc24112e6da6bfad1c991ba Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 13:00:35 -0500 Subject: [PATCH 147/169] r279: for PE mapq, consider the number of pairs If there are a lot of proper pairs, it is more likely that the best pair is wrong. --- bwamem.c | 2 +- bwamem.h | 2 +- bwamem_pair.c | 44 ++++++++++++++++++++++++-------------------- main.c | 2 +- 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/bwamem.c b/bwamem.c index 88086ee..a1f25b7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -604,7 +604,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; - if (a->sub_n) mapq -= (int)(4.343 * log(a->sub_n) + .499); + if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; return mapq; diff --git a/bwamem.h b/bwamem.h index d99a9da..7fc2c85 100644 --- a/bwamem.h +++ b/bwamem.h @@ -5,7 +5,7 @@ #include "bntseq.h" #include "bwa.h" -#define MEM_MAPQ_COEF 40.0 +#define MEM_MAPQ_COEF 30.0 #define MEM_MAPQ_MAX 60 struct __smem_i; diff --git a/bwamem_pair.c b/bwamem_pair.c index 3ef71ea..3fbdec7 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -167,13 +167,12 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } -int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int z[2]) +int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, 
int *sub, int *n_sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); - pair64_v v; - pair64_t o, subo; // .x: score<<32 | raw_score<<8 | hash; .y: pair - int r, i, k, y[4]; // y[] keeps the last hit - kv_init(v); + pair64_v v, u; + int r, i, k, y[4], ret; // y[] keeps the last hit + kv_init(v); kv_init(u); for (r = 0; r < 2; ++r) { // loop through read number for (i = 0; i < a[r].n; ++i) { pair64_t key; @@ -185,7 +184,6 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - o.x = subo.x = o.y = subo.y = 0; //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction @@ -197,7 +195,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int64_t dist; int q; double ns; - uint64_t x, pair; + pair64_t *p; if ((v.a[k].y&3) != which) continue; dist = (int64_t)v.a[i].x - v.a[k].x; //printf("%d: %lld\n", k, dist); @@ -206,23 +204,27 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ ns = (dist - pes[dir].avg) / pes[dir].std; q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4) if (q < 0) q = 0; - pair = (uint64_t)k<<32 | i; - x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); + p = kv_pushp(pair64_t, u); + p->y = (uint64_t)k<<32 | i; + p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU); //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); - if (x > o.x) subo = o, o.x = x, o.y = pair; - else if (x > subo.x) subo.x = x, subo.y = pair; } } y[v.a[i].y&3] = i; } - if (o.x > 0) { - i = o.y >> 32; k = o.y << 32 >> 32; - z[v.a[i].y&1] = v.a[i].y<<32>>34; + if (u.n) { // found at least one proper pair + int tmp = opt->a + opt->b > opt->q + opt->r? 
opt->a + opt->b : opt->q + opt->r; + ks_introsort_128(u.n, u.a); + i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; + z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair z[v.a[k].y&1] = v.a[k].y<<32>>34; - } - free(v.a); - *sub = subo.x>>32; - return o.x>>32; + ret = u.a[u.n-1].x >> 32; + *sub = u.n > 1? u.a[u.n-2].x>>32 : 0; + for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i) + if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub; + } else ret = 0, *sub = 0, *n_sub = 0; + free(u.a); free(v.a); + return ret; } int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) @@ -233,7 +235,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m); - int n = 0, i, j, z[2], o, subo; + int n = 0, i, j, z[2], o, subo, n_sub; kstring_t str; mem_alnreg_v b[2]; bwahit_t h[2]; @@ -253,7 +255,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co mem_mark_primary_se(opt, a[1].n, a[1].a); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits - if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { + if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { @@ -267,6 +269,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; subo = subo > score_un? 
subo : score_un; q_pe = (o - subo) * 6; + if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); + if (q_pe < 0) q_pe = 0; if (q_pe > 60) q_pe = 60; // the following assumes no split hits if (o > score_un) { // paired alignment is preferred diff --git a/main.c b/main.c index 9ef33fa..a0e8ec6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r278-beta" +#define PACKAGE_VERSION "0.6.2-r279-beta" #endif static int usage() From d19e834d84dc9a2659b1be665cd7ff48828c3deb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 15:40:15 -0500 Subject: [PATCH 148/169] r280: align two ends in the same thread Otherwise odd-number threads may be of different speed from even-number threads. --- bwamem.c | 11 +++++++++-- main.c | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index a1f25b7..5c274b1 100644 --- a/bwamem.c +++ b/bwamem.c @@ -680,8 +680,15 @@ static void *worker1(void *data) { worker_t *w = (worker_t*)data; int i; - for (i = w->start; i < w->n; i += w->step) - w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); + if (!(w->opt->flag&MEM_F_PE)) { + for (i = w->start; i < w->n; i += w->step) + w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); + } else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower + for (i = w->start; i < w->n>>1; i += w->step) { + w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); + w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); + } + } return 0; } diff --git a/main.c b/main.c index a0e8ec6..b85757e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r279-beta" +#define 
PACKAGE_VERSION "0.6.2-r280-beta" #endif static int usage() From 30cc8a95d1fa96ec5057e05c6dc4a7fcbe92942e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 16:34:19 -0500 Subject: [PATCH 149/169] fixed an unimportant memory leak --- kopen.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kopen.c b/kopen.c index 8c191bc..45f2713 100644 --- a/kopen.c +++ b/kopen.c @@ -312,6 +312,7 @@ int kclose(void *a) pid = waitpid(aux->pid, &status, WNOHANG); if (pid != aux->pid) kill(aux->pid, 15); } + free(aux); return 0; } diff --git a/main.c b/main.c index b85757e..749c7de 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r280-beta" +#define PACKAGE_VERSION "0.6.2-r281-beta" #endif static int usage() From 77b5b586ad9914639b1da4d2289e30711237662a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 17:29:35 -0500 Subject: [PATCH 150/169] r282: set min split_len to read length --- bwamem.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 5c274b1..2c24ba5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -199,6 +199,7 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i { const bwtintv_v *a; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + split_len = split_len < itr->len? 
split_len : itr->len; while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM int i; for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start diff --git a/main.c b/main.c index 749c7de..4e2f15d 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r281-beta" +#define PACKAGE_VERSION "0.6.2-r282-beta" #endif static int usage() From 61dd3bf13a1d571f938ac2698999c5f34445f11f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 22:49:15 -0500 Subject: [PATCH 151/169] r283: prepare for fixing cross-ref aln --- bntseq.c | 29 +++++++++++++++++------------ bntseq.h | 1 + bwa.c | 4 ++++ main.c | 2 +- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/bntseq.c b/bntseq.c index 0286c19..972837e 100644 --- a/bntseq.c +++ b/bntseq.c @@ -288,21 +288,26 @@ int bwa_fa2pac(int argc, char *argv[]) return 0; } +int bns_pos2rid(const bntseq_t *bns, int64_t pos_f) +{ + int left, mid, right; + if (pos_f >= bns->l_pac) return -1; + left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { // binary search + mid = (left + right) >> 1; + if (pos_f >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pos_f < bns->anns[mid+1].offset) break; // bracketed + left = mid + 1; + } else right = mid; + } + return mid; +} + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) { int left, mid, right, nn; - if (ref_id) { - left = 0; mid = 0; right = bns->n_seqs; - while (left < right) { - mid = (left + right) >> 1; - if (pos_f >= bns->anns[mid].offset) { - if (mid == bns->n_seqs - 1) break; - if (pos_f < bns->anns[mid+1].offset) break; // bracketed - left = mid + 1; - } else right = mid; - } - *ref_id = mid; - } + if (ref_id) *ref_id = bns_pos2rid(bns, pos_f); left = 0; right = bns->n_holes; nn = 0; while (left < right) { mid = (left + right) >> 1; diff --git a/bntseq.h b/bntseq.h index 
0425540..4061438 100644 --- a/bntseq.h +++ b/bntseq.h @@ -72,6 +72,7 @@ extern "C" { bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); + int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); diff --git a/bwa.c b/bwa.c index c8400b1..f34bd12 100644 --- a/bwa.c +++ b/bwa.c @@ -64,6 +64,10 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) return seqs; } +/***************** + * CIGAR related * + *****************/ + // Generate CIGAR when the alignment end points are known uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) { diff --git a/main.c b/main.c index 4e2f15d..4bad9ee 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r282-beta" +#define PACKAGE_VERSION "0.6.2-r283-beta" #endif static int usage() From e70c7c2a71744f5a316c84dad8016d054020d425 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 00:03:49 -0500 Subject: [PATCH 152/169] r284: amend cross-reference hit I really hate this: complex and twisted logic for a nasty scenario that almost never happens to short reads - but it may become serious when the reference genome consists of many contigs. On toy examples, the code seems to work. Don't know if it really works... 
--- bwa.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ bwa.h | 1 + bwamem.c | 1 + bwamem_pair.c | 8 ++++++-- main.c | 2 +- 5 files changed, 65 insertions(+), 3 deletions(-) diff --git a/bwa.c b/bwa.c index f34bd12..e8221ca 100644 --- a/bwa.c +++ b/bwa.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "bntseq.h" #include "bwa.h" #include "ksw.h" @@ -103,6 +104,61 @@ ret_gen_cigar: return cigar; } +int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) +{ + int ib, ie, is_rev; + int64_t fb, fe, mid = -1; + if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary + *qb = *qe = *rb = *re = -1; + return -1; // unable to fix + } else { + fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev); + ib = bns_pos2rid(bns, fb); + if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix + fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev); + ie = bns_pos2rid(bns, fe); + if (ie - ib > 1) { // bridge three or more references + *qb = *qe = *rb = *re = -1; + return -2; // unable to fix + } else { + int l = bns->anns[ib].offset + bns->anns[ib].len - fb; + mid = is_rev? 
*re - l : *rb + l; + } + } + if (mid >= 0) { + int i, score, n_cigar, y; + uint32_t *cigar; + int64_t x; + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar); + for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { + int op = cigar[i]&0xf, len = cigar[i]>>4; + if (op == 0) { + if (x <= mid && mid < x + len) { + if (mid - *rb > *re - mid) { // the first part is longer + if (x == mid) { // need to check the previous operation + assert(i); // mid != *rb should always stand + if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x; + else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4); + else abort(); // should not be here + } else *qe = y + (mid - x), *re = mid; + } else *qb = y + (mid - x), *rb = mid; + break; + } else x += len, y += len; + } else if (op == 1) { // insertion + y += len; + } else if (op == 2) { // deletion + if (x <= mid && mid < x + len) { + if (mid - *rb > *re - mid) *qe = y, *re = x; + else *qb = y, *rb = x + len; + break; + } else x += len; + } else abort(); // should not be here + } + free(cigar); + } + return 1; +} + /********************* * Full index reader * *********************/ diff --git a/bwa.h b/bwa.h index d4ca807..2d6c7bf 100644 --- a/bwa.h +++ b/bwa.h @@ -31,6 +31,7 @@ extern "C" { bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); diff --git a/bwamem.c b/bwamem.c index 2c24ba5..7c837bf 100644 --- a/bwamem.c +++ b/bwamem.c @@ -633,6 +633,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t 
*pac, b if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; mem_alnreg2hit(p, &h); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re); h.flag |= extra_flag; if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard) h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p); diff --git a/bwamem_pair.c b/bwamem_pair.c index 3fbdec7..9ff12b3 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -293,7 +293,9 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); } mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re); mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s; } else goto no_pairing; @@ -301,8 +303,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co no_pairing: for (i = 0; i < 2; ++i) { - if (a[i].n) mem_alnreg2hit(&a[i].a[0], &h[i]); - else h[i].rb = h[i].re = -1; + if (a[i].n) { + mem_alnreg2hit(&a[i].a[0], &h[i]); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); + } else h[i].rb = h[i].re = -1; } mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); mem_sam_se(opt, bns, pac, &s[1], &a[1], 
0x81, &h[0]); diff --git a/main.c b/main.c index 4bad9ee..c1c232a 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r283-beta" +#define PACKAGE_VERSION "0.6.2-r284-beta" #endif static int usage() From 174fe0f1d57f54823f0ed2e83b242b4c2f5d765c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 11:14:19 -0500 Subject: [PATCH 153/169] code backup: less dependent on gcc optimization --- bwt.c | 17 ++++++++++------- bwt.h | 6 ++++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/bwt.c b/bwt.c index 7b37fe5..47b06e2 100644 --- a/bwt.c +++ b/bwt.c @@ -161,7 +161,7 @@ void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) { bwtint_t l, j, x; - uint32_t *p; + uint32_t *p, tmp; if (k == (bwtint_t)(-1)) { memset(cnt, 0, 4 * sizeof(bwtint_t)); return; @@ -171,9 +171,10 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) memcpy(cnt, p, 4 * sizeof(bwtint_t)); p += sizeof(bwtint_t); j = k >> 4 << 4; - for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p) + for (l = k & ~OCC_INTV_MASK, x = 0; l < j; l += 16, ++p) x += __occ_aux4(bwt, *p); - x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + tmp = *p & ~((1U<<((~k&15)<<1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; } @@ -188,7 +189,7 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin bwt_occ4(bwt, l, cntl); } else { bwtint_t i, j, x, y; - uint32_t *p; + uint32_t *p, tmp; if (k >= bwt->primary) --k; // because $ is not in bwt if (l >= bwt->primary) --l; p = bwt_occ_intv(bwt, k); @@ -196,14 +197,16 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin p += sizeof(bwtint_t); // prepare cntk[] j = k >> 4 << 4; - for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i 
+= 16, ++p) + for (i = k & ~OCC_INTV_MASK, x = 0; i < j; i += 16, ++p) x += __occ_aux4(bwt, *p); y = x; - x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + tmp = *p & ~((1U<<((~k&15)<<1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k&15); // calculate cntl[] and finalize cntk[] j = l >> 4 << 4; for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); - y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15); + tmp = *p & ~((1U<<((~l&15)<<1)) - 1); + y += __occ_aux4(bwt, tmp) - (~l&15); memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; diff --git a/bwt.h b/bwt.h index e06329a..ab5aecd 100644 --- a/bwt.h +++ b/bwt.h @@ -30,8 +30,10 @@ #include -// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line -#define OCC_INTERVAL 0x80 +// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 +#define OCC_INTV_SHIFT 7 +#define OCC_INTERVAL (1LL< Date: Tue, 26 Feb 2013 11:22:24 -0500 Subject: [PATCH 154/169] code backup --- bwt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bwt.c b/bwt.c index 47b06e2..ff5a4a0 100644 --- a/bwt.c +++ b/bwt.c @@ -166,7 +166,7 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) memset(cnt, 0, 4 * sizeof(bwtint_t)); return; } - if (k >= bwt->primary) --k; // because $ is not in bwt + k -= (k >= bwt->primary); // because $ is not in bwt p = bwt_occ_intv(bwt, k); memcpy(cnt, p, 4 * sizeof(bwtint_t)); p += sizeof(bwtint_t); @@ -182,16 +182,16 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) { bwtint_t _k, _l; - _k = (k >= bwt->primary)? k-1 : k; - _l = (l >= bwt->primary)? 
l-1 : l; - if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + _k = k - (k >= bwt->primary); + _l = l - (l >= bwt->primary); + if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { bwt_occ4(bwt, k, cntk); bwt_occ4(bwt, l, cntl); } else { bwtint_t i, j, x, y; uint32_t *p, tmp; - if (k >= bwt->primary) --k; // because $ is not in bwt - if (l >= bwt->primary) --l; + k -= (k >= bwt->primary); // because $ is not in bwt + l -= (l >= bwt->primary); p = bwt_occ_intv(bwt, k); memcpy(cntk, p, 4 * sizeof(bwtint_t)); p += sizeof(bwtint_t); From 264d5e42e5f5ec5f5218a88d4c8cf8d6743cc51a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 11:49:39 -0500 Subject: [PATCH 155/169] simplified bwt_occ4() a little --- bwt.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/bwt.c b/bwt.c index ff5a4a0..d57e2d5 100644 --- a/bwt.c +++ b/bwt.c @@ -160,8 +160,8 @@ void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) { - bwtint_t l, j, x; - uint32_t *p, tmp; + bwtint_t x; + uint32_t *p, tmp, *end; if (k == (bwtint_t)(-1)) { memset(cnt, 0, 4 * sizeof(bwtint_t)); return; @@ -169,10 +169,9 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) k -= (k >= bwt->primary); // because $ is not in bwt p = bwt_occ_intv(bwt, k); memcpy(cnt, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); - j = k >> 4 << 4; - for (l = k & ~OCC_INTV_MASK, x = 0; l < j; l += 16, ++p) - x += __occ_aux4(bwt, *p); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) + end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop + for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p); tmp = *p & ~((1U<<((~k&15)<<1)) - 1); x += __occ_aux4(bwt, tmp) - (~k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; 
@@ -188,23 +187,22 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin bwt_occ4(bwt, k, cntk); bwt_occ4(bwt, l, cntl); } else { - bwtint_t i, j, x, y; - uint32_t *p, tmp; + bwtint_t x, y; + uint32_t *p, tmp, *endk, *endl; k -= (k >= bwt->primary); // because $ is not in bwt l -= (l >= bwt->primary); p = bwt_occ_intv(bwt, k); memcpy(cntk, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) // prepare cntk[] - j = k >> 4 << 4; - for (i = k & ~OCC_INTV_MASK, x = 0; i < j; i += 16, ++p) - x += __occ_aux4(bwt, *p); + endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); + endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4)); + for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p); y = x; tmp = *p & ~((1U<<((~k&15)<<1)) - 1); x += __occ_aux4(bwt, tmp) - (~k&15); // calculate cntl[] and finalize cntk[] - j = l >> 4 << 4; - for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); + for (; p < endl; ++p) y += __occ_aux4(bwt, *p); tmp = *p & ~((1U<<((~l&15)<<1)) - 1); y += __occ_aux4(bwt, tmp) - (~l&15); memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); From fd6706420788f9a80996d1f22385291f9adcffed Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 11:51:03 -0500 Subject: [PATCH 156/169] removed an unnecessary condition --- bwt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bwt.c b/bwt.c index d57e2d5..ab0a6fc 100644 --- a/bwt.c +++ b/bwt.c @@ -324,8 +324,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, kv_push(bwtintv_t, *mem, ik); } } // otherwise the match is contained in another longer match - } - if (c >= 0 && ok[c].x[2] >= min_intv && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) { ok[c].info = p->info; kv_push(bwtintv_t, *curr, ok[c]); } From 80e1137a6c27f9d2984f28400175a0e9d96eb82a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 
26 Feb 2013 11:57:36 -0500 Subject: [PATCH 157/169] move bwt_invPsi() from bwt.h to bwt.c --- bwt.c | 8 ++++++++ bwt.h | 17 ----------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/bwt.c b/bwt.c index ab0a6fc..43979ac 100644 --- a/bwt.c +++ b/bwt.c @@ -45,6 +45,14 @@ void bwt_gen_cnt_table(bwt_t *bwt) } } +static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA +{ + bwtint_t x = k - (k > bwt->primary); + x = bwt_B0(bwt, x); + x = bwt->L2[x] + bwt_occ(bwt, k, x); + return k == bwt->primary? 0 : x; +} + // bwt->bwt and bwt->occ must be precalculated void bwt_cal_sa(bwt_t *bwt, int intv) { diff --git a/bwt.h b/bwt.h index ab5aecd..e7b0f97 100644 --- a/bwt.h +++ b/bwt.h @@ -124,21 +124,4 @@ extern "C" { } #endif -// inverse Psi function -#if 0 -#define bwt_invPsi(bwt, k) \ - (((k) == (bwt)->primary)? 0 : \ - ((k) < (bwt)->primary)? \ - (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ - : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) -#else -static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) -{ - bwtint_t x = k - (k > bwt->primary); - x = bwt_B0(bwt, x); - x = bwt->L2[x] + bwt_occ(bwt, k, x); - return k == bwt->primary? 
0 : x; -} -#endif - #endif From aa92c720b5ddf2d6107a537ed8f7bb6d94fbc1bb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 12:09:28 -0500 Subject: [PATCH 158/169] cleanup bwt_occ() --- bwt.c | 11 +++++------ kopen.c | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/bwt.c b/bwt.c index 43979ac..4ee9ea8 100644 --- a/bwt.c +++ b/bwt.c @@ -105,21 +105,20 @@ static inline int __occ_aux(uint64_t y, int c) bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) { - bwtint_t n, l, j; - uint32_t *p; + bwtint_t n; + uint32_t *p, *end; if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; if (k == (bwtint_t)(-1)) return 0; - if (k >= bwt->primary) --k; // because $ is not in bwt + k -= (k >= bwt->primary); // because $ is not in bwt // retrieve Occ at k/OCC_INTERVAL n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; p += sizeof(bwtint_t); // jump to the start of the first BWT cell // calculate Occ up to the last k/32 - j = k >> 5 << 5; - for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2) - n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1); + for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); // calculate Occ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); diff --git a/kopen.c b/kopen.c index 45f2713..8887932 100644 --- a/kopen.c +++ b/kopen.c @@ -203,7 +203,7 @@ ftp_open_end: static char **cmd2argv(const char *cmd) { int i, beg, end, argc; - char **argv, *p, *q, *str; + char **argv, *str; end = strlen(cmd); for (i = end - 1; i >= 0; --i) if (!isspace(cmd[i])) break; @@ -217,7 +217,7 @@ static char **cmd2argv(const char *cmd) argv = (char**)calloc(argc + 2, sizeof(void*)); argv[0] = str = (char*)calloc(end - beg + 1, 1); strncpy(argv[0], cmd + beg, end - beg); - for (i = argc = 1, q = p = str; i < end - beg; ++i) + for (i = argc = 1; i < end - beg; ++i) if (isspace(str[i])) str[i] = 0; else if (str[i] && str[i-1] == 0) argv[argc++] = 
&str[i]; return argv; From bfb2583d7f52a021bb1ae5d7564f3cf1014a9a0a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 12:10:19 -0500 Subject: [PATCH 159/169] r291: summary - bwt.c micro optimization --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index c1c232a..5930a78 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r284-beta" +#define PACKAGE_VERSION "0.6.2-r291-beta" #endif static int usage() From c6b226d71971cfdacb9792ac5cb278ed3615094b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 12:49:48 -0500 Subject: [PATCH 160/169] r292: fixed a very stupid bug on CLI I was thinking 0x10 or 16, but wrote 0x16... --- bwamem.h | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.h b/bwamem.h index 7fc2c85..8a7c7b8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -15,7 +15,7 @@ typedef struct __smem_i smem_i; #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 -#define MEM_F_NO_MULTI 0x16 +#define MEM_F_NO_MULTI 0x10 typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. 
A gap of size k costs q+k*r diff --git a/main.c b/main.c index 5930a78..f7ec3d7 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r291-beta" +#define PACKAGE_VERSION "0.6.2-r292-beta" #endif static int usage() From 619ac4f93d6e0049aff19a22d39685260a03cc28 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 13:03:35 -0500 Subject: [PATCH 161/169] r293: bugfix - wrong RG type in SAM output --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7c837bf..e75a3c4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -584,7 +584,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } - if (bwa_rg_id[0]) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); free(cigar); diff --git a/main.c b/main.c index f7ec3d7..473bcd3 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r292-beta" +#define PACKAGE_VERSION "0.6.2-r293-beta" #endif static int usage() From 32f2d60a2e6406c3114a7bd7f6a11f7413dfcb0a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 13:14:33 -0500 Subject: [PATCH 162/169] r294: bugfix - -M not working --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index e75a3c4..8d8402b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -517,7 +517,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons kputs(s->name, str); kputc('\t', str); if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate int sam_flag = 
p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag - if (sam_flag&0x10000) sam_flag |= 0x100; + if (p->flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) diff --git a/main.c b/main.c index 473bcd3..80c9fb1 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r293-beta" +#define PACKAGE_VERSION "0.6.2-r294-beta" #endif static int usage() From 98787f0ae064241baeebf8cd394913a2b1cc2587 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 13:36:01 -0500 Subject: [PATCH 163/169] r295: generate NM --- bwa.c | 22 ++++++++++++++++++---- bwa.h | 2 +- bwamem.c | 5 +++-- fastmap.c | 3 ++- main.c | 2 +- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/bwa.c b/bwa.c index e8221ca..aef2ec8 100644 --- a/bwa.c +++ b/bwa.c @@ -70,13 +70,13 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) *****************/ // Generate CIGAR when the alignment end points are known -uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) { uint32_t *cigar = 0; uint8_t tmp, *rseq; int i, w; int64_t rlen; - *n_cigar = 0; + *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range @@ -95,6 +95,20 @@ 
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa w += abs(rlen - l_query); // NW alignment *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + {// compute NM + int k, x, y, n_mm = 0, n_gap = 0; + for (k = 0, x = y = 0; k < *n_cigar; ++k) { + int op = cigar[k]&0xf; + int len = cigar[k]>>4; + if (op == 0) { // match + for (i = 0; i < len; ++i) + if (query[x + i] != rseq[y + i]) ++n_mm; + x += len; y += len; + } else if (op == 1) x += len, n_gap += len; + else if (op == 2) y += len, n_gap += len; + } + *NM = n_mm + n_gap; + } if (rb >= l_pac) // reverse back query for (i = 0; i < l_query>>1; ++i) tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; @@ -126,10 +140,10 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, } } if (mid >= 0) { - int i, score, n_cigar, y; + int i, score, n_cigar, y, NM; uint32_t *cigar; int64_t x; - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar); + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { int op = cigar[i]&0xf, len = cigar[i]>>4; if (op == 0) { diff --git a/bwa.h b/bwa.h index 2d6c7bf..81d40e0 100644 --- a/bwa.h +++ b/bwa.h @@ -30,7 +30,7 @@ extern "C" { bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, 
int64_t *re); char *bwa_idx_infer_prefix(const char *hint); diff --git a/bwamem.c b/bwamem.c index 8d8402b..156e9b7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -496,7 +496,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) - int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0; + int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1; uint32_t *cigar = 0; int64_t pos; bwahit_t ptmp, *p = &ptmp; @@ -519,7 +519,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag if (p->flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? 
p->rb : p->re - 1, &is_rev); @@ -582,6 +582,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons str->s[str->l] = 0; } else kputc('*', str); } + if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } diff --git a/fastmap.c b/fastmap.c index b4f8ea8..81ce665 100644 --- a/fastmap.c +++ b/fastmap.c @@ -47,6 +47,7 @@ int main_mem(int argc, char *argv[]) if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 's') opt->split_width = atoi(optarg); } + if (opt->n_threads < 1) opt->n_threads = 1; if (optind + 1 >= argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); @@ -94,7 +95,7 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } } - while ((seqs = bseq_read(opt->chunk_size * (ko2? 2 : 1), &n, ks, ks2)) != 0) { + while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; if (!copy_comment) for (i = 0; i < n; ++i) { diff --git a/main.c b/main.c index 80c9fb1..a33830b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r294-beta" +#define PACKAGE_VERSION "0.6.2-r295-beta" #endif static int usage() From 54ab3bbec74658c65ac1be48c631c1b128ea0225 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 14:35:03 -0500 Subject: [PATCH 164/169] Dropped solid2fastq.pl SOLiD is not supported any more. 
--- solid2fastq.pl | 111 ------------------------------------------------- 1 file changed, 111 deletions(-) delete mode 100755 solid2fastq.pl diff --git a/solid2fastq.pl b/solid2fastq.pl deleted file mode 100755 index c60ad81..0000000 --- a/solid2fastq.pl +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/perl -w - -# Author: lh3 -# Note: Ideally, this script should be written in C. It is a bit slow at present. -# Also note that this script is different from the one contained in MAQ. - -use strict; -use warnings; -use Getopt::Std; - -my %opts; -my $version = '0.1.4'; -my $usage = qq{ -Usage: solid2fastq.pl - -Note: is the string showed in the `# Title:' line of a - ".csfasta" read file. Then F3.csfasta is read sequence - file and F3_QV.qual is the quality file. If - R3.csfasta is present, this script assumes reads are - paired; otherwise reads will be regarded as single-end. - - The read name will be :panel_x_y/[12] with `1' for R3 - tag and `2' for F3. Usually you may want to use short - to save diskspace. Long also causes troubles to maq. - -}; - -getopts('', \%opts); -die($usage) if (@ARGV != 2); -my ($title, $pre) = @ARGV; -my (@fhr, @fhw); -my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual'); -my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0; -if ($is_paired) { # paired end - for (0 .. 
3) { - my $fn = "$title$fn_suff[$_]"; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo - open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die; - open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; - my (@df, @dr); - @df = &read1(1); @dr = &read1(2); - while (@df && @dr) { - if ($df[0] eq $dr[0]) { # mate pair - print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; - @df = &read1(1); @dr = &read1(2); - } else { - if ($df[0] le $dr[0]) { - print {$fhw[2]} $df[1]; - @df = &read1(1); - } else { - print {$fhw[2]} $dr[1]; - @dr = &read1(2); - } - } - } - if (@df) { - print {$fhw[2]} $df[1]; - while (@df = &read1(1, $fhr[0], $fhr[1])) { - print {$fhw[2]} $df[1]; - } - } - if (@dr) { - print {$fhw[2]} $dr[1]; - while (@dr = &read1(2, $fhr[2], $fhr[3])) { - print {$fhw[2]} $dr[1]; - } - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[$_]) for (0 .. $#fhw); -} else { # single end - for (0 .. 1) { - my $fn = "$title$fn_suff[$_]"; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; - my @df; - while (@df = &read1(1, $fhr[0], $fhr[1])) { - print {$fhw[2]} $df[1]; - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[2]); -} - -sub read1 { - my $i = shift(@_); - my $j = ($i-1)<<1; - my ($key, $seq); - my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); - while (<$fhs>) { - my $t = <$fhq>; - if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { - $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines - die(qq/** unmatched read name: '$_' != '$_'\n/) unless ($_ eq $t); - my $name = "$pre:$1_$2_$3/$i"; - $_ = substr(<$fhs>, 2); - tr/0123./ACGTN/; - my $s = $_; - $_ = <$fhq>; - s/-1\b/0/eg; - s/^(\d+)\s*//; - s/(\d+)\s*/chr($1+33)/eg; - $seq = qq/\@$name\n$s+\n$_\n/; - last; - } - } - return defined($seq)? 
($key, $seq) : (); -} From acd1ab607b8048485e871df139294236f646e679 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 16:26:46 -0500 Subject: [PATCH 165/169] r297: reduce wasteful SW extension This is particularly important for long sequences --- bwamem.c | 20 ++++++++++++++++---- main.c | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 156e9b7..4471682 100644 --- a/bwamem.c +++ b/bwamem.c @@ -648,7 +648,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { - int i, j; + int i, j, k; mem_chain_v chn; mem_alnreg_v regs, tmp; for (i = 0; i < l_seq; ++i) @@ -658,9 +658,21 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * if (bwa_verbose >= 4) mem_print_chain(bns, &chn); kv_init(regs); kv_init(tmp); for (i = 0; i < chn.n; ++i) { - mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, &chn.a[i], &tmp); - for (j = 0; j < tmp.n; ++j) - kv_push(mem_alnreg_t, regs, tmp.a[j]); + mem_chain_t *p = &chn.a[i]; + for (j = 0; j < regs.n; ++j) { // check if all the seeds are contained in alnreg found previously + mem_alnreg_t *q = &regs.a[j]; + for (k = 0; k < p->n; ++k) { + mem_seed_t *s = &p->seeds[k]; + if (!(s->qbeg >= q->qb && s->qbeg + s->len <= q->qe && s->rbeg >= q->rb && s->rbeg + s->len <= q->re)) + break; // stop if seed is not contained + } + if (k == p->n) break; // if all seeds are contained, stop + } + if (j == regs.n) { + mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &tmp); + for (j = 0; j < tmp.n; ++j) + kv_push(mem_alnreg_t, regs, tmp.a[j]); + } free(chn.a[i].seeds); } free(chn.a); free(tmp.a); diff --git a/main.c b/main.c index a33830b..f566493 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r295-beta" +#define
PACKAGE_VERSION "0.6.2-r297-beta" #endif static int usage() From ee80fb8bd07451f0eba4d7dc9f76d507ed325a13 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 22:55:44 -0500 Subject: [PATCH 166/169] Test each seed to see if extension is needed The old version wastefully extends many seeds contained in an aligned region found before. While this wastes little time for short reads, it becomes a serious defect for long query sequences. This is an attempt to fix this problem, but more tuning is needed. --- bwamem.c | 83 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/bwamem.c b/bwamem.c index 4471682..8b2216e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -13,6 +13,7 @@ #include "ksw.h" #include "kvec.h" #include "ksort.h" +#include "utils.h" /* Theory on probability and scoring *ungapped* alignment * @@ -417,6 +418,21 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT * Construct the alignment from a chain * ****************************************/ +static const char LogTable256[256] = { +#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n + -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), + LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) +}; +#undef LT + +static inline int ilog2(uint32_t v) +{ + register uint32_t t, tt; + if ((tt = (v >> 16))) return (t = (tt >> 8)) ? 24 + LogTable256[t] : 16 + LogTable256[tt]; + return (t = (v >> 8)) ?
8 + LogTable256[t] : LogTable256[v]; +} + static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); @@ -429,8 +445,9 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; + uint64_t *srt; - av->n = 0; + if (c->n == 0) return; // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -446,11 +463,31 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); if (rlen != rmax[1] - rmax[0]) return; - for (k = 0; k < c->n;) { + srt = malloc(c->n * 8); + for (i = 0; i < c->n; ++i) + srt[i] = (uint64_t)c->seeds[i].len<<32 | i; + ks_introsort_64(c->n, srt); + + for (k = c->n - 1; k >= 0; --k) { mem_alnreg_t *a; + s = &c->seeds[(uint32_t)srt[k]]; + + for (i = 0; i < av->n; ++i) { // test whether extension has been made before + mem_alnreg_t *p = &av->a[i]; + int64_t rd; + int qd, w; + if (s->qbeg < p->qb || s->qbeg + s->len > p->qe || s->rbeg < p->rb || s->rbeg + s->len > p->re) continue; + qd = s->qbeg - p->qb; + rd = s->rbeg - p->rb; + w = ilog2(p->re - p->rb)<<1; // heuristic band width: small size for short hits + w = w < opt->w? 
w : opt->w; + if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit + } + if (i < av->n) continue; + a = kv_pushp(mem_alnreg_t, *av); - s = &c->seeds[k]; memset(a, 0, sizeof(mem_alnreg_t)); + if (s->qbeg) { // left extension uint8_t *rs, *qs; int qle, tle; @@ -464,7 +501,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int free(qs); free(rs); } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; - if (s->qbeg + s->len != l_query) { // right extension of the first seed + if (s->qbeg + s->len != l_query) { // right extension int qle, tle, qe, re; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; @@ -472,21 +509,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained a->seedcov += t->len; // this is not very accurate, but for approx. 
mapQ, this is good enough } - // jump to the next seed that: 1) has no >7bp overlap with the previous seed, or 2) is not fully contained in the alignment - for (i = k + 1; i < c->n; ++i) { - const mem_seed_t *t = &c->seeds[i]; - if ((t-1)->rbeg + (t-1)->len >= t->rbeg + 7 || (t-1)->qbeg + (t-1)->len >= t->qbeg + 7) break; - if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) break; - } - k = i; } - free(rseq); + free(srt); free(rseq); } /***************************** @@ -648,34 +679,24 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { - int i, j, k; + int i; mem_chain_v chn; - mem_alnreg_v regs, tmp; - for (i = 0; i < l_seq; ++i) + mem_alnreg_v regs; + + for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; + chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); - kv_init(regs); kv_init(tmp); + + kv_init(regs); for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; - for (j = 0; j < regs.n; ++j) { // check if all the seeds are contained in alnreg found previously - mem_alnreg_t *q = &regs.a[j]; - for (k = 0; k < p->n; ++k) { - mem_seed_t *s = &p->seeds[k]; - if (!(s->qbeg >= q->qb && s->qbeg + s->len <= q->qe && s->rbeg >= q->rb && s->rbeg + s->len <= q->re)) - break; // stop if seed is not contained - } - if (k == p->n) break; // if all seeds are contained, stop - } - if (j == regs.n) { - mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &tmp); - for (j = 0; j < tmp.n; ++j) - kv_push(mem_alnreg_t, regs, tmp.a[j]); - } + mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); free(chn.a[i].seeds); } - free(chn.a); free(tmp.a); + free(chn.a); regs.n = mem_sort_and_dedup(regs.n, regs.a); return regs; } From
0b533385efff5960506a7832e07196221c476c9f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 00:29:11 -0500 Subject: [PATCH 167/169] r299: better way to exclude seed --- bwamem.c | 35 +++++++++++++---------------------- main.c | 2 +- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/bwamem.c b/bwamem.c index 8b2216e..5c526f4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -418,25 +418,11 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT * Construct the alignment from a chain * ****************************************/ -static const char LogTable256[256] = { -#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n - -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), - LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) -}; -#undef LT - -static inline int ilog2(uint32_t v) -{ - register uint32_t t, tt; - if ((tt = (v >> 16))) return (t = (tt >> 8)) ? 24 + LogTable256[t] : 16 + LogTable256[tt]; - return (t = (v >> 8)) ? 8 + LogTable256[t] : LogTable256[v]; -} - static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); - return l > 1? l : 1; + l = l > 1? l : 1; + return l < opt->w<<1? l : opt->w<<1; } void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) @@ -475,13 +461,18 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < av->n; ++i) { // test whether extension has been made before mem_alnreg_t *p = &av->a[i]; int64_t rd; - int qd, w; - if (s->qbeg < p->qb || s->qbeg + s->len > p->qe || s->rbeg < p->rb || s->rbeg + s->len > p->re) continue; - qd = s->qbeg - p->qb; - rd = s->rbeg - p->rb; - w = ilog2(p->re - p->rb)<<1; // heuristic band width: small size for short hits - w = w < opt->w? 
w : opt->w; + int qd, w, max_gap; + if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained + // qd: distance ahead of the seed on query; rd: on reference + qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; + max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed + w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit + // similar to the previous four lines, but this time we look at the region behind + qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len); + max_gap = cal_max_gap(opt, qd < rd? qd : rd); + w = max_gap < opt->w? max_gap : opt->w; + if (qd - rd < w && rd - qd < w) break; } if (i < av->n) continue; diff --git a/main.c b/main.c index f566493..12fbf20 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r297-beta" +#define PACKAGE_VERSION "0.6.2-r299-beta" #endif static int usage() From 65e099df347d96a845f1161b665e51e352bef6a4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 00:37:17 -0500 Subject: [PATCH 168/169] r300: fixed an out-of-boundary bug in rare case --- bwamem.c | 6 ++++++ main.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 5c526f4..86b3e7a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -445,6 +445,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rmax[1] = rmax[1] > e? rmax[1] : e; if (t->len > max) max = t->len; } + rmax[0] = rmax[0] > 0? rmax[0] : 0; + rmax[1] = rmax[1] < l_pac<<1? 
rmax[1] : l_pac<<1; + if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side + if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac; + else rmax[0] = l_pac; + } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); if (rlen != rmax[1] - rmax[0]) return; diff --git a/main.c b/main.c index 12fbf20..636f818 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r299-beta" +#define PACKAGE_VERSION "0.6.2-r300-beta" #endif static int usage() From b621d3ae38a06484ec4931c944421012b53f775a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 00:42:19 -0500 Subject: [PATCH 169/169] r301: left-align indels Don't know why the change is working... --- ksw.c | 8 ++++---- main.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ksw.c b/ksw.c index 742fec9..4cbcb32 100644 --- a/ksw.c +++ b/ksw.c @@ -492,10 +492,10 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, uint8_t d; // direction p->h = h1; h += q[j]; - d = h > e? 0 : 1; - h = h > e? h : e; - d = h > f? d : 2; - h = h > f? h : f; + d = h >= e? 0 : 1; + h = h >= e? h : e; + d = h >= f? d : 2; + h = h >= f? h : f; h1 = h; h -= gapoe; e -= gape; diff --git a/main.c b/main.c index 636f818..7648310 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r300-beta" +#define PACKAGE_VERSION "0.6.2-r301-beta" #endif static int usage()