From 00e5302219a595c34ffee2b05a0c7e78883ac2a7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 16:39:50 -0500 Subject: [PATCH] routine to get subsequence from 2-bit pac --- bntseq.c | 27 +++++++++++++++++++++++++++ bntseq.h | 1 + bwamem.c | 48 ++++++++++++++++++++++++++++++++---------------- bwamem.h | 23 +++++++++++++++-------- fastmap.c | 10 +++++++--- 5 files changed, 82 insertions(+), 27 deletions(-) diff --git a/bntseq.c b/bntseq.c index adcd2d7..18abb2b 100644 --- a/bntseq.c +++ b/bntseq.c @@ -321,3 +321,30 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) } return nn; } + +static inline void get_seq_core(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, uint8_t *seq) +{ + int64_t k, l = 0; + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + seq[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + seq[l++] = _get_pac(pac, k); + } +} + +uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) +{ + uint8_t *seq; + if (end > l_pac<<1) end = l_pac<<1; + *len = end - beg; + seq = malloc(end - beg); + if (beg < l_pac && end > l_pac) { + get_seq_core(l_pac, pac, beg, l_pac, seq); + get_seq_core(l_pac, pac, l_pac, end, seq + (l_pac - beg)); + } else get_seq_core(l_pac, pac, beg, end, seq); + return seq; +} diff --git a/bntseq.h b/bntseq.h index 843db64..d4096b4 100644 --- a/bntseq.h +++ b/bntseq.h @@ -72,6 +72,7 @@ extern "C" { void bns_destroy(bntseq_t *bns); int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); + uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); #ifdef __cplusplus } diff --git a/bwamem.c b/bwamem.c index 62df4e5..ef09fd5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -3,11 +3,12 @@ #include #include "bwamem.h" #include "kvec.h" +#include "bntseq.h" -memopt_t *mem_opt_init() +mem_opt_t *mem_opt_init() { - memopt_t *o; - o = calloc(1, sizeof(memopt_t)); + mem_opt_t *o; + o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100; o->min_seed_len = 17; o->max_occ = 10; @@ -95,12 +96,12 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) #include "kbtree.h" #define chain_cmp(a, b) ((a).pos - (b).pos) -KBTREE_INIT(chn, memchain1_t, chain_cmp) +KBTREE_INIT(chn, mem_chain1_t, chain_cmp) -static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *p) +static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_t *p) { int64_t qend, rend, x, y; - const memseed_t *last = &c->seeds[c->n-1]; + const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) @@ -110,7 +111,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; - c->seeds = realloc(c->seeds, c->m * sizeof(memseed_t)); + c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); } c->seeds[c->n++] = *p; return 1; @@ -118,7 +119,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * return 0; // request to add a new chain } -static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) +static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM @@ -129,8 +130,8 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it int64_t k; if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive for (k = 0; k < p->x[2]; ++k) { - memchain1_t tmp, *lower, *upper; - memseed_t s; + mem_chain1_t tmp, *lower, *upper; + mem_seed_t s; int to_add = 0; s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; @@ -141,7 +142,7 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it } else to_add = 1; if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; - tmp.seeds = calloc(tmp.m, sizeof(memseed_t)); + tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds[0] = s; kb_putp(chn, tree, &tmp); } @@ -150,13 +151,13 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it } } -memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) { - memchain_t chain; + mem_chain_t chain; smem_i *itr; kbtree_t(chn) *tree; - memset(&chain, 0, sizeof(memchain_t)); + memset(&chain, 0, sizeof(mem_chain_t)); if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); @@ -164,13 +165,28 @@ memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8 mem_insert_seed(opt, tree, itr); chain.m = kb_size(tree); chain.n = 0; - chain.chains = malloc(chain.m * sizeof(memchain1_t)); + chain.chains = malloc(chain.m * sizeof(mem_chain1_t)); #define traverse_func(p_) (chain.chains[chain.n++] = *(p_)) - __kb_traverse(memchain1_t, tree, traverse_func); + __kb_traverse(mem_chain1_t, tree, traverse_func); #undef traverse_func smem_itr_destroy(itr); kb_destroy(chn, tree); return chain; } + +mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) +{ + mem_aln_t a; + int i, j; + int64_t len; + for (i = 0; i < c->n; ++i) { + mem_seed_t *s = &c->seeds[i]; + uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len); + for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n'); + for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n'); + free(seq); + } + return a; +} diff --git a/bwamem.h b/bwamem.h index eb79586..0ebd2eb 100644 --- a/bwamem.h +++ b/bwamem.h @@ -9,23 +9,29 @@ typedef struct __smem_i smem_i; typedef struct { int64_t rbeg; int32_t qbeg, len; -} memseed_t; +} mem_seed_t; typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; -} memopt_t; +} mem_opt_t; typedef struct { int n, m; int64_t pos; - memseed_t *seeds; -} memchain1_t; + mem_seed_t *seeds; +} mem_chain1_t; typedef struct { int n, m; - memchain1_t *chains; -} memchain_t; + mem_chain1_t *chains; +} mem_chain_t; + +typedef struct { + int64_t pos; + int n_cigar, len, score; + uint32_t *cigar; +} mem_aln_t; #ifdef __cplusplus extern "C" { @@ -36,9 +42,10 @@ void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); const bwtintv_v *smem_next(smem_i *itr, int split_len); -memopt_t *mem_opt_init(void); +mem_opt_t *mem_opt_init(void); -memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index 42122b4..d8a0eca 100644 --- a/fastmap.c +++ b/fastmap.c @@ -13,12 +13,13 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { - memopt_t *opt; + mem_opt_t *opt; bwt_t *bwt; bntseq_t *bns; int i, j, c; gzFile *fp; kseq_t *seq; + uint8_t *pac = 0; opt = mem_opt_init(); while ((c = getopt(argc, argv, "")) >= 0) { @@ -40,15 +41,18 @@ int main_mem(int argc, char *argv[]) bwt_restore_sa(tmp, bwt); free(tmp); bns = bns_restore(argv[optind]); + pac = calloc(bns->l_pac/4+1, 1); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); } while (kseq_read(seq) >= 0) { - memchain_t chain; + mem_chain_t chain; printf(">%s\n", seq->name.s); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); for (i = 0; i < chain.n; ++i) { - memchain1_t *p = &chain.chains[i]; + mem_chain1_t *p = &chain.chains[i]; + mem_chain2aln(bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p); printf("%d\t%d", i, p->n); for (j = 0; j < p->n; ++j) { bwtint_t pos;