routine to get subsequence from 2-bit pac
This commit is contained in:
parent
7ab4b3321f
commit
00e5302219
27
bntseq.c
27
bntseq.c
|
|
@ -321,3 +321,30 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
|
|||
}
|
||||
return nn;
|
||||
}
|
||||
|
||||
static inline void get_seq_core(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, uint8_t *seq)
|
||||
{
|
||||
int64_t k, l = 0;
|
||||
if (beg >= l_pac) { // reverse strand
|
||||
int64_t beg_f = (l_pac<<1) - 1 - end;
|
||||
int64_t end_f = (l_pac<<1) - 1 - beg;
|
||||
for (k = end_f; k > beg_f; --k)
|
||||
seq[l++] = 3 - _get_pac(pac, k);
|
||||
} else { // forward strand
|
||||
for (k = beg; k < end; ++k)
|
||||
seq[l++] = _get_pac(pac, k);
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
|
||||
{
|
||||
uint8_t *seq;
|
||||
if (end > l_pac<<1) end = l_pac<<1;
|
||||
*len = end - beg;
|
||||
seq = malloc(end - beg);
|
||||
if (beg < l_pac && end > l_pac) {
|
||||
get_seq_core(l_pac, pac, beg, l_pac, seq);
|
||||
get_seq_core(l_pac, pac, l_pac, end, seq + (l_pac - beg));
|
||||
} else get_seq_core(l_pac, pac, beg, end, seq);
|
||||
return seq;
|
||||
}
|
||||
|
|
|
|||
1
bntseq.h
1
bntseq.h
|
|
@ -72,6 +72,7 @@ extern "C" {
|
|||
void bns_destroy(bntseq_t *bns);
|
||||
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
|
||||
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
|
||||
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
48
bwamem.c
48
bwamem.c
|
|
@ -3,11 +3,12 @@
|
|||
#include <stdio.h>
|
||||
#include "bwamem.h"
|
||||
#include "kvec.h"
|
||||
#include "bntseq.h"
|
||||
|
||||
memopt_t *mem_opt_init()
|
||||
mem_opt_t *mem_opt_init()
|
||||
{
|
||||
memopt_t *o;
|
||||
o = calloc(1, sizeof(memopt_t));
|
||||
mem_opt_t *o;
|
||||
o = calloc(1, sizeof(mem_opt_t));
|
||||
o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100;
|
||||
o->min_seed_len = 17;
|
||||
o->max_occ = 10;
|
||||
|
|
@ -95,12 +96,12 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len)
|
|||
#include "kbtree.h"
|
||||
|
||||
#define chain_cmp(a, b) ((a).pos - (b).pos)
|
||||
KBTREE_INIT(chn, memchain1_t, chain_cmp)
|
||||
KBTREE_INIT(chn, mem_chain1_t, chain_cmp)
|
||||
|
||||
static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *p)
|
||||
static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_t *p)
|
||||
{
|
||||
int64_t qend, rend, x, y;
|
||||
const memseed_t *last = &c->seeds[c->n-1];
|
||||
const mem_seed_t *last = &c->seeds[c->n-1];
|
||||
qend = last->qbeg + last->len;
|
||||
rend = last->rbeg + last->len;
|
||||
if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
|
||||
|
|
@ -110,7 +111,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *
|
|||
if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain
|
||||
if (c->n == c->m) {
|
||||
c->m <<= 1;
|
||||
c->seeds = realloc(c->seeds, c->m * sizeof(memseed_t));
|
||||
c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t));
|
||||
}
|
||||
c->seeds[c->n++] = *p;
|
||||
return 1;
|
||||
|
|
@ -118,7 +119,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *
|
|||
return 0; // request to add a new chain
|
||||
}
|
||||
|
||||
static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
|
||||
static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
|
||||
{
|
||||
const bwtintv_v *a;
|
||||
while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM
|
||||
|
|
@ -129,8 +130,8 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it
|
|||
int64_t k;
|
||||
if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
|
||||
for (k = 0; k < p->x[2]; ++k) {
|
||||
memchain1_t tmp, *lower, *upper;
|
||||
memseed_t s;
|
||||
mem_chain1_t tmp, *lower, *upper;
|
||||
mem_seed_t s;
|
||||
int to_add = 0;
|
||||
s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||
s.qbeg = p->info>>32;
|
||||
|
|
@ -141,7 +142,7 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it
|
|||
} else to_add = 1;
|
||||
if (to_add) { // add the seed as a new chain
|
||||
tmp.n = 1; tmp.m = 4;
|
||||
tmp.seeds = calloc(tmp.m, sizeof(memseed_t));
|
||||
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
|
||||
tmp.seeds[0] = s;
|
||||
kb_putp(chn, tree, &tmp);
|
||||
}
|
||||
|
|
@ -150,13 +151,13 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it
|
|||
}
|
||||
}
|
||||
|
||||
memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
|
||||
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
|
||||
{
|
||||
memchain_t chain;
|
||||
mem_chain_t chain;
|
||||
smem_i *itr;
|
||||
kbtree_t(chn) *tree;
|
||||
|
||||
memset(&chain, 0, sizeof(memchain_t));
|
||||
memset(&chain, 0, sizeof(mem_chain_t));
|
||||
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
|
||||
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
||||
itr = smem_itr_init(bwt);
|
||||
|
|
@ -164,13 +165,28 @@ memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8
|
|||
mem_insert_seed(opt, tree, itr);
|
||||
|
||||
chain.m = kb_size(tree); chain.n = 0;
|
||||
chain.chains = malloc(chain.m * sizeof(memchain1_t));
|
||||
chain.chains = malloc(chain.m * sizeof(mem_chain1_t));
|
||||
|
||||
#define traverse_func(p_) (chain.chains[chain.n++] = *(p_))
|
||||
__kb_traverse(memchain1_t, tree, traverse_func);
|
||||
__kb_traverse(mem_chain1_t, tree, traverse_func);
|
||||
#undef traverse_func
|
||||
|
||||
smem_itr_destroy(itr);
|
||||
kb_destroy(chn, tree);
|
||||
return chain;
|
||||
}
|
||||
|
||||
mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c)
|
||||
{
|
||||
mem_aln_t a;
|
||||
int i, j;
|
||||
int64_t len;
|
||||
for (i = 0; i < c->n; ++i) {
|
||||
mem_seed_t *s = &c->seeds[i];
|
||||
uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len);
|
||||
for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n');
|
||||
for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n');
|
||||
free(seq);
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
|
|
|||
23
bwamem.h
23
bwamem.h
|
|
@ -9,23 +9,29 @@ typedef struct __smem_i smem_i;
|
|||
typedef struct {
|
||||
int64_t rbeg;
|
||||
int32_t qbeg, len;
|
||||
} memseed_t;
|
||||
} mem_seed_t;
|
||||
|
||||
typedef struct {
|
||||
int a, b, q, r, w;
|
||||
int min_seed_len, max_occ, max_chain_gap;
|
||||
} memopt_t;
|
||||
} mem_opt_t;
|
||||
|
||||
typedef struct {
|
||||
int n, m;
|
||||
int64_t pos;
|
||||
memseed_t *seeds;
|
||||
} memchain1_t;
|
||||
mem_seed_t *seeds;
|
||||
} mem_chain1_t;
|
||||
|
||||
typedef struct {
|
||||
int n, m;
|
||||
memchain1_t *chains;
|
||||
} memchain_t;
|
||||
mem_chain1_t *chains;
|
||||
} mem_chain_t;
|
||||
|
||||
typedef struct {
|
||||
int64_t pos;
|
||||
int n_cigar, len, score;
|
||||
uint32_t *cigar;
|
||||
} mem_aln_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
@ -36,9 +42,10 @@ void smem_itr_destroy(smem_i *itr);
|
|||
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
|
||||
const bwtintv_v *smem_next(smem_i *itr, int split_len);
|
||||
|
||||
memopt_t *mem_opt_init(void);
|
||||
mem_opt_t *mem_opt_init(void);
|
||||
|
||||
memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
|
||||
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
|
||||
mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
10
fastmap.c
10
fastmap.c
|
|
@ -13,12 +13,13 @@ extern unsigned char nst_nt4_table[256];
|
|||
|
||||
int main_mem(int argc, char *argv[])
|
||||
{
|
||||
memopt_t *opt;
|
||||
mem_opt_t *opt;
|
||||
bwt_t *bwt;
|
||||
bntseq_t *bns;
|
||||
int i, j, c;
|
||||
gzFile *fp;
|
||||
kseq_t *seq;
|
||||
uint8_t *pac = 0;
|
||||
|
||||
opt = mem_opt_init();
|
||||
while ((c = getopt(argc, argv, "")) >= 0) {
|
||||
|
|
@ -40,15 +41,18 @@ int main_mem(int argc, char *argv[])
|
|||
bwt_restore_sa(tmp, bwt);
|
||||
free(tmp);
|
||||
bns = bns_restore(argv[optind]);
|
||||
pac = calloc(bns->l_pac/4+1, 1);
|
||||
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
|
||||
}
|
||||
while (kseq_read(seq) >= 0) {
|
||||
memchain_t chain;
|
||||
mem_chain_t chain;
|
||||
printf(">%s\n", seq->name.s);
|
||||
for (i = 0; i < seq->seq.l; ++i)
|
||||
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
||||
chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s);
|
||||
for (i = 0; i < chain.n; ++i) {
|
||||
memchain1_t *p = &chain.chains[i];
|
||||
mem_chain1_t *p = &chain.chains[i];
|
||||
mem_chain2aln(bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p);
|
||||
printf("%d\t%d", i, p->n);
|
||||
for (j = 0; j < p->n; ++j) {
|
||||
bwtint_t pos;
|
||||
|
|
|
|||
Loading…
Reference in New Issue