start to write CLI
This commit is contained in:
parent
a09db69037
commit
5dc398cdef
55
bwamem.c
55
bwamem.c
|
|
@ -29,6 +29,10 @@ mem_opt_t *mem_opt_init()
|
|||
o->max_chain_gap = 10000;
|
||||
o->mask_level = 0.50;
|
||||
o->chain_drop_ratio = 0.50;
|
||||
o->chunk_size = 10000000;
|
||||
o->n_threads = 1;
|
||||
o->pe_dir = 0<<1|1;
|
||||
o->is_pe = 0;
|
||||
mem_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
}
|
||||
|
|
@ -119,9 +123,9 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len)
|
|||
#include "kbtree.h"
|
||||
|
||||
#define chain_cmp(a, b) ((a).pos - (b).pos)
|
||||
KBTREE_INIT(chn, mem_chain1_t, chain_cmp)
|
||||
KBTREE_INIT(chn, mem_chain_t, chain_cmp)
|
||||
|
||||
static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_t *p)
|
||||
static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p)
|
||||
{
|
||||
int64_t qend, rend, x, y;
|
||||
const mem_seed_t *last = &c->seeds[c->n-1];
|
||||
|
|
@ -153,7 +157,7 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i
|
|||
int64_t k;
|
||||
if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
|
||||
for (k = 0; k < p->x[2]; ++k) {
|
||||
mem_chain1_t tmp, *lower, *upper;
|
||||
mem_chain_t tmp, *lower, *upper;
|
||||
mem_seed_t s;
|
||||
int to_add = 0;
|
||||
s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||
|
|
@ -174,24 +178,23 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i
|
|||
}
|
||||
}
|
||||
|
||||
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
|
||||
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
|
||||
{
|
||||
mem_chain_t chain;
|
||||
mem_chain_v chain;
|
||||
smem_i *itr;
|
||||
kbtree_t(chn) *tree;
|
||||
|
||||
memset(&chain, 0, sizeof(mem_chain_t));
|
||||
kv_init(chain);
|
||||
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
|
||||
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
||||
itr = smem_itr_init(bwt);
|
||||
smem_set_query(itr, len, seq);
|
||||
mem_insert_seed(opt, tree, itr);
|
||||
|
||||
chain.m = kb_size(tree); chain.n = 0;
|
||||
chain.chains = malloc(chain.m * sizeof(mem_chain1_t));
|
||||
kv_resize(mem_chain_t, chain, kb_size(tree));
|
||||
|
||||
#define traverse_func(p_) (chain.chains[chain.n++] = *(p_))
|
||||
__kb_traverse(mem_chain1_t, tree, traverse_func);
|
||||
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
|
||||
__kb_traverse(mem_chain_t, tree, traverse_func);
|
||||
#undef traverse_func
|
||||
|
||||
smem_itr_destroy(itr);
|
||||
|
|
@ -211,14 +214,14 @@ typedef struct {
|
|||
#define flt_lt(a, b) ((a).w > (b).w)
|
||||
KSORT_INIT(mem_flt, flt_aux_t, flt_lt)
|
||||
|
||||
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains)
|
||||
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
|
||||
{
|
||||
flt_aux_t *a;
|
||||
int i, j, n;
|
||||
if (n_chn <= 1) return n_chn; // no need to filter
|
||||
a = malloc(sizeof(flt_aux_t) * n_chn);
|
||||
for (i = 0; i < n_chn; ++i) {
|
||||
mem_chain1_t *c = &chains[i];
|
||||
mem_chain_t *c = &chains[i];
|
||||
int w = 0;
|
||||
for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps
|
||||
a[i].beg = c->seeds[0].qbeg;
|
||||
|
|
@ -227,13 +230,13 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains)
|
|||
}
|
||||
ks_introsort(mem_flt, n_chn, a);
|
||||
{ // reorder chains such that the best chain appears first
|
||||
mem_chain1_t *swap;
|
||||
swap = malloc(sizeof(mem_chain1_t) * n_chn);
|
||||
mem_chain_t *swap;
|
||||
swap = malloc(sizeof(mem_chain_t) * n_chn);
|
||||
for (i = 0; i < n_chn; ++i) {
|
||||
swap[i] = *((mem_chain1_t*)a[i].p);
|
||||
swap[i] = *((mem_chain_t*)a[i].p);
|
||||
a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
|
||||
}
|
||||
memcpy(chains, swap, sizeof(mem_chain1_t) * n_chn);
|
||||
memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
|
||||
free(swap);
|
||||
}
|
||||
for (i = 1, n = 1; i < n_chn; ++i) {
|
||||
|
|
@ -252,14 +255,14 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains)
|
|||
if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
|
||||
}
|
||||
for (i = 0; i < n; ++i) { // mark chains to be kept
|
||||
mem_chain1_t *c = (mem_chain1_t*)a[i].p;
|
||||
mem_chain_t *c = (mem_chain_t*)a[i].p;
|
||||
if (c->n > 0) c->n = -c->n;
|
||||
c = (mem_chain1_t*)a[i].p2;
|
||||
c = (mem_chain_t*)a[i].p2;
|
||||
if (c && c->n > 0) c->n = -c->n;
|
||||
}
|
||||
free(a);
|
||||
for (i = 0; i < n_chn; ++i) { // free discarded chains
|
||||
mem_chain1_t *c = &chains[i];
|
||||
mem_chain_t *c = &chains[i];
|
||||
if (c->n >= 0) {
|
||||
free(c->seeds);
|
||||
c->n = c->m = 0;
|
||||
|
|
@ -310,7 +313,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
|
|||
return l > 1? l : 1;
|
||||
}
|
||||
|
||||
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a)
|
||||
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a)
|
||||
{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds
|
||||
int i, j, qbeg;
|
||||
int64_t rlen, rbeg, rmax[2], tmp;
|
||||
|
|
@ -415,6 +418,12 @@ ret_gen_cigar:
|
|||
return cigar;
|
||||
}
|
||||
|
||||
/****************
|
||||
* Sequence I/O *
|
||||
****************/
|
||||
static void process_seq1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s)
|
||||
{
|
||||
}
|
||||
|
||||
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
|
||||
{
|
||||
int i;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
23
bwamem.h
23
bwamem.h
|
|
@ -2,6 +2,9 @@
|
|||
#define BWAMEM_H_
|
||||
|
||||
#include "bwt.h"
|
||||
#include "bntseq.h"
|
||||
#include "bseq.h"
|
||||
#include "kvec.h"
|
||||
|
||||
struct __smem_i;
|
||||
typedef struct __smem_i smem_i;
|
||||
|
|
@ -14,19 +17,16 @@ typedef struct {
|
|||
typedef struct {
|
||||
int a, b, q, r, w;
|
||||
int min_seed_len, max_occ, max_chain_gap;
|
||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||
int n_threads, chunk_size;
|
||||
int pe_dir, is_pe;
|
||||
float mask_level, chain_drop_ratio;
|
||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||
} mem_opt_t;
|
||||
|
||||
typedef struct {
|
||||
int n, m;
|
||||
int64_t pos;
|
||||
mem_seed_t *seeds;
|
||||
} mem_chain1_t;
|
||||
|
||||
typedef struct {
|
||||
int n, m;
|
||||
mem_chain1_t *chains;
|
||||
} mem_chain_t;
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -34,6 +34,9 @@ typedef struct {
|
|||
int score, qb, qe, sub;
|
||||
} mem_alnreg_t;
|
||||
|
||||
typedef kvec_t(mem_chain_t) mem_chain_v;
|
||||
typedef kvec_t(mem_alnreg_t) mem_alnreg_v;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
@ -46,11 +49,13 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len);
|
|||
mem_opt_t *mem_opt_init(void);
|
||||
void mem_fill_scmat(int a, int b, int8_t mat[25]);
|
||||
|
||||
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
|
||||
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains);
|
||||
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a);
|
||||
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
|
||||
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains);
|
||||
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a);
|
||||
uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar);
|
||||
|
||||
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
35
fastmap.c
35
fastmap.c
|
|
@ -6,6 +6,7 @@
|
|||
#include "bwt.h"
|
||||
#include "bwamem.h"
|
||||
#include "kvec.h"
|
||||
#include "bseq.h"
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
|
|
@ -16,10 +17,11 @@ int main_mem(int argc, char *argv[])
|
|||
mem_opt_t *opt;
|
||||
bwt_t *bwt;
|
||||
bntseq_t *bns;
|
||||
int i, j, c;
|
||||
gzFile fp;
|
||||
kseq_t *seq;
|
||||
int i, c, n;
|
||||
gzFile fp, fp2 = 0;
|
||||
kseq_t *ks, *ks2 = 0;
|
||||
uint8_t *pac = 0;
|
||||
bseq1_t *seqs;
|
||||
|
||||
opt = mem_opt_init();
|
||||
while ((c = getopt(argc, argv, "")) >= 0) {
|
||||
|
|
@ -32,8 +34,6 @@ int main_mem(int argc, char *argv[])
|
|||
return 1;
|
||||
}
|
||||
mem_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
fp = gzopen(argv[optind + 1], "r");
|
||||
seq = kseq_init(fp);
|
||||
{ // load the packed sequences, BWT and SA
|
||||
char *tmp = calloc(strlen(argv[optind]) + 5, 1);
|
||||
strcat(strcpy(tmp, argv[optind]), ".bwt");
|
||||
|
|
@ -45,6 +45,22 @@ int main_mem(int argc, char *argv[])
|
|||
pac = calloc(bns->l_pac/4+1, 1);
|
||||
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
|
||||
}
|
||||
|
||||
fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r");
|
||||
ks = kseq_init(fp);
|
||||
if (optind + 2 < argc) {
|
||||
fp2 = gzopen(argv[optind + 2], "r");
|
||||
ks2 = kseq_init(fp);
|
||||
opt->is_pe = 1;
|
||||
}
|
||||
while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) {
|
||||
mem_process_seqs(opt, bwt, bns, pac, n, seqs);
|
||||
for (i = 0; i < n; ++i) {
|
||||
free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual);
|
||||
}
|
||||
free(seqs);
|
||||
}
|
||||
/*
|
||||
while (kseq_read(seq) >= 0) {
|
||||
mem_chain_t chain;
|
||||
printf(">%s\n", seq->name.s);
|
||||
|
|
@ -71,12 +87,17 @@ int main_mem(int argc, char *argv[])
|
|||
for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds);
|
||||
free(chain.chains);
|
||||
}
|
||||
*/
|
||||
|
||||
free(pac); free(opt);
|
||||
free(opt); free(pac);
|
||||
bns_destroy(bns);
|
||||
bwt_destroy(bwt);
|
||||
kseq_destroy(seq);
|
||||
kseq_destroy(ks);
|
||||
gzclose(fp);
|
||||
if (ks2) {
|
||||
kseq_destroy(ks2);
|
||||
gzclose(fp2);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue