start to write CLI

This commit is contained in:
Heng Li 2013-02-07 13:13:43 -05:00
parent a09db69037
commit 5dc398cdef
3 changed files with 74 additions and 39 deletions

View File

@ -29,6 +29,10 @@ mem_opt_t *mem_opt_init()
o->max_chain_gap = 10000; o->max_chain_gap = 10000;
o->mask_level = 0.50; o->mask_level = 0.50;
o->chain_drop_ratio = 0.50; o->chain_drop_ratio = 0.50;
o->chunk_size = 10000000;
o->n_threads = 1;
o->pe_dir = 0<<1|1;
o->is_pe = 0;
mem_fill_scmat(o->a, o->b, o->mat); mem_fill_scmat(o->a, o->b, o->mat);
return o; return o;
} }
@ -119,9 +123,9 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len)
#include "kbtree.h" #include "kbtree.h"
#define chain_cmp(a, b) ((a).pos - (b).pos) #define chain_cmp(a, b) ((a).pos - (b).pos)
KBTREE_INIT(chn, mem_chain1_t, chain_cmp) KBTREE_INIT(chn, mem_chain_t, chain_cmp)
static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_t *p) static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p)
{ {
int64_t qend, rend, x, y; int64_t qend, rend, x, y;
const mem_seed_t *last = &c->seeds[c->n-1]; const mem_seed_t *last = &c->seeds[c->n-1];
@ -153,7 +157,7 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i
int64_t k; int64_t k;
if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
for (k = 0; k < p->x[2]; ++k) { for (k = 0; k < p->x[2]; ++k) {
mem_chain1_t tmp, *lower, *upper; mem_chain_t tmp, *lower, *upper;
mem_seed_t s; mem_seed_t s;
int to_add = 0; int to_add = 0;
s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
@ -174,24 +178,23 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i
} }
} }
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
{ {
mem_chain_t chain; mem_chain_v chain;
smem_i *itr; smem_i *itr;
kbtree_t(chn) *tree; kbtree_t(chn) *tree;
memset(&chain, 0, sizeof(mem_chain_t)); kv_init(chain);
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
tree = kb_init(chn, KB_DEFAULT_SIZE); tree = kb_init(chn, KB_DEFAULT_SIZE);
itr = smem_itr_init(bwt); itr = smem_itr_init(bwt);
smem_set_query(itr, len, seq); smem_set_query(itr, len, seq);
mem_insert_seed(opt, tree, itr); mem_insert_seed(opt, tree, itr);
chain.m = kb_size(tree); chain.n = 0; kv_resize(mem_chain_t, chain, kb_size(tree));
chain.chains = malloc(chain.m * sizeof(mem_chain1_t));
#define traverse_func(p_) (chain.chains[chain.n++] = *(p_)) #define traverse_func(p_) (chain.a[chain.n++] = *(p_))
__kb_traverse(mem_chain1_t, tree, traverse_func); __kb_traverse(mem_chain_t, tree, traverse_func);
#undef traverse_func #undef traverse_func
smem_itr_destroy(itr); smem_itr_destroy(itr);
@ -211,14 +214,14 @@ typedef struct {
#define flt_lt(a, b) ((a).w > (b).w) #define flt_lt(a, b) ((a).w > (b).w)
KSORT_INIT(mem_flt, flt_aux_t, flt_lt) KSORT_INIT(mem_flt, flt_aux_t, flt_lt)
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
{ {
flt_aux_t *a; flt_aux_t *a;
int i, j, n; int i, j, n;
if (n_chn <= 1) return n_chn; // no need to filter if (n_chn <= 1) return n_chn; // no need to filter
a = malloc(sizeof(flt_aux_t) * n_chn); a = malloc(sizeof(flt_aux_t) * n_chn);
for (i = 0; i < n_chn; ++i) { for (i = 0; i < n_chn; ++i) {
mem_chain1_t *c = &chains[i]; mem_chain_t *c = &chains[i];
int w = 0; int w = 0;
for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps
a[i].beg = c->seeds[0].qbeg; a[i].beg = c->seeds[0].qbeg;
@ -227,13 +230,13 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains)
} }
ks_introsort(mem_flt, n_chn, a); ks_introsort(mem_flt, n_chn, a);
{ // reorder chains such that the best chain appears first { // reorder chains such that the best chain appears first
mem_chain1_t *swap; mem_chain_t *swap;
swap = malloc(sizeof(mem_chain1_t) * n_chn); swap = malloc(sizeof(mem_chain_t) * n_chn);
for (i = 0; i < n_chn; ++i) { for (i = 0; i < n_chn; ++i) {
swap[i] = *((mem_chain1_t*)a[i].p); swap[i] = *((mem_chain_t*)a[i].p);
a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
} }
memcpy(chains, swap, sizeof(mem_chain1_t) * n_chn); memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
free(swap); free(swap);
} }
for (i = 1, n = 1; i < n_chn; ++i) { for (i = 1, n = 1; i < n_chn; ++i) {
@ -252,14 +255,14 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains)
if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
} }
for (i = 0; i < n; ++i) { // mark chains to be kept for (i = 0; i < n; ++i) { // mark chains to be kept
mem_chain1_t *c = (mem_chain1_t*)a[i].p; mem_chain_t *c = (mem_chain_t*)a[i].p;
if (c->n > 0) c->n = -c->n; if (c->n > 0) c->n = -c->n;
c = (mem_chain1_t*)a[i].p2; c = (mem_chain_t*)a[i].p2;
if (c && c->n > 0) c->n = -c->n; if (c && c->n > 0) c->n = -c->n;
} }
free(a); free(a);
for (i = 0; i < n_chn; ++i) { // free discarded chains for (i = 0; i < n_chn; ++i) { // free discarded chains
mem_chain1_t *c = &chains[i]; mem_chain_t *c = &chains[i];
if (c->n >= 0) { if (c->n >= 0) {
free(c->seeds); free(c->seeds);
c->n = c->m = 0; c->n = c->m = 0;
@ -310,7 +313,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
return l > 1? l : 1; return l > 1? l : 1;
} }
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a)
{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds
int i, j, qbeg; int i, j, qbeg;
int64_t rlen, rbeg, rmax[2], tmp; int64_t rlen, rbeg, rmax[2], tmp;
@ -415,6 +418,12 @@ ret_gen_cigar:
return cigar; return cigar;
} }
/**************** static void process_seq1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s)
* Sequence I/O * {
****************/ }
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
{
int i;
return 0;
}

View File

@ -2,6 +2,9 @@
#define BWAMEM_H_ #define BWAMEM_H_
#include "bwt.h" #include "bwt.h"
#include "bntseq.h"
#include "bseq.h"
#include "kvec.h"
struct __smem_i; struct __smem_i;
typedef struct __smem_i smem_i; typedef struct __smem_i smem_i;
@ -14,19 +17,16 @@ typedef struct {
typedef struct { typedef struct {
int a, b, q, r, w; int a, b, q, r, w;
int min_seed_len, max_occ, max_chain_gap; int min_seed_len, max_occ, max_chain_gap;
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset int n_threads, chunk_size;
int pe_dir, is_pe;
float mask_level, chain_drop_ratio; float mask_level, chain_drop_ratio;
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
} mem_opt_t; } mem_opt_t;
typedef struct { typedef struct {
int n, m; int n, m;
int64_t pos; int64_t pos;
mem_seed_t *seeds; mem_seed_t *seeds;
} mem_chain1_t;
typedef struct {
int n, m;
mem_chain1_t *chains;
} mem_chain_t; } mem_chain_t;
typedef struct { typedef struct {
@ -34,6 +34,9 @@ typedef struct {
int score, qb, qe, sub; int score, qb, qe, sub;
} mem_alnreg_t; } mem_alnreg_t;
typedef kvec_t(mem_chain_t) mem_chain_v;
typedef kvec_t(mem_alnreg_t) mem_alnreg_v;
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -46,11 +49,13 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len);
mem_opt_t *mem_opt_init(void); mem_opt_t *mem_opt_init(void);
void mem_fill_scmat(int a, int b, int8_t mat[25]); void mem_fill_scmat(int a, int b, int8_t mat[25]);
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains); int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains);
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a); void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a);
uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar);
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -6,6 +6,7 @@
#include "bwt.h" #include "bwt.h"
#include "bwamem.h" #include "bwamem.h"
#include "kvec.h" #include "kvec.h"
#include "bseq.h"
#include "kseq.h" #include "kseq.h"
KSEQ_DECLARE(gzFile) KSEQ_DECLARE(gzFile)
@ -16,10 +17,11 @@ int main_mem(int argc, char *argv[])
mem_opt_t *opt; mem_opt_t *opt;
bwt_t *bwt; bwt_t *bwt;
bntseq_t *bns; bntseq_t *bns;
int i, j, c; int i, c, n;
gzFile fp; gzFile fp, fp2 = 0;
kseq_t *seq; kseq_t *ks, *ks2 = 0;
uint8_t *pac = 0; uint8_t *pac = 0;
bseq1_t *seqs;
opt = mem_opt_init(); opt = mem_opt_init();
while ((c = getopt(argc, argv, "")) >= 0) { while ((c = getopt(argc, argv, "")) >= 0) {
@ -32,8 +34,6 @@ int main_mem(int argc, char *argv[])
return 1; return 1;
} }
mem_fill_scmat(opt->a, opt->b, opt->mat); mem_fill_scmat(opt->a, opt->b, opt->mat);
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA { // load the packed sequences, BWT and SA
char *tmp = calloc(strlen(argv[optind]) + 5, 1); char *tmp = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt"); strcat(strcpy(tmp, argv[optind]), ".bwt");
@ -45,6 +45,22 @@ int main_mem(int argc, char *argv[])
pac = calloc(bns->l_pac/4+1, 1); pac = calloc(bns->l_pac/4+1, 1);
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
} }
fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r");
ks = kseq_init(fp);
if (optind + 2 < argc) {
fp2 = gzopen(argv[optind + 2], "r");
ks2 = kseq_init(fp);
opt->is_pe = 1;
}
while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) {
mem_process_seqs(opt, bwt, bns, pac, n, seqs);
for (i = 0; i < n; ++i) {
free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual);
}
free(seqs);
}
/*
while (kseq_read(seq) >= 0) { while (kseq_read(seq) >= 0) {
mem_chain_t chain; mem_chain_t chain;
printf(">%s\n", seq->name.s); printf(">%s\n", seq->name.s);
@ -71,12 +87,17 @@ int main_mem(int argc, char *argv[])
for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds);
free(chain.chains); free(chain.chains);
} }
*/
free(pac); free(opt); free(opt); free(pac);
bns_destroy(bns); bns_destroy(bns);
bwt_destroy(bwt); bwt_destroy(bwt);
kseq_destroy(seq); kseq_destroy(ks);
gzclose(fp); gzclose(fp);
if (ks2) {
kseq_destroy(ks2);
gzclose(fp2);
}
return 0; return 0;
} }