start to write CLI

This commit is contained in:
Heng Li 2013-02-07 13:13:43 -05:00
parent a09db69037
commit 5dc398cdef
3 changed files with 74 additions and 39 deletions

View File

@ -29,6 +29,10 @@ mem_opt_t *mem_opt_init()
o->max_chain_gap = 10000;
o->mask_level = 0.50;
o->chain_drop_ratio = 0.50;
o->chunk_size = 10000000;
o->n_threads = 1;
o->pe_dir = 0<<1|1;
o->is_pe = 0;
mem_fill_scmat(o->a, o->b, o->mat);
return o;
}
@ -119,9 +123,9 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len)
#include "kbtree.h"
#define chain_cmp(a, b) ((a).pos - (b).pos)
KBTREE_INIT(chn, mem_chain1_t, chain_cmp)
KBTREE_INIT(chn, mem_chain_t, chain_cmp)
static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_t *p)
static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p)
{
int64_t qend, rend, x, y;
const mem_seed_t *last = &c->seeds[c->n-1];
@ -153,7 +157,7 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i
int64_t k;
if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
for (k = 0; k < p->x[2]; ++k) {
mem_chain1_t tmp, *lower, *upper;
mem_chain_t tmp, *lower, *upper;
mem_seed_t s;
int to_add = 0;
s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
@ -174,24 +178,23 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i
}
}
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
{
mem_chain_t chain;
mem_chain_v chain;
smem_i *itr;
kbtree_t(chn) *tree;
memset(&chain, 0, sizeof(mem_chain_t));
kv_init(chain);
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
tree = kb_init(chn, KB_DEFAULT_SIZE);
itr = smem_itr_init(bwt);
smem_set_query(itr, len, seq);
mem_insert_seed(opt, tree, itr);
chain.m = kb_size(tree); chain.n = 0;
chain.chains = malloc(chain.m * sizeof(mem_chain1_t));
kv_resize(mem_chain_t, chain, kb_size(tree));
#define traverse_func(p_) (chain.chains[chain.n++] = *(p_))
__kb_traverse(mem_chain1_t, tree, traverse_func);
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
__kb_traverse(mem_chain_t, tree, traverse_func);
#undef traverse_func
smem_itr_destroy(itr);
@ -211,14 +214,14 @@ typedef struct {
#define flt_lt(a, b) ((a).w > (b).w)
KSORT_INIT(mem_flt, flt_aux_t, flt_lt)
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains)
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
{
flt_aux_t *a;
int i, j, n;
if (n_chn <= 1) return n_chn; // no need to filter
a = malloc(sizeof(flt_aux_t) * n_chn);
for (i = 0; i < n_chn; ++i) {
mem_chain1_t *c = &chains[i];
mem_chain_t *c = &chains[i];
int w = 0;
for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps
a[i].beg = c->seeds[0].qbeg;
@ -227,13 +230,13 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains)
}
ks_introsort(mem_flt, n_chn, a);
{ // reorder chains such that the best chain appears first
mem_chain1_t *swap;
swap = malloc(sizeof(mem_chain1_t) * n_chn);
mem_chain_t *swap;
swap = malloc(sizeof(mem_chain_t) * n_chn);
for (i = 0; i < n_chn; ++i) {
swap[i] = *((mem_chain1_t*)a[i].p);
swap[i] = *((mem_chain_t*)a[i].p);
a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
}
memcpy(chains, swap, sizeof(mem_chain1_t) * n_chn);
memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
free(swap);
}
for (i = 1, n = 1; i < n_chn; ++i) {
@ -252,14 +255,14 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains)
if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
}
for (i = 0; i < n; ++i) { // mark chains to be kept
mem_chain1_t *c = (mem_chain1_t*)a[i].p;
mem_chain_t *c = (mem_chain_t*)a[i].p;
if (c->n > 0) c->n = -c->n;
c = (mem_chain1_t*)a[i].p2;
c = (mem_chain_t*)a[i].p2;
if (c && c->n > 0) c->n = -c->n;
}
free(a);
for (i = 0; i < n_chn; ++i) { // free discarded chains
mem_chain1_t *c = &chains[i];
mem_chain_t *c = &chains[i];
if (c->n >= 0) {
free(c->seeds);
c->n = c->m = 0;
@ -310,7 +313,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
return l > 1? l : 1;
}
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a)
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a)
{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds
int i, j, qbeg;
int64_t rlen, rbeg, rmax[2], tmp;
@ -415,6 +418,12 @@ ret_gen_cigar:
return cigar;
}
/****************
* Sequence I/O *
****************/
static void process_seq1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s)
{
}
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
{
int i;
return 0;
}

View File

@ -2,6 +2,9 @@
#define BWAMEM_H_
#include "bwt.h"
#include "bntseq.h"
#include "bseq.h"
#include "kvec.h"
struct __smem_i;
typedef struct __smem_i smem_i;
@ -14,19 +17,16 @@ typedef struct {
typedef struct {
int a, b, q, r, w;
int min_seed_len, max_occ, max_chain_gap;
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
int n_threads, chunk_size;
int pe_dir, is_pe;
float mask_level, chain_drop_ratio;
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
} mem_opt_t;
typedef struct {
int n, m;
int64_t pos;
mem_seed_t *seeds;
} mem_chain1_t;
typedef struct {
int n, m;
mem_chain1_t *chains;
} mem_chain_t;
typedef struct {
@ -34,6 +34,9 @@ typedef struct {
int score, qb, qe, sub;
} mem_alnreg_t;
typedef kvec_t(mem_chain_t) mem_chain_v;
typedef kvec_t(mem_alnreg_t) mem_alnreg_v;
#ifdef __cplusplus
extern "C" {
#endif
@ -46,11 +49,13 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len);
mem_opt_t *mem_opt_init(void);
void mem_fill_scmat(int a, int b, int8_t mat[25]);
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains);
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a);
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains);
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a);
uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar);
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
#ifdef __cplusplus
}
#endif

View File

@ -6,6 +6,7 @@
#include "bwt.h"
#include "bwamem.h"
#include "kvec.h"
#include "bseq.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
@ -16,10 +17,11 @@ int main_mem(int argc, char *argv[])
mem_opt_t *opt;
bwt_t *bwt;
bntseq_t *bns;
int i, j, c;
gzFile fp;
kseq_t *seq;
int i, c, n;
gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0;
uint8_t *pac = 0;
bseq1_t *seqs;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "")) >= 0) {
@ -32,8 +34,6 @@ int main_mem(int argc, char *argv[])
return 1;
}
mem_fill_scmat(opt->a, opt->b, opt->mat);
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA
char *tmp = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt");
@ -45,6 +45,22 @@ int main_mem(int argc, char *argv[])
pac = calloc(bns->l_pac/4+1, 1);
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
}
fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r");
ks = kseq_init(fp);
if (optind + 2 < argc) {
fp2 = gzopen(argv[optind + 2], "r");
ks2 = kseq_init(fp);
opt->is_pe = 1;
}
while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) {
mem_process_seqs(opt, bwt, bns, pac, n, seqs);
for (i = 0; i < n; ++i) {
free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual);
}
free(seqs);
}
/*
while (kseq_read(seq) >= 0) {
mem_chain_t chain;
printf(">%s\n", seq->name.s);
@ -71,12 +87,17 @@ int main_mem(int argc, char *argv[])
for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds);
free(chain.chains);
}
*/
free(pac); free(opt);
free(opt); free(pac);
bns_destroy(bns);
bwt_destroy(bwt);
kseq_destroy(seq);
kseq_destroy(ks);
gzclose(fp);
if (ks2) {
kseq_destroy(ks2);
gzclose(fp2);
}
return 0;
}