r823: mappy to index a single sequence

This commit is contained in:
Heng Li 2018-08-05 20:57:05 -04:00
parent 5ab6538757
commit b0f39a1a61
5 changed files with 37 additions and 10 deletions

View File

@ -372,7 +372,9 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha
uint64_t sum_len = 0;
mm128_v a = {0,0,0};
mm_idx_t *mi;
khash_t(str) *h;
int i, flag = 0;
if (n <= 0) return 0;
for (i = 0; i < n; ++i) // get the total length
sum_len += strlen(seq[i]);
@ -383,13 +385,17 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha
mi->n_seq = n;
mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, n, sizeof(mm_idx_seq_t)); // ->seq is allocated from km
mi->S = (uint32_t*)calloc((sum_len + 7) / 8, 4);
mi->h = h = kh_init(str);
for (i = 0, sum_len = 0; i < n; ++i) {
const char *s = seq[i];
mm_idx_seq_t *p = &mi->seq[i];
uint32_t j;
if (name && name[i]) {
int absent;
p->name = (char*)kmalloc(mi->km, strlen(name[i]) + 1);
strcpy(p->name, name[i]);
kh_put(str, h, p->name, &absent);
assert(absent);
}
p->offset = sum_len;
p->len = strlen(s);

2
main.c
View File

@ -10,7 +10,7 @@
#include "getopt.h"
#endif
#define MM_VERSION "2.11-r822-dirty"
#define MM_VERSION "2.11-r823-dirty"
#ifdef __linux__
#include <sys/resource.h>

View File

@ -137,4 +137,16 @@ static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int e
return s;
}
static mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int len)
{
const char *fake_name = "N/A";
char *s;
mm_idx_t *mi;
s = (char*)calloc(len + 1, 1);
memcpy(s, seq, len);
mi = mm_idx_str(w, k, is_hpc, bucket_bits, 1, (const char**)&s, (const char**)&fake_name);
free(s);
return mi;
}
#endif

View File

@ -110,6 +110,7 @@ cdef extern from "cmappy.h":
void mm_free_reg1(mm_reg1_t *r)
mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l)
mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int l)
ctypedef struct kstring_t:
unsigned l, m

View File

@ -112,7 +112,7 @@ cdef class Aligner:
cdef cmappy.mm_idxopt_t idx_opt
cdef cmappy.mm_mapopt_t map_opt
def __cinit__(self, fn_idx_in, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None):
def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None):
cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
if preset is not None:
cmappy.mm_set_opt(str.encode(preset), &self.idx_opt, &self.map_opt) # apply preset
@ -126,17 +126,24 @@ cdef class Aligner:
if bw is not None: self.map_opt.bw = bw
if best_n is not None: self.map_opt.best_n = best_n
if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
if extra_flags is not None: self.map_opt.flag |= extra_flags
cdef cmappy.mm_idx_reader_t *r;
if fn_idx_out is None:
r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL)
if seq is None:
if fn_idx_out is None:
r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL)
else:
r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, fn_idx_out)
if r is not NULL:
self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
cmappy.mm_idx_reader_close(r)
cmappy.mm_mapopt_update(&self.map_opt, self._idx)
cmappy.mm_idx_index_name(self._idx)
else:
r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, fn_idx_out)
if r is not NULL:
self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
cmappy.mm_idx_reader_close(r)
self._idx = cmappy.mappy_idx_seq(self.idx_opt.w, self.idx_opt.k, self.idx_opt.flag&1, self.idx_opt.bucket_bits, str.encode(seq), len(seq))
cmappy.mm_mapopt_update(&self.map_opt, self._idx)
cmappy.mm_idx_index_name(self._idx)
self.map_opt.mid_occ = 1000 # don't filter high-occ seeds
def __dealloc__(self):
if self._idx is not NULL:
@ -145,7 +152,7 @@ cdef class Aligner:
def __bool__(self):
return (self._idx != NULL)
def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None):
def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None):
cdef cmappy.mm_reg1_t *regs
cdef cmappy.mm_hitpy_t h
cdef ThreadBuffer b
@ -157,6 +164,7 @@ cdef class Aligner:
map_opt = self.map_opt
if max_frag_len is not None: map_opt.max_frag_len = max_frag_len
if extra_flags is not None: map_opt.flag |= extra_flags
if self._idx is NULL: return None
if buf is None: b = ThreadBuffer()