r718: retrieve sequence from the index
This commit is contained in:
parent
29ed675ee5
commit
24a4808826
40
index.c
40
index.c
|
|
@ -20,6 +20,8 @@
|
|||
KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
|
||||
typedef khash_t(idx) idxhash_t;
|
||||
|
||||
KHASH_MAP_INIT_STR(str, uint32_t)
|
||||
|
||||
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
|
||||
|
||||
typedef struct mm_idx_bucket_s {
|
||||
|
|
@ -29,15 +31,6 @@ typedef struct mm_idx_bucket_s {
|
|||
void *h; // hash table indexing _p_ and minimizers appearing once
|
||||
} mm_idx_bucket_t;
|
||||
|
||||
void mm_idxopt_init(mm_idxopt_t *opt)
|
||||
{
|
||||
memset(opt, 0, sizeof(mm_idxopt_t));
|
||||
opt->k = 15, opt->w = 10, opt->flag = 0;
|
||||
opt->bucket_bits = 14;
|
||||
opt->mini_batch_size = 50000000;
|
||||
opt->batch_size = 4000000000ULL;
|
||||
}
|
||||
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
|
||||
{
|
||||
mm_idx_t *mi;
|
||||
|
|
@ -54,6 +47,7 @@ void mm_idx_destroy(mm_idx_t *mi)
|
|||
{
|
||||
int i;
|
||||
if (mi == 0) return;
|
||||
if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
|
||||
for (i = 0; i < 1<<mi->b; ++i) {
|
||||
free(mi->B[i].p);
|
||||
free(mi->B[i].a.a);
|
||||
|
|
@ -109,6 +103,34 @@ void mm_idx_stat(const mm_idx_t *mi)
|
|||
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum);
|
||||
}
|
||||
|
||||
int mm_idx_index_name(mm_idx_t *mi)
|
||||
{
|
||||
khash_t(str) *h;
|
||||
uint32_t i;
|
||||
int has_dup = 0, absent;
|
||||
if (mi->h) return 0;
|
||||
h = kh_init(str);
|
||||
for (i = 0; i < mi->n_seq; ++i) {
|
||||
khint_t k;
|
||||
k = kh_put(str, h, mi->seq[i].name, &absent);
|
||||
if (absent) kh_val(h, k) = i;
|
||||
else has_dup = 1;
|
||||
}
|
||||
mi->h = h;
|
||||
if (has_dup && mm_verbose >= 2)
|
||||
fprintf(stderr, "[WARNING] some database sequences have identical sequence names\n");
|
||||
return has_dup;
|
||||
}
|
||||
|
||||
int mm_idx_name2id(const mm_idx_t *mi, const char *name)
|
||||
{
|
||||
khash_t(str) *h = (khash_t(str)*)mi->h;
|
||||
khint_t k;
|
||||
if (h == 0) return -2;
|
||||
k = kh_get(str, h, name);
|
||||
return k == kh_end(h)? -1 : kh_val(h, k);
|
||||
}
|
||||
|
||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
|
||||
{
|
||||
uint64_t i, st1, en1;
|
||||
|
|
|
|||
2
main.c
2
main.c
|
|
@ -6,7 +6,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "getopt.h"
|
||||
|
||||
#define MM_VERSION "2.8-r711-dirty"
|
||||
#define MM_VERSION "2.8-r718-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ typedef struct {
|
|||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
uint32_t *S; // 4-bit packed sequence
|
||||
struct mm_idx_bucket_s *B; // index (hidden)
|
||||
void *km;
|
||||
void *km, *h;
|
||||
} mm_idx_t;
|
||||
|
||||
// minimap2 alignment
|
||||
|
|
@ -297,6 +297,11 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
|
|||
|
||||
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
|
||||
|
||||
// query sequence name and sequence in the minimap2 index
|
||||
int mm_idx_index_name(mm_idx_t *mi);
|
||||
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
|
||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
||||
|
||||
// deprecated APIs for backward compatibility
|
||||
void mm_mapopt_init(mm_mapopt_t *opt);
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
|
||||
|
|
|
|||
1
mmpriv.h
1
mmpriv.h
|
|
@ -63,7 +63,6 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
|||
|
||||
void mm_idxopt_init(mm_idxopt_t *opt);
|
||||
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
|
||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
||||
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
|
||||
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
||||
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
||||
|
|
|
|||
|
|
@ -1,6 +1,15 @@
|
|||
#include <stdio.h>
|
||||
#include "mmpriv.h"
|
||||
|
||||
void mm_idxopt_init(mm_idxopt_t *opt)
|
||||
{
|
||||
memset(opt, 0, sizeof(mm_idxopt_t));
|
||||
opt->k = 15, opt->w = 10, opt->flag = 0;
|
||||
opt->bucket_bits = 14;
|
||||
opt->mini_batch_size = 50000000;
|
||||
opt->batch_size = 4000000000ULL;
|
||||
}
|
||||
|
||||
void mm_mapopt_init(mm_mapopt_t *opt)
|
||||
{
|
||||
memset(opt, 0, sizeof(mm_mapopt_t));
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@ The following Python script demonstrates the key functionality of mappy:
|
|||
import mappy as mp
|
||||
a = mp.Aligner("test/MT-human.fa") # load or build index
|
||||
if not a: raise Exception("ERROR: failed to load/build index")
|
||||
s = a.seq("MT_human", 100, 200) # retrieve a subsequence from the index
|
||||
print(mp.revcomp(s)) # reverse complement
|
||||
for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
|
||||
for hit in a.map(seq): # traverse alignments
|
||||
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
|
||||
|
|
@ -87,7 +89,15 @@ This method aligns :code:`seq` against the index. It is a generator, *yielding*
|
|||
a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
|
||||
performs paired-end alignment, assuming the two ends are in the FR orientation.
|
||||
Alignments of the two ends can be distinguished by the :code:`read_num` field
|
||||
(see below).
|
||||
(see Class mappy.Alignment below).
|
||||
|
||||
.. code:: python
|
||||
|
||||
mappy.Aligner.seq(name, start=0, end=0x7fffffff)
|
||||
|
||||
This method retrieves a (sub)sequence from the index and returns it as a Python
|
||||
string. :code:`None` is returned if :code:`name` is not present in the index or
|
||||
the start/end coordinates are invalid.
|
||||
|
||||
Class mappy.Alignment
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
@ -144,11 +154,12 @@ Miscellaneous Functions
|
|||
|
||||
.. code:: python
|
||||
|
||||
mappy.fastx_read(fn)
|
||||
mappy.fastx_read(fn, read_comment=False)
|
||||
|
||||
This generator function opens a FASTA/FASTQ file and *yields* a
|
||||
:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
|
||||
optionally gzip'd.
|
||||
optionally gzip'd. If :code:`read_comment` is True, this generator yields
|
||||
a :code:`(name,seq,qual,comment)` tuple instead.
|
||||
|
||||
.. code:: python
|
||||
|
||||
|
|
|
|||
|
|
@ -112,4 +112,22 @@ static inline char *mappy_revcomp(int len, const uint8_t *seq)
|
|||
return rev;
|
||||
}
|
||||
|
||||
static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *len)
|
||||
{
|
||||
int i, rid;
|
||||
char *s;
|
||||
*len = 0;
|
||||
rid = mm_idx_name2id(mi, name);
|
||||
if (rid < 0) return 0;
|
||||
if (st >= mi->seq[i].len || st >= en) return 0;
|
||||
if (en < 0 || en > mi->seq[i].len)
|
||||
en = mi->seq[i].len;
|
||||
s = (char*)malloc(en - st + 1);
|
||||
*len = mm_idx_getseq(mi, rid, st, en, s);
|
||||
for (i = 0; i < *len; ++i)
|
||||
s[i] = "ACGTN"[(uint8_t)s[i]];
|
||||
s[*len] = 0;
|
||||
return s;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ cdef extern from "minimap.h":
|
|||
mm_idx_seq_t *seq
|
||||
uint32_t *S
|
||||
mm_idx_bucket_t *B
|
||||
void *km
|
||||
void *km, *h
|
||||
|
||||
ctypedef struct mm_idx_reader_t:
|
||||
pass
|
||||
|
|
@ -69,6 +69,8 @@ cdef extern from "minimap.h":
|
|||
void mm_idx_destroy(mm_idx_t *mi)
|
||||
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
|
||||
|
||||
int mm_idx_index_name(mm_idx_t *mi)
|
||||
|
||||
#
|
||||
# Mapping (key struct defined in cmappy.h below)
|
||||
#
|
||||
|
|
@ -99,6 +101,7 @@ cdef extern from "cmappy.h":
|
|||
void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
|
||||
void mm_free_reg1(mm_reg1_t *r)
|
||||
mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
|
||||
char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l)
|
||||
|
||||
ctypedef struct kstring_t:
|
||||
unsigned l, m
|
||||
|
|
|
|||
|
|
@ -123,6 +123,7 @@ cdef class Aligner:
|
|||
self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
|
||||
cmappy.mm_idx_reader_close(r)
|
||||
cmappy.mm_mapopt_update(&self.map_opt, self._idx)
|
||||
cmappy.mm_idx_index_name(self._idx)
|
||||
|
||||
def __dealloc__(self):
|
||||
if self._idx is not NULL:
|
||||
|
|
@ -140,8 +141,13 @@ cdef class Aligner:
|
|||
if self._idx is NULL: return None
|
||||
if buf is None: b = ThreadBuffer()
|
||||
else: b = buf
|
||||
if seq2 is None: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), NULL, &n_regs, b._b, &self.map_opt)
|
||||
else: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), str.encode(seq2), &n_regs, b._b, &self.map_opt)
|
||||
|
||||
_seq = seq if isinstance(seq, bytes) else seq.encode()
|
||||
if seq2 is None:
|
||||
regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &self.map_opt)
|
||||
else:
|
||||
_seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
|
||||
regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &self.map_opt)
|
||||
|
||||
for i in range(n_regs):
|
||||
cmappy.mm_reg2hitpy(self._idx, ®s[i], &h)
|
||||
|
|
@ -153,7 +159,24 @@ cdef class Aligner:
|
|||
cmappy.mm_free_reg1(®s[i])
|
||||
free(regs)
|
||||
|
||||
def fastx_read(fn):
|
||||
def seq(self, str name, int start=0, int end=0x7fffffff):
|
||||
cdef int l
|
||||
cdef char *s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l)
|
||||
if l == 0: return None
|
||||
r = s[:l] if isinstance(s, str) else s[:l].decode()
|
||||
free(s)
|
||||
return r
|
||||
|
||||
@property
|
||||
def k(self): return self._idx.k
|
||||
|
||||
@property
|
||||
def w(self): return self._idx.w
|
||||
|
||||
@property
|
||||
def n_seq(self): return self._idx.n_seq
|
||||
|
||||
def fastx_read(fn, read_comment=False):
|
||||
cdef cmappy.kseq_t *ks
|
||||
ks = cmappy.mm_fastx_open(str.encode(fn))
|
||||
if ks is NULL: return None
|
||||
|
|
@ -162,7 +185,12 @@ def fastx_read(fn):
|
|||
else: qual = None
|
||||
name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode()
|
||||
seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode()
|
||||
yield name, seq, qual
|
||||
if read_comment:
|
||||
if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode()
|
||||
else: comment = None
|
||||
yield name, seq, qual, comment
|
||||
else:
|
||||
yield name, seq, qual
|
||||
cmappy.mm_fastx_close(ks)
|
||||
|
||||
def revcomp(seq):
|
||||
|
|
|
|||
Loading…
Reference in New Issue