r573: prepare to generalize index
This commit is contained in:
parent
3b518271ee
commit
2f463b1db0
8
align.c
8
align.c
|
|
@ -222,7 +222,7 @@ static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x
|
|||
|
||||
static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t *const qseq0[2], mm128_t *a, int32_t *r, int32_t *q)
|
||||
{
|
||||
if (mi->is_hpc) {
|
||||
if (mi->flag & MM_I_HPC) {
|
||||
const uint8_t *qseq = qseq0[a->x>>63];
|
||||
int i, c;
|
||||
*q = (int32_t)a->y;
|
||||
|
|
@ -351,14 +351,14 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
int32_t rs1, qs1, re1, qe1;
|
||||
int8_t mat[25];
|
||||
|
||||
if (is_sr) assert(!mi->is_hpc); // HPC won't work with SR because with HPC we can't easily tell if there is a gap
|
||||
if (is_sr) assert(!(mi->flag & MM_I_HPC)); // HPC won't work with SR because with HPC we can't easily tell if there is a gap
|
||||
|
||||
r2->cnt = 0;
|
||||
if (r->cnt == 0) return;
|
||||
ksw_gen_simple_mat(5, mat, opt->a, opt->b);
|
||||
bw = (int)(opt->bw * 1.5 + 1.);
|
||||
|
||||
if (is_sr && !mi->is_hpc) {
|
||||
if (is_sr && !(mi->flag & MM_I_HPC)) {
|
||||
mm_max_stretch(opt, r, a, &as1, &cnt1);
|
||||
rs = (int32_t)a[as1].x + 1 - (int32_t)(a[as1].y>>32&0xff);
|
||||
qs = (int32_t)a[as1].y + 1 - (int32_t)(a[as1].y>>32&0xff);
|
||||
|
|
@ -470,7 +470,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
|
||||
for (i = is_sr? cnt1 - 1 : 1; i < cnt1; ++i) { // gap filling
|
||||
if ((a[as1+i].y & (MM_SEED_IGNORE|MM_SEED_TANDEM)) && i != cnt1 - 1) continue;
|
||||
if (is_sr && !mi->is_hpc) {
|
||||
if (is_sr && !(mi->flag & MM_I_HPC)) {
|
||||
re = (int32_t)a[as1 + i].x + 1;
|
||||
qe = (int32_t)a[as1 + i].y + 1;
|
||||
} else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
|
||||
|
|
|
|||
29
index.c
29
index.c
|
|
@ -31,19 +31,19 @@ typedef struct mm_idx_bucket_s {
|
|||
void mm_idxopt_init(mm_idxopt_t *opt)
|
||||
{
|
||||
memset(opt, 0, sizeof(mm_idxopt_t));
|
||||
opt->k = 15, opt->w = 10, opt->is_hpc = 0;
|
||||
opt->k = 15, opt->w = 10, opt->flag = 0;
|
||||
opt->bucket_bits = 14;
|
||||
opt->mini_batch_size = 50000000;
|
||||
opt->batch_size = 4000000000ULL;
|
||||
}
|
||||
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc)
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
|
||||
{
|
||||
mm_idx_t *mi;
|
||||
if (k*2 < b) b = k * 2;
|
||||
if (w < 1) w = 1;
|
||||
mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
|
||||
mi->w = w, mi->k = k, mi->b = b, mi->is_hpc = is_hpc;
|
||||
mi->w = w, mi->k = k, mi->b = b, mi->flag = flag;
|
||||
mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
|
||||
if (!(mm_dbg_flag & 1)) mi->km = km_init();
|
||||
return mi;
|
||||
|
|
@ -89,7 +89,7 @@ void mm_idx_stat(const mm_idx_t *mi)
|
|||
{
|
||||
int i, n = 0, n1 = 0;
|
||||
uint64_t sum = 0, len = 0;
|
||||
fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_HPC: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->is_hpc, mi->n_seq);
|
||||
fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_hpc: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->flag&MM_I_HPC, mi->n_seq);
|
||||
for (i = 0; i < mi->n_seq; ++i)
|
||||
len += mi->seq[i].len;
|
||||
for (i = 0; i < 1<<mi->b; ++i)
|
||||
|
|
@ -214,7 +214,7 @@ static void mm_idx_post(mm_idx_t *mi, int n_threads)
|
|||
#include "bseq.h"
|
||||
|
||||
typedef struct {
|
||||
int mini_batch_size, keep_name;
|
||||
int mini_batch_size;
|
||||
uint64_t batch_size, sum_len;
|
||||
mm_bseq_file_t *fp;
|
||||
mm_idx_t *mi;
|
||||
|
|
@ -266,7 +266,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
for (i = 0; i < s->n_seq; ++i) {
|
||||
mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq];
|
||||
uint32_t j;
|
||||
if (p->keep_name) {
|
||||
if (!(p->mi->flag & MM_I_NO_NAME)) {
|
||||
seq->name = (char*)kmalloc(p->mi->km, strlen(s->seq[i].name) + 1);
|
||||
strcpy(seq->name, s->seq[i].name);
|
||||
} else seq->name = 0;
|
||||
|
|
@ -288,7 +288,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
step_t *s = (step_t*)in;
|
||||
for (i = 0; i < s->n_seq; ++i) {
|
||||
mm_bseq1_t *t = &s->seq[i];
|
||||
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->is_hpc, &s->a);
|
||||
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a);
|
||||
free(t->seq); free(t->name);
|
||||
}
|
||||
free(s->seq); s->seq = 0;
|
||||
|
|
@ -301,16 +301,15 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
return 0;
|
||||
}
|
||||
|
||||
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name)
|
||||
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
|
||||
{
|
||||
pipeline_t pl;
|
||||
if (fp == 0 || mm_bseq_eof(fp)) return 0;
|
||||
memset(&pl, 0, sizeof(pipeline_t));
|
||||
pl.mini_batch_size = mini_batch_size < batch_size? mini_batch_size : batch_size;
|
||||
pl.keep_name = keep_name;
|
||||
pl.batch_size = batch_size;
|
||||
pl.fp = fp;
|
||||
pl.mi = mm_idx_init(w, k, b, is_hpc);
|
||||
pl.mi = mm_idx_init(w, k, b, flag);
|
||||
|
||||
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
|
||||
if (mm_verbose >= 3)
|
||||
|
|
@ -323,13 +322,13 @@ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mi
|
|||
return pl.mi;
|
||||
}
|
||||
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads) // a simpler interface
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface
|
||||
{
|
||||
mm_bseq_file_t *fp;
|
||||
mm_idx_t *mi;
|
||||
fp = mm_bseq_open(fn);
|
||||
if (fp == 0) return 0;
|
||||
mi = mm_idx_gen(fp, w, k, 14, is_hpc, 1<<18, n_threads, UINT64_MAX, 1);
|
||||
mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX);
|
||||
mm_bseq_close(fp);
|
||||
return mi;
|
||||
}
|
||||
|
|
@ -344,7 +343,7 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
|
|||
uint32_t x[5];
|
||||
int i;
|
||||
|
||||
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->is_hpc;
|
||||
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
|
||||
fwrite(MM_IDX_MAGIC, 1, 4, fp);
|
||||
fwrite(x, 4, 5, fp);
|
||||
for (i = 0; i < mi->n_seq; ++i) {
|
||||
|
|
@ -476,10 +475,10 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
|
|||
mm_idx_t *mi;
|
||||
if (r->is_idx) {
|
||||
mi = mm_idx_load(r->fp.idx);
|
||||
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || mi->is_hpc != r->opt.is_hpc))
|
||||
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag&MM_I_HPC) != (r->opt.flag&MM_I_HPC)))
|
||||
fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
|
||||
} else
|
||||
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.is_hpc, r->opt.mini_batch_size, n_threads, r->opt.batch_size, 1);
|
||||
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
|
||||
if (mi) {
|
||||
if (r->fp_out) mm_idx_dump(r->fp_out, mi);
|
||||
++r->n_parts;
|
||||
|
|
|
|||
4
main.c
4
main.c
|
|
@ -6,7 +6,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "getopt.h"
|
||||
|
||||
#define MM_VERSION "2.5-r572"
|
||||
#define MM_VERSION "2.5-r573-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -94,7 +94,7 @@ int main(int argc, char *argv[])
|
|||
while ((c = getopt_long(argc, argv, opt_str, long_options, &long_idx)) >= 0) {
|
||||
if (c == 'w') ipt.w = atoi(optarg);
|
||||
else if (c == 'k') ipt.k = atoi(optarg);
|
||||
else if (c == 'H') ipt.is_hpc = 1;
|
||||
else if (c == 'H') ipt.flag |= MM_I_HPC;
|
||||
else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I
|
||||
else if (c == 'r') opt.bw = (int)mm_parse_num(optarg);
|
||||
else if (c == 't') n_threads = atoi(optarg);
|
||||
|
|
|
|||
18
map.c
18
map.c
|
|
@ -64,29 +64,29 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
mm_idxopt_init(io);
|
||||
mm_mapopt_init(mo);
|
||||
} else if (strcmp(preset, "ava-ont") == 0) {
|
||||
io->is_hpc = 0, io->k = 15, io->w = 5;
|
||||
io->flag = 0, io->k = 15, io->w = 5;
|
||||
mo->flag |= MM_F_AVA | MM_F_NO_SELF;
|
||||
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
|
||||
} else if (strcmp(preset, "ava-pb") == 0) {
|
||||
io->is_hpc = 1, io->k = 19, io->w = 5;
|
||||
io->flag |= MM_I_HPC, io->k = 19, io->w = 5;
|
||||
mo->flag |= MM_F_AVA | MM_F_NO_SELF;
|
||||
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
|
||||
} else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
|
||||
io->is_hpc = 1, io->k = 19;
|
||||
io->flag |= MM_I_HPC, io->k = 19;
|
||||
} else if (strcmp(preset, "map-ont") == 0) {
|
||||
io->is_hpc = 0, io->k = 15;
|
||||
io->flag = 0, io->k = 15;
|
||||
} else if (strcmp(preset, "asm5") == 0) {
|
||||
io->is_hpc = 0, io->k = 19, io->w = 19;
|
||||
io->flag = 0, io->k = 19, io->w = 19;
|
||||
mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = 200;
|
||||
mo->min_dp_max = 200;
|
||||
mo->best_n = 50;
|
||||
} else if (strcmp(preset, "asm10") == 0) {
|
||||
io->is_hpc = 0, io->k = 19, io->w = 19;
|
||||
io->flag = 0, io->k = 19, io->w = 19;
|
||||
mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = 200;
|
||||
mo->min_dp_max = 200;
|
||||
mo->best_n = 50;
|
||||
} else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) {
|
||||
io->is_hpc = 0, io->k = 21, io->w = 11;
|
||||
io->flag = 0, io->k = 21, io->w = 11;
|
||||
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS;
|
||||
mo->pe_ori = 0<<1|1; // FR
|
||||
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1;
|
||||
|
|
@ -104,7 +104,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
mo->max_occ = 5000;
|
||||
mo->mini_batch_size = 50000000;
|
||||
} else if (strcmp(preset, "splice") == 0 || strcmp(preset, "cdna") == 0) {
|
||||
io->is_hpc = 0, io->k = 15, io->w = 5;
|
||||
io->flag = 0, io->k = 15, io->w = 5;
|
||||
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
|
||||
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
|
||||
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
|
||||
|
|
@ -173,7 +173,7 @@ static void collect_minimizers(const mm_mapopt_t *opt, const mm_idx_t *mi, int n
|
|||
int i, j, n, sum = 0;
|
||||
b->mini.n = 0;
|
||||
for (i = n = 0; i < n_segs; ++i) {
|
||||
mm_sketch(b->km, seqs[i], qlens[i], mi->w, mi->k, i, mi->is_hpc, &b->mini);
|
||||
mm_sketch(b->km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, &b->mini);
|
||||
for (j = n; j < b->mini.n; ++j)
|
||||
b->mini.a[j].y += sum << 1;
|
||||
if (opt->sdust_thres > 0) // mask low-complexity minimizers
|
||||
|
|
|
|||
10
minimap.h
10
minimap.h
|
|
@ -26,6 +26,10 @@
|
|||
#define MM_F_SPLICE_FLANK 0x40000
|
||||
#define MM_F_SOFTCLIP 0x80000
|
||||
|
||||
#define MM_I_HPC 0x1
|
||||
#define MM_I_NO_SEQ 0x2
|
||||
#define MM_I_NO_NAME 0x4
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
#define MM_MAX_SEG 255
|
||||
|
|
@ -46,7 +50,7 @@ typedef struct {
|
|||
} mm_idx_seq_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t b, w, k, is_hpc;
|
||||
int32_t b, w, k, flag;
|
||||
uint32_t n_seq; // number of reference sequences
|
||||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
uint32_t *S; // 4-bit packed sequence
|
||||
|
|
@ -80,7 +84,7 @@ typedef struct {
|
|||
|
||||
// indexing and mapping options
|
||||
typedef struct {
|
||||
short k, w, is_hpc, bucket_bits;
|
||||
short k, w, flag, bucket_bits;
|
||||
int mini_batch_size;
|
||||
uint64_t batch_size;
|
||||
} mm_idxopt_t;
|
||||
|
|
@ -269,7 +273,7 @@ int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_
|
|||
|
||||
// deprecated APIs for backward compatibility
|
||||
void mm_mapopt_init(mm_mapopt_t *opt);
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads);
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ cdef extern from "minimap.h":
|
|||
# Options
|
||||
#
|
||||
ctypedef struct mm_idxopt_t:
|
||||
short k, w, is_hpc, bucket_bits
|
||||
short k, w, flag, bucket_bits
|
||||
int mini_batch_size
|
||||
uint64_t batch_size
|
||||
|
||||
|
|
@ -51,7 +51,7 @@ cdef extern from "minimap.h":
|
|||
pass
|
||||
|
||||
ctypedef struct mm_idx_t:
|
||||
int32_t b, w, k, is_hpc
|
||||
int32_t b, w, k, flag
|
||||
uint32_t n_seq
|
||||
mm_idx_seq_t *seq
|
||||
uint32_t *S
|
||||
|
|
|
|||
Loading…
Reference in New Issue