r573: prepare to generalize index

This commit is contained in:
Heng Li 2017-11-11 19:54:06 -05:00
parent 3b518271ee
commit 2f463b1db0
6 changed files with 38 additions and 35 deletions

View File

@ -222,7 +222,7 @@ static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x
static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t *const qseq0[2], mm128_t *a, int32_t *r, int32_t *q)
{
if (mi->is_hpc) {
if (mi->flag & MM_I_HPC) {
const uint8_t *qseq = qseq0[a->x>>63];
int i, c;
*q = (int32_t)a->y;
@ -351,14 +351,14 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
int32_t rs1, qs1, re1, qe1;
int8_t mat[25];
if (is_sr) assert(!mi->is_hpc); // HPC won't work with SR because with HPC we can't easily tell if there is a gap
if (is_sr) assert(!(mi->flag & MM_I_HPC)); // HPC won't work with SR because with HPC we can't easily tell if there is a gap
r2->cnt = 0;
if (r->cnt == 0) return;
ksw_gen_simple_mat(5, mat, opt->a, opt->b);
bw = (int)(opt->bw * 1.5 + 1.);
if (is_sr && !mi->is_hpc) {
if (is_sr && !(mi->flag & MM_I_HPC)) {
mm_max_stretch(opt, r, a, &as1, &cnt1);
rs = (int32_t)a[as1].x + 1 - (int32_t)(a[as1].y>>32&0xff);
qs = (int32_t)a[as1].y + 1 - (int32_t)(a[as1].y>>32&0xff);
@ -470,7 +470,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
for (i = is_sr? cnt1 - 1 : 1; i < cnt1; ++i) { // gap filling
if ((a[as1+i].y & (MM_SEED_IGNORE|MM_SEED_TANDEM)) && i != cnt1 - 1) continue;
if (is_sr && !mi->is_hpc) {
if (is_sr && !(mi->flag & MM_I_HPC)) {
re = (int32_t)a[as1 + i].x + 1;
qe = (int32_t)a[as1 + i].y + 1;
} else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);

29
index.c
View File

@ -31,19 +31,19 @@ typedef struct mm_idx_bucket_s {
void mm_idxopt_init(mm_idxopt_t *opt)
{
memset(opt, 0, sizeof(mm_idxopt_t));
opt->k = 15, opt->w = 10, opt->is_hpc = 0;
opt->k = 15, opt->w = 10, opt->flag = 0;
opt->bucket_bits = 14;
opt->mini_batch_size = 50000000;
opt->batch_size = 4000000000ULL;
}
mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc)
mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
{
mm_idx_t *mi;
if (k*2 < b) b = k * 2;
if (w < 1) w = 1;
mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
mi->w = w, mi->k = k, mi->b = b, mi->is_hpc = is_hpc;
mi->w = w, mi->k = k, mi->b = b, mi->flag = flag;
mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
if (!(mm_dbg_flag & 1)) mi->km = km_init();
return mi;
@ -89,7 +89,7 @@ void mm_idx_stat(const mm_idx_t *mi)
{
int i, n = 0, n1 = 0;
uint64_t sum = 0, len = 0;
fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_HPC: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->is_hpc, mi->n_seq);
fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_hpc: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->flag&MM_I_HPC, mi->n_seq);
for (i = 0; i < mi->n_seq; ++i)
len += mi->seq[i].len;
for (i = 0; i < 1<<mi->b; ++i)
@ -214,7 +214,7 @@ static void mm_idx_post(mm_idx_t *mi, int n_threads)
#include "bseq.h"
typedef struct {
int mini_batch_size, keep_name;
int mini_batch_size;
uint64_t batch_size, sum_len;
mm_bseq_file_t *fp;
mm_idx_t *mi;
@ -266,7 +266,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
for (i = 0; i < s->n_seq; ++i) {
mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq];
uint32_t j;
if (p->keep_name) {
if (!(p->mi->flag & MM_I_NO_NAME)) {
seq->name = (char*)kmalloc(p->mi->km, strlen(s->seq[i].name) + 1);
strcpy(seq->name, s->seq[i].name);
} else seq->name = 0;
@ -288,7 +288,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
step_t *s = (step_t*)in;
for (i = 0; i < s->n_seq; ++i) {
mm_bseq1_t *t = &s->seq[i];
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->is_hpc, &s->a);
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a);
free(t->seq); free(t->name);
}
free(s->seq); s->seq = 0;
@ -301,16 +301,15 @@ static void *worker_pipeline(void *shared, int step, void *in)
return 0;
}
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name)
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
{
pipeline_t pl;
if (fp == 0 || mm_bseq_eof(fp)) return 0;
memset(&pl, 0, sizeof(pipeline_t));
pl.mini_batch_size = mini_batch_size < batch_size? mini_batch_size : batch_size;
pl.keep_name = keep_name;
pl.batch_size = batch_size;
pl.fp = fp;
pl.mi = mm_idx_init(w, k, b, is_hpc);
pl.mi = mm_idx_init(w, k, b, flag);
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
if (mm_verbose >= 3)
@ -323,13 +322,13 @@ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mi
return pl.mi;
}
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads) // a simpler interface
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface
{
mm_bseq_file_t *fp;
mm_idx_t *mi;
fp = mm_bseq_open(fn);
if (fp == 0) return 0;
mi = mm_idx_gen(fp, w, k, 14, is_hpc, 1<<18, n_threads, UINT64_MAX, 1);
mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX);
mm_bseq_close(fp);
return mi;
}
@ -344,7 +343,7 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
uint32_t x[5];
int i;
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->is_hpc;
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
fwrite(MM_IDX_MAGIC, 1, 4, fp);
fwrite(x, 4, 5, fp);
for (i = 0; i < mi->n_seq; ++i) {
@ -476,10 +475,10 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
mm_idx_t *mi;
if (r->is_idx) {
mi = mm_idx_load(r->fp.idx);
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || mi->is_hpc != r->opt.is_hpc))
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag&MM_I_HPC) != (r->opt.flag&MM_I_HPC)))
fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
} else
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.is_hpc, r->opt.mini_batch_size, n_threads, r->opt.batch_size, 1);
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
if (mi) {
if (r->fp_out) mm_idx_dump(r->fp_out, mi);
++r->n_parts;

4
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.5-r572"
#define MM_VERSION "2.5-r573-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -94,7 +94,7 @@ int main(int argc, char *argv[])
while ((c = getopt_long(argc, argv, opt_str, long_options, &long_idx)) >= 0) {
if (c == 'w') ipt.w = atoi(optarg);
else if (c == 'k') ipt.k = atoi(optarg);
else if (c == 'H') ipt.is_hpc = 1;
else if (c == 'H') ipt.flag |= MM_I_HPC;
else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I
else if (c == 'r') opt.bw = (int)mm_parse_num(optarg);
else if (c == 't') n_threads = atoi(optarg);

18
map.c
View File

@ -64,29 +64,29 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mm_idxopt_init(io);
mm_mapopt_init(mo);
} else if (strcmp(preset, "ava-ont") == 0) {
io->is_hpc = 0, io->k = 15, io->w = 5;
io->flag = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_AVA | MM_F_NO_SELF;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
} else if (strcmp(preset, "ava-pb") == 0) {
io->is_hpc = 1, io->k = 19, io->w = 5;
io->flag |= MM_I_HPC, io->k = 19, io->w = 5;
mo->flag |= MM_F_AVA | MM_F_NO_SELF;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
} else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
io->is_hpc = 1, io->k = 19;
io->flag |= MM_I_HPC, io->k = 19;
} else if (strcmp(preset, "map-ont") == 0) {
io->is_hpc = 0, io->k = 15;
io->flag = 0, io->k = 15;
} else if (strcmp(preset, "asm5") == 0) {
io->is_hpc = 0, io->k = 19, io->w = 19;
io->flag = 0, io->k = 19, io->w = 19;
mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = 200;
mo->min_dp_max = 200;
mo->best_n = 50;
} else if (strcmp(preset, "asm10") == 0) {
io->is_hpc = 0, io->k = 19, io->w = 19;
io->flag = 0, io->k = 19, io->w = 19;
mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = 200;
mo->min_dp_max = 200;
mo->best_n = 50;
} else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) {
io->is_hpc = 0, io->k = 21, io->w = 11;
io->flag = 0, io->k = 21, io->w = 11;
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS;
mo->pe_ori = 0<<1|1; // FR
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1;
@ -104,7 +104,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->max_occ = 5000;
mo->mini_batch_size = 50000000;
} else if (strcmp(preset, "splice") == 0 || strcmp(preset, "cdna") == 0) {
io->is_hpc = 0, io->k = 15, io->w = 5;
io->flag = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
@ -173,7 +173,7 @@ static void collect_minimizers(const mm_mapopt_t *opt, const mm_idx_t *mi, int n
int i, j, n, sum = 0;
b->mini.n = 0;
for (i = n = 0; i < n_segs; ++i) {
mm_sketch(b->km, seqs[i], qlens[i], mi->w, mi->k, i, mi->is_hpc, &b->mini);
mm_sketch(b->km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, &b->mini);
for (j = n; j < b->mini.n; ++j)
b->mini.a[j].y += sum << 1;
if (opt->sdust_thres > 0) // mask low-complexity minimizers

View File

@ -26,6 +26,10 @@
#define MM_F_SPLICE_FLANK 0x40000
#define MM_F_SOFTCLIP 0x80000
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2
#define MM_I_NO_NAME 0x4
#define MM_IDX_MAGIC "MMI\2"
#define MM_MAX_SEG 255
@ -46,7 +50,7 @@ typedef struct {
} mm_idx_seq_t;
typedef struct {
int32_t b, w, k, is_hpc;
int32_t b, w, k, flag;
uint32_t n_seq; // number of reference sequences
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
@ -80,7 +84,7 @@ typedef struct {
// indexing and mapping options
typedef struct {
short k, w, is_hpc, bucket_bits;
short k, w, flag, bucket_bits;
int mini_batch_size;
uint64_t batch_size;
} mm_idxopt_t;
@ -269,7 +273,7 @@ int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_
// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
#ifdef __cplusplus
}

View File

@ -5,7 +5,7 @@ cdef extern from "minimap.h":
# Options
#
ctypedef struct mm_idxopt_t:
short k, w, is_hpc, bucket_bits
short k, w, flag, bucket_bits
int mini_batch_size
uint64_t batch_size
@ -51,7 +51,7 @@ cdef extern from "minimap.h":
pass
ctypedef struct mm_idx_t:
int32_t b, w, k, is_hpc
int32_t b, w, k, flag
uint32_t n_seq
mm_idx_seq_t *seq
uint32_t *S