redesigned indexing and option APIs

This commit is contained in:
Heng Li 2017-09-14 17:02:01 -04:00
parent 0f7455cefa
commit eb00521d9b
5 changed files with 151 additions and 98 deletions

44
index.c
View File

@ -21,6 +21,15 @@ typedef khash_t(idx) idxhash_t;
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
void mm_idxopt_init(mm_idxopt_t *opt)
{
memset(opt, 0, sizeof(mm_idxopt_t));
opt->k = 15, opt->w = 10, opt->is_hpc = 0;
opt->bucket_bits = 14;
opt->mini_batch_size = 50000000;
opt->batch_size = 4000000000ULL;
}
mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc)
{
mm_idx_t *mi;
@ -314,7 +323,7 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads)
mm_idx_t *mi;
fp = mm_bseq_open(fn);
if (fp == 0) return 0;
mi = mm_idx_gen(fp, w, k, MM_IDX_DEF_B, is_hpc, 1<<18, n_threads, UINT64_MAX, 1);
mi = mm_idx_gen(fp, w, k, 14, is_hpc, 1<<18, n_threads, UINT64_MAX, 1);
mm_bseq_close(fp);
return mi;
}
@ -429,3 +438,36 @@ int mm_idx_is_idx(const char *fn)
close(fd);
return is_idx;
}
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt)
{
int is_idx;
mm_idx_reader_t *r;
is_idx = mm_idx_is_idx(fn);
if (is_idx < 0) return 0; // failed to open the index
r = (mm_idx_reader_t*)calloc(1, sizeof(mm_idx_reader_t));
r->is_idx = is_idx;
r->opt = *opt;
if (r->is_idx) r->fp.idx = fopen(fn, "rb");
else r->fp.seq = mm_bseq_open(fn);
return r;
}
void mm_idx_reader_close(mm_idx_reader_t *r)
{
if (r->is_idx) fclose(r->fp.idx);
else mm_bseq_close(r->fp.seq);
}
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
{
mm_idx_t *mi;
if (r->is_idx) {
mi = mm_idx_load(r->fp.idx);
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || mi->is_hpc != r->opt.is_hpc))
fprintf(stderr, "[WARNING] Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\n");
} else
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.is_hpc, r->opt.mini_batch_size, n_threads, r->opt.batch_size, 1);
if (mi) ++r->n_parts;
return mi;
}

103
main.c
View File

@ -25,7 +25,7 @@ void liftrlimit() {}
static struct option long_options[] = {
{ "bucket-bits", required_argument, 0, 0 },
{ "mb-size", required_argument, 0, 'K' },
{ "int-rname", no_argument, 0, 0 },
{ "int-rname", no_argument, 0, 0 }, // obsolete; kept as a placeholder
{ "no-kalloc", no_argument, 0, 0 },
{ "print-qname", no_argument, 0, 0 },
{ "no-self", no_argument, 0, 0 },
@ -62,21 +62,21 @@ static inline int64_t mm_parse_num(const char *str)
int main(int argc, char *argv[])
{
mm_mapopt_t opt;
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0, n_idx_part = 0;
int minibatch_size = 200000000;
uint64_t batch_size = 4000000000ULL;
mm_bseq_file_t *fp = 0;
mm_idxopt_t ipt;
int i, c, n_threads = 3, long_idx, max_intron_len = 0;
char *fnw = 0, *rg = 0, *s;
FILE *fpr = 0, *fpw = 0, *fp_help = stderr;
FILE *fpw = 0, *fp_help = stderr;
mm_idx_reader_t *idx_rdr;
liftrlimit();
mm_realtime0 = realtime();
mm_mapopt_init(&opt);
mm_idxopt_init(&ipt);
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) {
if (c == 'w') w = atoi(optarg), idx_par_set = 1;
else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
else if (c == 'H') is_hpc = 1, idx_par_set = 1;
if (c == 'w') ipt.w = atoi(optarg);
else if (c == 'k') ipt.k = atoi(optarg);
else if (c == 'H') ipt.is_hpc = 1;
else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I
else if (c == 'r') opt.bw = (int)mm_parse_num(optarg);
else if (c == 't') n_threads = atoi(optarg);
@ -98,12 +98,11 @@ int main(int argc, char *argv[])
else if (c == 'B') opt.b = atoi(optarg);
else if (c == 'z') opt.zdrop = atoi(optarg);
else if (c == 's') opt.min_dp_max = atoi(optarg);
else if (c == 'I') batch_size = mm_parse_num(optarg);
else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg);
else if (c == 'I') ipt.batch_size = mm_parse_num(optarg);
else if (c == 'K') ipt.mini_batch_size = (int)mm_parse_num(optarg);
else if (c == 'R') rg = optarg;
else if (c == 'h') fp_help = stdout;
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits
else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname
else if (c == 0 && long_idx == 0) ipt.bucket_bits = atoi(optarg); // --bucket-bits
else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc
else if (c == 0 && long_idx == 4) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname
else if (c == 0 && long_idx == 5) opt.flag |= MM_F_NO_SELF; // --no-self
@ -140,55 +139,12 @@ int main(int argc, char *argv[])
opt.e = opt.e2 = strtol(optarg, &s, 10);
if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
} else if (c == 'x') {
if (strcmp(optarg, "ava-ont") == 0) {
opt.flag |= MM_F_AVA | MM_F_NO_SELF;
opt.min_chain_score = 100, opt.pri_ratio = 0.0f, opt.max_gap = 10000, opt.max_chain_skip = 25;
minibatch_size = 500000000;
k = 15, w = 5;
} else if (strcmp(optarg, "ava-pb") == 0) {
opt.flag |= MM_F_AVA | MM_F_NO_SELF;
opt.min_chain_score = 100, opt.pri_ratio = 0.0f, opt.max_gap = 10000, opt.max_chain_skip = 25;
minibatch_size = 500000000;
is_hpc = 1, k = 19, w = 5;
} else if (strcmp(optarg, "map10k") == 0 || strcmp(optarg, "map-pb") == 0) {
is_hpc = 1, k = 19;
} else if (strcmp(optarg, "map-ont") == 0) {
is_hpc = 0, k = 15;
} else if (strcmp(optarg, "asm5") == 0) {
k = 19, w = 19;
opt.a = 1, opt.b = 19, opt.q = 39, opt.q2 = 81, opt.e = 3, opt.e2 = 1, opt.zdrop = 200;
opt.min_dp_max = 200;
} else if (strcmp(optarg, "asm10") == 0) {
k = 19, w = 19;
opt.a = 1, opt.b = 9, opt.q = 16, opt.q2 = 41, opt.e = 2, opt.e2 = 1, opt.zdrop = 200;
opt.min_dp_max = 200;
} else if (strcmp(optarg, "short") == 0 || strcmp(optarg, "sr") == 0) {
k = 21, w = 11, is_hpc = 0;
minibatch_size = 50000000;
opt.flag |= MM_F_APPROX_EXT;
opt.a = 2, opt.b = 8, opt.q = 12, opt.e = 2, opt.q2 = 32, opt.e2 = 1;
opt.max_gap = 100;
opt.pri_ratio = 0.5f;
opt.min_cnt = 2;
opt.min_chain_score = 20;
opt.min_dp_max = 40;
opt.best_n = 20;
opt.bw = 50;
opt.mid_occ = 1000;
} else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) {
k = 15, w = 5;
opt.flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV;
opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000;
opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0;
opt.noncan = 5;
opt.zdrop = 200;
} else {
if (mm_preset(optarg, &ipt, &opt) < 0) {
fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg);
return 1;
}
}
}
if (w < 0) w = (int)(.6666667 * k + .499);
if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0)
opt.max_gap_ref = opt.bw = max_intron_len;
@ -197,8 +153,8 @@ int main(int argc, char *argv[])
fprintf(fp_help, "Options:\n");
fprintf(fp_help, " Indexing:\n");
fprintf(fp_help, " -H use homopolymer-compressed k-mer\n");
fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", k);
fprintf(fp_help, " -w INT minizer window size [{-k}*2/3]\n");
fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", ipt.k);
fprintf(fp_help, " -w INT minizer window size [%d]\n", ipt.w);
fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n");
fprintf(fp_help, " -d FILE dump index to FILE []\n");
fprintf(fp_help, " Mapping:\n");
@ -227,7 +183,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -c output CIGAR in PAF\n");
fprintf(fp_help, " -S output the cs tag in PAF (cs encodes both query and ref sequences)\n");
fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads);
fprintf(fp_help, " -K NUM minibatch size [200M]\n");
fprintf(fp_help, " -K NUM minibatch size for mapping [200M]\n");
// fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose);
fprintf(fp_help, " --version show version number\n");
fprintf(fp_help, " Preset:\n");
@ -244,33 +200,22 @@ int main(int argc, char *argv[])
return fp_help == stdout? 0 : 1;
}
is_idx = mm_idx_is_idx(argv[optind]);
if (is_idx < 0) {
idx_rdr = mm_idx_reader_open(argv[optind], &ipt);
if (idx_rdr == 0) {
fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]);
return 1;
}
if (!is_idx && fnw == 0 && argc - optind < 2) {
if (!idx_rdr->is_idx && fnw == 0 && argc - optind < 2) {
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
return 1;
}
if (is_idx) fpr = fopen(argv[optind], "rb");
else fp = mm_bseq_open(argv[optind]);
if (fnw) fpw = fopen(fnw, "wb");
if (opt.flag & MM_F_OUT_SAM)
mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
for (;;) {
mm_idx_t *mi;
if (fpr) {
mi = mm_idx_load(fpr);
if (mi == 0) break;
if (idx_par_set && mm_verbose >= 2 && (mi->k != k || mi->w != w || mi->is_hpc != is_hpc))
fprintf(stderr, "[WARNING] \033[1;31mIndexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\033[0m\n");
} else {
mi = mm_idx_gen(fp, w, k, bucket_bits, is_hpc, minibatch_size, n_threads, batch_size, keep_name);
}
if (mi == 0) break;
++n_idx_part;
if (mm_verbose >= 2 && n_idx_part > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ))
if ((mi = mm_idx_reader_read(idx_rdr, n_threads)) == 0) break;
if (mm_verbose >= 2 && idx_rdr->n_parts > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ))
fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n");
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
@ -278,17 +223,15 @@ int main(int argc, char *argv[])
if (fpw) {
mm_idx_dump(fpw, mi);
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] dumpped the (partial) index to disk\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
fprintf(stderr, "[M::%s::%.3f*%.2f] dumpped the current part of the index to disk\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
}
if (argc != optind + 1) mm_mapopt_update(&opt, mi);
if (mm_verbose >= 3) mm_idx_stat(mi);
for (i = optind + 1; i < argc; ++i)
mm_map_file(mi, argv[i], &opt, n_threads, minibatch_size);
mm_map_file(mi, argv[i], &opt, n_threads);
mm_idx_destroy(mi);
}
if (fpw) fclose(fpw);
if (fpr) fclose(fpr);
if (fp) mm_bseq_close(fp);
fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION);
fprintf(stderr, "[M::%s] CMD:", __func__);

53
map.c
View File

@ -34,6 +34,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->zdrop = 400;
opt->min_dp_max = opt->min_chain_score * opt->a;
opt->min_ksw_len = 200;
opt->mini_batch_size = 200000000;
}
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
@ -49,6 +50,54 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
opt->mid_occ, opt->max_occ);
}
int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
{
if (strcmp(preset, "ava-ont") == 0) {
io->is_hpc = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_AVA | MM_F_NO_SELF;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
mo->mini_batch_size = 500000000;
} else if (strcmp(preset, "ava-pb") == 0) {
io->is_hpc = 1, io->k = 19, io->w = 5;
mo->flag |= MM_F_AVA | MM_F_NO_SELF;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
mo->mini_batch_size = 500000000;
} else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
io->is_hpc = 1, io->k = 19;
} else if (strcmp(preset, "map-ont") == 0) {
io->is_hpc = 0, io->k = 15;
} else if (strcmp(preset, "asm5") == 0) {
io->is_hpc = 0, io->k = 19, io->w = 19;
mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = 200;
mo->min_dp_max = 200;
} else if (strcmp(preset, "asm10") == 0) {
io->is_hpc = 0, io->k = 19, io->w = 19;
mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = 200;
mo->min_dp_max = 200;
} else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) {
io->is_hpc = 0, io->k = 21, io->w = 11;
mo->flag |= MM_F_APPROX_EXT;
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 32, mo->e2 = 1;
mo->max_gap = 100;
mo->pri_ratio = 0.5f;
mo->min_cnt = 2;
mo->min_chain_score = 20;
mo->min_dp_max = 40;
mo->best_n = 20;
mo->bw = 50;
mo->mid_occ = 1000;
mo->mini_batch_size = 50000000;
} else if (strcmp(preset, "splice") == 0 || strcmp(preset, "cdna") == 0) {
io->is_hpc = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV;
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
mo->noncan = 5;
mo->zdrop = 200;
} else return -1;
return 0;
}
typedef struct {
uint32_t n;
uint32_t qpos;
@ -304,7 +353,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
return 0;
}
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int mini_batch_size)
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads)
{
pipeline_t pl;
memset(&pl, 0, sizeof(pipeline_t));
@ -315,7 +364,7 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
return -1;
}
pl.opt = opt, pl.mi = idx;
pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size;
pl.n_threads = n_threads, pl.mini_batch_size = opt->mini_batch_size;
if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ))
mm_write_sam_SQ(idx);
kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);

View File

@ -5,8 +5,6 @@
#include <stdio.h>
#include <sys/types.h>
#define MM_IDX_DEF_B 14
#define MM_F_NO_SELF 0x001
#define MM_F_AVA 0x002
#define MM_F_CIGAR 0x004
@ -105,46 +103,65 @@ typedef struct {
int min_dp_max;
int min_ksw_len;
int mini_batch_size;
int32_t max_occ, mid_occ;
} mm_mapopt_t;
typedef struct {
short k, w, is_hpc, bucket_bits;
int mini_batch_size;
uint64_t batch_size;
} mm_idxopt_t;
struct mm_bseq_file_s;
typedef struct {
int is_idx, n_parts;
mm_idxopt_t opt;
union {
struct mm_bseq_file_s *seq;
FILE *idx;
} fp;
} mm_idx_reader_t;
extern int mm_verbose, mm_dbg_flag;
extern double mm_realtime0;
struct mm_tbuf_s;
typedef struct mm_tbuf_s mm_tbuf_t;
struct mm_bseq_file_s;
#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2))
#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf)
// compute minimizers
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
void mm_idxopt_init(mm_idxopt_t *opt);
void mm_mapopt_init(mm_mapopt_t *opt);
int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt);
int mm_idx_reader_is_idx(const mm_idx_reader_t *r);
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
void mm_idx_reader_close(mm_idx_reader_t *r);
void mm_idx_destroy(mm_idx_t *mi);
// minimizer indexing
mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc);
void mm_idx_destroy(mm_idx_t *mi);
mm_idx_t *mm_idx_gen(struct mm_bseq_file_s *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name);
void mm_idx_stat(const mm_idx_t *idx);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads);
int mm_idx_is_idx(const char *fn);
// minimizer index I/O
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
mm_idx_t *mm_idx_load(FILE *fp);
// mapping
void mm_mapopt_init(mm_mapopt_t *opt);
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
mm_tbuf_t *mm_tbuf_init(void);
void mm_tbuf_destroy(mm_tbuf_t *b);
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size);
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);
// obsolete APIs (for backward compatibility)
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads);
#ifdef __cplusplus
}

View File

@ -40,6 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
void mm_write_sam_SQ(const mm_idx_t *idx);
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);