r367: index reader optionally writes index

This commit is contained in:
Heng Li 2017-09-14 21:18:13 -04:00
parent eb00521d9b
commit e2823d4aee
5 changed files with 83 additions and 90 deletions

17
index.c
View File

@ -21,6 +21,13 @@ typedef khash_t(idx) idxhash_t;
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
typedef struct mm_idx_bucket_s {
mm128_v a; // (minimizer, position) array
int32_t n; // size of the _p_ array
uint64_t *p; // position array for minimizers appearing >1 times
void *h; // hash table indexing _p_ and minimizers appearing once
} mm_idx_bucket_t;
void mm_idxopt_init(mm_idxopt_t *opt)
{
memset(opt, 0, sizeof(mm_idxopt_t));
@ -439,7 +446,7 @@ int mm_idx_is_idx(const char *fn)
return is_idx;
}
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt)
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
{
int is_idx;
mm_idx_reader_t *r;
@ -450,6 +457,7 @@ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt)
r->opt = *opt;
if (r->is_idx) r->fp.idx = fopen(fn, "rb");
else r->fp.seq = mm_bseq_open(fn);
if (fn_out) r->fp_out = fopen(fn_out, "wb");
return r;
}
@ -457,6 +465,8 @@ void mm_idx_reader_close(mm_idx_reader_t *r)
{
if (r->is_idx) fclose(r->fp.idx);
else mm_bseq_close(r->fp.seq);
if (r->fp_out) fclose(r->fp_out);
free(r);
}
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
@ -468,6 +478,9 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
fprintf(stderr, "[WARNING] Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\n");
} else
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.is_hpc, r->opt.mini_batch_size, n_threads, r->opt.batch_size, 1);
if (mi) ++r->n_parts;
if (mi) {
if (r->fp_out) mm_idx_dump(r->fp_out, mi);
++r->n_parts;
}
return mi;
}

19
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.1.1-r365-dirty"
#define MM_VERSION "2.1.1-r367-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -65,13 +65,12 @@ int main(int argc, char *argv[])
mm_idxopt_t ipt;
int i, c, n_threads = 3, long_idx, max_intron_len = 0;
char *fnw = 0, *rg = 0, *s;
FILE *fpw = 0, *fp_help = stderr;
FILE *fp_help = stderr;
mm_idx_reader_t *idx_rdr;
liftrlimit();
mm_realtime0 = realtime();
mm_mapopt_init(&opt);
mm_idxopt_init(&ipt);
mm_set_opt(0, &ipt, &opt);
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) {
if (c == 'w') ipt.w = atoi(optarg);
@ -139,7 +138,7 @@ int main(int argc, char *argv[])
opt.e = opt.e2 = strtol(optarg, &s, 10);
if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
} else if (c == 'x') {
if (mm_preset(optarg, &ipt, &opt) < 0) {
if (mm_set_opt(optarg, &ipt, &opt) < 0) {
fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg);
return 1;
}
@ -200,7 +199,7 @@ int main(int argc, char *argv[])
return fp_help == stdout? 0 : 1;
}
idx_rdr = mm_idx_reader_open(argv[optind], &ipt);
idx_rdr = mm_idx_reader_open(argv[optind], &ipt, fnw);
if (idx_rdr == 0) {
fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]);
return 1;
@ -209,7 +208,6 @@ int main(int argc, char *argv[])
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
return 1;
}
if (fnw) fpw = fopen(fnw, "wb");
if (opt.flag & MM_F_OUT_SAM)
mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
for (;;) {
@ -220,18 +218,13 @@ int main(int argc, char *argv[])
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
if (fpw) {
mm_idx_dump(fpw, mi);
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] dumpped the current part of the index to disk\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
}
if (argc != optind + 1) mm_mapopt_update(&opt, mi);
if (mm_verbose >= 3) mm_idx_stat(mi);
for (i = optind + 1; i < argc; ++i)
mm_map_file(mi, argv[i], &opt, n_threads);
mm_idx_destroy(mi);
}
if (fpw) fclose(fpw);
mm_idx_reader_close(idx_rdr);
fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION);
fprintf(stderr, "[M::%s] CMD:", __func__);

13
map.c
View File

@ -11,7 +11,6 @@
void mm_mapopt_init(mm_mapopt_t *opt)
{
memset(opt, 0, sizeof(mm_mapopt_t));
opt->max_occ_frac = 1e-5f;
opt->mid_occ_frac = 2e-4f;
opt->sdust_thres = 0;
@ -41,18 +40,18 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
{
if (opt->flag & MM_F_SPLICE_BOTH)
opt->flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV);
if (opt->max_occ <= 0)
opt->max_occ = mm_idx_cal_max_occ(mi, opt->max_occ_frac);
if (opt->mid_occ <= 0)
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d; max_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0),
opt->mid_occ, opt->max_occ);
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
}
int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
{
if (strcmp(preset, "ava-ont") == 0) {
if (preset == 0) {
mm_idxopt_init(io);
mm_mapopt_init(mo);
} else if (strcmp(preset, "ava-ont") == 0) {
io->is_hpc = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_AVA | MM_F_NO_SELF;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;

119
minimap.h
View File

@ -25,21 +25,16 @@
extern "C" {
#endif
typedef struct {
uint64_t x, y;
} mm128_t;
// hidden structures
struct mm_idx_bucket_s;
struct mm_bseq_file_s;
struct mm_tbuf_s;
// emulate 128-bit integers and arrays
typedef struct { uint64_t x, y; } mm128_t;
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
typedef struct { size_t n, m; uint32_t *a; } uint32_v;
typedef struct {
mm128_v a; // (minimizer, position) array
int32_t n; // size of the _p_ array
uint64_t *p; // position array for minimizers appearing >1 times
void *h; // hash table indexing _p_ and minimizers appearing once
} mm_idx_bucket_t;
// minimap2 index
typedef struct {
char *name; // name of the db sequence
uint64_t offset; // offset in mm_idx_t::S
@ -48,43 +43,50 @@ typedef struct {
typedef struct {
int32_t b, w, k, is_hpc;
uint32_t n_seq; // number of reference sequences
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
mm_idx_bucket_t *B; // index
uint32_t n_seq; // number of reference sequences
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
struct mm_idx_bucket_s *B; // index (hidden)
void *km;
} mm_idx_t;
// minimap2 alignment
typedef struct {
uint32_t capacity;
int32_t dp_score, dp_max, dp_max2;
uint32_t blen;
uint32_t n_diff;
uint32_t n_ambi:30, trans_strand:2;
uint32_t n_cigar;
uint32_t capacity; // the capacity of cigar[]
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
uint32_t blen; // block length
uint32_t n_diff; // number of differences, including ambiguous bases
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
uint32_t n_cigar; // number of cigar operations in cigar[]
uint32_t cigar[];
} mm_extra_t;
typedef struct {
int32_t id;
uint32_t cnt:31, rev:1;
uint32_t rid:31, inv:1;
int32_t score;
int32_t qs, qe, rs, re;
int32_t parent, subsc;
int32_t as;
int32_t fuzzy_mlen, fuzzy_blen;
uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // TODO: n_sub is not used for now
int32_t id; // ID for internal uses (see also parent below)
uint32_t cnt:31, rev:1; // number of minimizers; if on the reverse strand
uint32_t rid:31, inv:1; // reference index; if this is an alignment from inversion rescue
int32_t score; // DP alignment score
int32_t qs, qe, rs, re; // query start and end; reference start and end
int32_t parent, subsc; // parent==id if primary; best alternate mapping score
int32_t as; // offset in the a[] array (for internal uses only)
int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate)
uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // mapQ; split pattern; if SAM primary; number of suboptimal mappings
mm_extra_t *p;
} mm_reg1_t;
// indexing and mapping options
typedef struct {
float max_occ_frac;
float mid_occ_frac;
int sdust_thres; // score threshold for SDUST; 0 to disable
int flag; // see MM_F_* macros
short k, w, is_hpc, bucket_bits;
int mini_batch_size;
uint64_t batch_size;
} mm_idxopt_t;
int bw; // bandwidth
typedef struct {
float mid_occ_frac;
int sdust_thres; // score threshold for SDUST; 0 to disable
int flag; // see MM_F_* macros
int bw; // bandwidth
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
int max_chain_skip;
int min_cnt;
@ -104,63 +106,44 @@ typedef struct {
int min_ksw_len;
int mini_batch_size;
int32_t max_occ, mid_occ;
int32_t mid_occ;
} mm_mapopt_t;
typedef struct {
short k, w, is_hpc, bucket_bits;
int mini_batch_size;
uint64_t batch_size;
} mm_idxopt_t;
struct mm_bseq_file_s;
// index reader
typedef struct {
int is_idx, n_parts;
mm_idxopt_t opt;
FILE *fp_out;
union {
struct mm_bseq_file_s *seq;
FILE *idx;
} fp;
} mm_idx_reader_t;
extern int mm_verbose, mm_dbg_flag;
extern double mm_realtime0;
struct mm_tbuf_s;
// memory buffer for thread-local storage during mapping
typedef struct mm_tbuf_s mm_tbuf_t;
#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2))
#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf)
// global variables
extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag
extern double mm_realtime0; // wall-clock timer
void mm_idxopt_init(mm_idxopt_t *opt);
void mm_mapopt_init(mm_mapopt_t *opt);
int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt);
int mm_idx_reader_is_idx(const mm_idx_reader_t *r);
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out);
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
void mm_idx_reader_close(mm_idx_reader_t *r);
void mm_idx_stat(const mm_idx_t *idx);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
void mm_idx_destroy(mm_idx_t *mi);
// minimizer indexing
void mm_idx_stat(const mm_idx_t *idx);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
// minimizer index I/O
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
mm_idx_t *mm_idx_load(FILE *fp);
// mapping
mm_tbuf_t *mm_tbuf_init(void);
void mm_tbuf_destroy(mm_tbuf_t *b);
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);
// obsolete APIs (for backward compatibility)
// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads);
#ifdef __cplusplus

View File

@ -21,6 +21,9 @@
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2))
#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf)
#ifdef __cplusplus
extern "C" {
#endif
@ -47,6 +50,8 @@ void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *arg
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
void mm_idxopt_init(mm_idxopt_t *opt);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);