r367: index reader optionally writes index
This commit is contained in:
parent
eb00521d9b
commit
e2823d4aee
17
index.c
17
index.c
|
|
@ -21,6 +21,13 @@ typedef khash_t(idx) idxhash_t;
|
|||
|
||||
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
|
||||
|
||||
typedef struct mm_idx_bucket_s {
|
||||
mm128_v a; // (minimizer, position) array
|
||||
int32_t n; // size of the _p_ array
|
||||
uint64_t *p; // position array for minimizers appearing >1 times
|
||||
void *h; // hash table indexing _p_ and minimizers appearing once
|
||||
} mm_idx_bucket_t;
|
||||
|
||||
void mm_idxopt_init(mm_idxopt_t *opt)
|
||||
{
|
||||
memset(opt, 0, sizeof(mm_idxopt_t));
|
||||
|
|
@ -439,7 +446,7 @@ int mm_idx_is_idx(const char *fn)
|
|||
return is_idx;
|
||||
}
|
||||
|
||||
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt)
|
||||
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
|
||||
{
|
||||
int is_idx;
|
||||
mm_idx_reader_t *r;
|
||||
|
|
@ -450,6 +457,7 @@ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt)
|
|||
r->opt = *opt;
|
||||
if (r->is_idx) r->fp.idx = fopen(fn, "rb");
|
||||
else r->fp.seq = mm_bseq_open(fn);
|
||||
if (fn_out) r->fp_out = fopen(fn_out, "wb");
|
||||
return r;
|
||||
}
|
||||
|
||||
|
|
@ -457,6 +465,8 @@ void mm_idx_reader_close(mm_idx_reader_t *r)
|
|||
{
|
||||
if (r->is_idx) fclose(r->fp.idx);
|
||||
else mm_bseq_close(r->fp.seq);
|
||||
if (r->fp_out) fclose(r->fp_out);
|
||||
free(r);
|
||||
}
|
||||
|
||||
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
|
||||
|
|
@ -468,6 +478,9 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
|
|||
fprintf(stderr, "[WARNING] Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\n");
|
||||
} else
|
||||
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.is_hpc, r->opt.mini_batch_size, n_threads, r->opt.batch_size, 1);
|
||||
if (mi) ++r->n_parts;
|
||||
if (mi) {
|
||||
if (r->fp_out) mm_idx_dump(r->fp_out, mi);
|
||||
++r->n_parts;
|
||||
}
|
||||
return mi;
|
||||
}
|
||||
|
|
|
|||
19
main.c
19
main.c
|
|
@ -6,7 +6,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "getopt.h"
|
||||
|
||||
#define MM_VERSION "2.1.1-r365-dirty"
|
||||
#define MM_VERSION "2.1.1-r367-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -65,13 +65,12 @@ int main(int argc, char *argv[])
|
|||
mm_idxopt_t ipt;
|
||||
int i, c, n_threads = 3, long_idx, max_intron_len = 0;
|
||||
char *fnw = 0, *rg = 0, *s;
|
||||
FILE *fpw = 0, *fp_help = stderr;
|
||||
FILE *fp_help = stderr;
|
||||
mm_idx_reader_t *idx_rdr;
|
||||
|
||||
liftrlimit();
|
||||
mm_realtime0 = realtime();
|
||||
mm_mapopt_init(&opt);
|
||||
mm_idxopt_init(&ipt);
|
||||
mm_set_opt(0, &ipt, &opt);
|
||||
|
||||
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) {
|
||||
if (c == 'w') ipt.w = atoi(optarg);
|
||||
|
|
@ -139,7 +138,7 @@ int main(int argc, char *argv[])
|
|||
opt.e = opt.e2 = strtol(optarg, &s, 10);
|
||||
if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
|
||||
} else if (c == 'x') {
|
||||
if (mm_preset(optarg, &ipt, &opt) < 0) {
|
||||
if (mm_set_opt(optarg, &ipt, &opt) < 0) {
|
||||
fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg);
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -200,7 +199,7 @@ int main(int argc, char *argv[])
|
|||
return fp_help == stdout? 0 : 1;
|
||||
}
|
||||
|
||||
idx_rdr = mm_idx_reader_open(argv[optind], &ipt);
|
||||
idx_rdr = mm_idx_reader_open(argv[optind], &ipt, fnw);
|
||||
if (idx_rdr == 0) {
|
||||
fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]);
|
||||
return 1;
|
||||
|
|
@ -209,7 +208,6 @@ int main(int argc, char *argv[])
|
|||
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
|
||||
return 1;
|
||||
}
|
||||
if (fnw) fpw = fopen(fnw, "wb");
|
||||
if (opt.flag & MM_F_OUT_SAM)
|
||||
mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
|
||||
for (;;) {
|
||||
|
|
@ -220,18 +218,13 @@ int main(int argc, char *argv[])
|
|||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
|
||||
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
|
||||
if (fpw) {
|
||||
mm_idx_dump(fpw, mi);
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] dumpped the current part of the index to disk\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
|
||||
}
|
||||
if (argc != optind + 1) mm_mapopt_update(&opt, mi);
|
||||
if (mm_verbose >= 3) mm_idx_stat(mi);
|
||||
for (i = optind + 1; i < argc; ++i)
|
||||
mm_map_file(mi, argv[i], &opt, n_threads);
|
||||
mm_idx_destroy(mi);
|
||||
}
|
||||
if (fpw) fclose(fpw);
|
||||
mm_idx_reader_close(idx_rdr);
|
||||
|
||||
fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION);
|
||||
fprintf(stderr, "[M::%s] CMD:", __func__);
|
||||
|
|
|
|||
13
map.c
13
map.c
|
|
@ -11,7 +11,6 @@
|
|||
void mm_mapopt_init(mm_mapopt_t *opt)
|
||||
{
|
||||
memset(opt, 0, sizeof(mm_mapopt_t));
|
||||
opt->max_occ_frac = 1e-5f;
|
||||
opt->mid_occ_frac = 2e-4f;
|
||||
opt->sdust_thres = 0;
|
||||
|
||||
|
|
@ -41,18 +40,18 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
|
|||
{
|
||||
if (opt->flag & MM_F_SPLICE_BOTH)
|
||||
opt->flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV);
|
||||
if (opt->max_occ <= 0)
|
||||
opt->max_occ = mm_idx_cal_max_occ(mi, opt->max_occ_frac);
|
||||
if (opt->mid_occ <= 0)
|
||||
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d; max_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0),
|
||||
opt->mid_occ, opt->max_occ);
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
|
||||
}
|
||||
|
||||
int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
||||
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
||||
{
|
||||
if (strcmp(preset, "ava-ont") == 0) {
|
||||
if (preset == 0) {
|
||||
mm_idxopt_init(io);
|
||||
mm_mapopt_init(mo);
|
||||
} else if (strcmp(preset, "ava-ont") == 0) {
|
||||
io->is_hpc = 0, io->k = 15, io->w = 5;
|
||||
mo->flag |= MM_F_AVA | MM_F_NO_SELF;
|
||||
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
|
||||
|
|
|
|||
119
minimap.h
119
minimap.h
|
|
@ -25,21 +25,16 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint64_t x, y;
|
||||
} mm128_t;
|
||||
// hidden structures
|
||||
struct mm_idx_bucket_s;
|
||||
struct mm_bseq_file_s;
|
||||
struct mm_tbuf_s;
|
||||
|
||||
// emulate 128-bit integers and arrays
|
||||
typedef struct { uint64_t x, y; } mm128_t;
|
||||
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
|
||||
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
|
||||
typedef struct { size_t n, m; uint32_t *a; } uint32_v;
|
||||
|
||||
typedef struct {
|
||||
mm128_v a; // (minimizer, position) array
|
||||
int32_t n; // size of the _p_ array
|
||||
uint64_t *p; // position array for minimizers appearing >1 times
|
||||
void *h; // hash table indexing _p_ and minimizers appearing once
|
||||
} mm_idx_bucket_t;
|
||||
|
||||
// minimap2 index
|
||||
typedef struct {
|
||||
char *name; // name of the db sequence
|
||||
uint64_t offset; // offset in mm_idx_t::S
|
||||
|
|
@ -48,43 +43,50 @@ typedef struct {
|
|||
|
||||
typedef struct {
|
||||
int32_t b, w, k, is_hpc;
|
||||
uint32_t n_seq; // number of reference sequences
|
||||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
uint32_t *S; // 4-bit packed sequence
|
||||
mm_idx_bucket_t *B; // index
|
||||
uint32_t n_seq; // number of reference sequences
|
||||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
uint32_t *S; // 4-bit packed sequence
|
||||
struct mm_idx_bucket_s *B; // index (hidden)
|
||||
void *km;
|
||||
} mm_idx_t;
|
||||
|
||||
// minimap2 alignment
|
||||
typedef struct {
|
||||
uint32_t capacity;
|
||||
int32_t dp_score, dp_max, dp_max2;
|
||||
uint32_t blen;
|
||||
uint32_t n_diff;
|
||||
uint32_t n_ambi:30, trans_strand:2;
|
||||
uint32_t n_cigar;
|
||||
uint32_t capacity; // the capacity of cigar[]
|
||||
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
|
||||
uint32_t blen; // block length
|
||||
uint32_t n_diff; // number of differences, including ambiguous bases
|
||||
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
|
||||
uint32_t n_cigar; // number of cigar operations in cigar[]
|
||||
uint32_t cigar[];
|
||||
} mm_extra_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t id;
|
||||
uint32_t cnt:31, rev:1;
|
||||
uint32_t rid:31, inv:1;
|
||||
int32_t score;
|
||||
int32_t qs, qe, rs, re;
|
||||
int32_t parent, subsc;
|
||||
int32_t as;
|
||||
int32_t fuzzy_mlen, fuzzy_blen;
|
||||
uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // TODO: n_sub is not used for now
|
||||
int32_t id; // ID for internal uses (see also parent below)
|
||||
uint32_t cnt:31, rev:1; // number of minimizers; if on the reverse strand
|
||||
uint32_t rid:31, inv:1; // reference index; if this is an alignment from inversion rescue
|
||||
int32_t score; // DP alignment score
|
||||
int32_t qs, qe, rs, re; // query start and end; reference start and end
|
||||
int32_t parent, subsc; // parent==id if primary; best alternate mapping score
|
||||
int32_t as; // offset in the a[] array (for internal uses only)
|
||||
int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate)
|
||||
uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // mapQ; split pattern; if SAM primary; number of suboptimal mappings
|
||||
mm_extra_t *p;
|
||||
} mm_reg1_t;
|
||||
|
||||
// indexing and mapping options
|
||||
typedef struct {
|
||||
float max_occ_frac;
|
||||
float mid_occ_frac;
|
||||
int sdust_thres; // score threshold for SDUST; 0 to disable
|
||||
int flag; // see MM_F_* macros
|
||||
short k, w, is_hpc, bucket_bits;
|
||||
int mini_batch_size;
|
||||
uint64_t batch_size;
|
||||
} mm_idxopt_t;
|
||||
|
||||
int bw; // bandwidth
|
||||
typedef struct {
|
||||
float mid_occ_frac;
|
||||
int sdust_thres; // score threshold for SDUST; 0 to disable
|
||||
int flag; // see MM_F_* macros
|
||||
|
||||
int bw; // bandwidth
|
||||
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
|
||||
int max_chain_skip;
|
||||
int min_cnt;
|
||||
|
|
@ -104,63 +106,44 @@ typedef struct {
|
|||
int min_ksw_len;
|
||||
|
||||
int mini_batch_size;
|
||||
int32_t max_occ, mid_occ;
|
||||
int32_t mid_occ;
|
||||
} mm_mapopt_t;
|
||||
|
||||
typedef struct {
|
||||
short k, w, is_hpc, bucket_bits;
|
||||
int mini_batch_size;
|
||||
uint64_t batch_size;
|
||||
} mm_idxopt_t;
|
||||
|
||||
struct mm_bseq_file_s;
|
||||
|
||||
// index reader
|
||||
typedef struct {
|
||||
int is_idx, n_parts;
|
||||
mm_idxopt_t opt;
|
||||
FILE *fp_out;
|
||||
union {
|
||||
struct mm_bseq_file_s *seq;
|
||||
FILE *idx;
|
||||
} fp;
|
||||
} mm_idx_reader_t;
|
||||
|
||||
extern int mm_verbose, mm_dbg_flag;
|
||||
extern double mm_realtime0;
|
||||
|
||||
struct mm_tbuf_s;
|
||||
// memory buffer for thread-local storage during mapping
|
||||
typedef struct mm_tbuf_s mm_tbuf_t;
|
||||
|
||||
#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2))
|
||||
#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf)
|
||||
// global variables
|
||||
extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag
|
||||
extern double mm_realtime0; // wall-clock timer
|
||||
|
||||
void mm_idxopt_init(mm_idxopt_t *opt);
|
||||
void mm_mapopt_init(mm_mapopt_t *opt);
|
||||
int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
|
||||
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
|
||||
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
|
||||
|
||||
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt);
|
||||
int mm_idx_reader_is_idx(const mm_idx_reader_t *r);
|
||||
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out);
|
||||
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
|
||||
void mm_idx_reader_close(mm_idx_reader_t *r);
|
||||
void mm_idx_stat(const mm_idx_t *idx);
|
||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
||||
void mm_idx_destroy(mm_idx_t *mi);
|
||||
|
||||
// minimizer indexing
|
||||
void mm_idx_stat(const mm_idx_t *idx);
|
||||
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
|
||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
||||
|
||||
// minimizer index I/O
|
||||
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
|
||||
mm_idx_t *mm_idx_load(FILE *fp);
|
||||
|
||||
// mapping
|
||||
mm_tbuf_t *mm_tbuf_init(void);
|
||||
void mm_tbuf_destroy(mm_tbuf_t *b);
|
||||
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
|
||||
|
||||
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);
|
||||
|
||||
// obsolete APIs (for backward compatibility)
|
||||
// deprecated APIs for backward compatibility
|
||||
void mm_mapopt_init(mm_mapopt_t *opt);
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
|||
5
mmpriv.h
5
mmpriv.h
|
|
@ -21,6 +21,9 @@
|
|||
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
#endif
|
||||
|
||||
#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2))
|
||||
#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
@ -47,6 +50,8 @@ void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *arg
|
|||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
|
||||
|
||||
void mm_idxopt_init(mm_idxopt_t *opt);
|
||||
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
|
||||
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
|
||||
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
|
||||
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
||||
|
|
|
|||
Loading…
Reference in New Issue