Merge branch 'split'

This commit is contained in:
Heng Li 2018-07-15 22:24:15 -04:00
commit 0e137670fc
11 changed files with 329 additions and 57 deletions

View File

@ -1,7 +1,7 @@
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
CPPFLAGS= -DHAVE_KALLOC
INCLUDES=
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o ksw2_ll_sse.o
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o
PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread

View File

@ -199,12 +199,12 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) //
mm_extra_t *p;
if (n_cigar == 0) return;
if (r->p == 0) {
uint32_t capacity = n_cigar + sizeof(mm_extra_t); // TODO: should this be "n_cigar + sizeof(mm_extra_t)/4" instead?
uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4;
kroundup32(capacity);
r->p = (mm_extra_t*)calloc(capacity, 4);
r->p->capacity = capacity;
} else if (r->p->n_cigar + n_cigar + sizeof(mm_extra_t) > r->p->capacity) {
r->p->capacity = r->p->n_cigar + n_cigar + sizeof(mm_extra_t);
} else if (r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4 > r->p->capacity) {
r->p->capacity = r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4;
kroundup32(r->p->capacity);
r->p = (mm_extra_t*)realloc(r->p, r->p->capacity * 4);
}
@ -878,6 +878,6 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
kfree(km, qseq0[0]);
kfree(km, ez.cigar);
mm_filter_regs(opt, qlen, n_regs_, regs);
mm_hit_sort_by_dp(km, n_regs_, regs);
mm_hit_sort(km, n_regs_, regs);
return regs;
}

14
hit.c
View File

@ -164,9 +164,9 @@ set_parent_test:
kfree(km, w);
}
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r)
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r)
{
int32_t i, n_aux, n = *n_regs;
int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0;
mm128_t *aux;
mm_reg1_t *t;
@ -175,14 +175,20 @@ void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r)
t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
for (i = n_aux = 0; i < n; ++i) {
if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
assert(r[i].p);
aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash;
if (r[i].p) {
aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash;
has_cigar = 1;
} else {
aux[n_aux].x = (uint64_t)r[i].score << 32 | r[i].hash;
no_cigar = 1;
}
aux[n_aux++].y = i;
} else if (r[i].p) {
free(r[i].p);
r[i].p = 0;
}
}
assert(has_cigar + no_cigar == 1);
radix_sort_128x(aux, aux + n_aux);
for (i = n_aux - 1; i >= 0; --i)
t[n_aux - 1 - i] = r[aux[i].y];

12
index.c
View File

@ -48,10 +48,12 @@ void mm_idx_destroy(mm_idx_t *mi)
uint32_t i;
if (mi == 0) return;
if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
for (i = 0; i < 1U<<mi->b; ++i) {
free(mi->B[i].p);
free(mi->B[i].a.a);
kh_destroy(idx, (idxhash_t*)mi->B[i].h);
if (mi->B) {
for (i = 0; i < 1U<<mi->b; ++i) {
free(mi->B[i].p);
free(mi->B[i].a.a);
kh_destroy(idx, (idxhash_t*)mi->B[i].h);
}
}
if (!mi->km) {
for (i = 0; i < mi->n_seq; ++i)
@ -563,7 +565,7 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
if (mi) {
if (r->fp_out) mm_idx_dump(r->fp_out, mi);
++r->n_parts;
mi->index = r->n_parts++;
}
return mi;
}

14
main.c
View File

@ -10,7 +10,7 @@
#include "getopt.h"
#endif
#define MM_VERSION "2.11-r797"
#define MM_VERSION "2.11-r815-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -61,6 +61,7 @@ static struct option long_options[] = {
{ "score-N", required_argument, 0, 0 }, // 31
{ "eqx", no_argument, 0, 0 }, // 32
{ "paf-no-hit", no_argument, 0, 0 }, // 33
{ "split-prefix", required_argument, 0, 0 }, // 34
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
@ -101,7 +102,7 @@ int main(int argc, char *argv[])
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yY";
mm_mapopt_t opt;
mm_idxopt_t ipt;
int i, c, n_threads = 3, long_idx;
int i, c, n_threads = 3, n_parts, long_idx;
char *fnw = 0, *rg = 0, *s;
FILE *fp_help = stderr;
mm_idx_reader_t *idx_rdr;
@ -181,6 +182,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx ==31) opt.sc_ambi = atoi(optarg); // --score-N
else if (c == 0 && long_idx ==32) opt.flag |= MM_F_EQX; // --eqx
else if (c == 0 && long_idx ==33) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit
else if (c == 0 && long_idx ==34) opt.split_prefix = optarg; // --split-prefix
else if (c == 0 && long_idx == 14) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1);
} else if (c == 0 && long_idx == 15) { // --secondary
@ -327,8 +329,8 @@ int main(int argc, char *argv[])
mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv);
} else {
mm_write_sam_hdr(0, rg, MM_VERSION, argc, argv);
if (mm_verbose >= 2)
fprintf(stderr, "[WARNING]\033[1;31m For a multi-part index, no @SQ lines will be outputted.\033[0m\n");
if (opt.split_prefix == 0 && mm_verbose >= 2)
fprintf(stderr, "[WARNING]\033[1;31m For a multi-part index, no @SQ lines will be outputted. Please use --split-prefix.\033[0m\n");
}
}
if (mm_verbose >= 3)
@ -344,8 +346,12 @@ int main(int argc, char *argv[])
}
mm_idx_destroy(mi);
}
n_parts = idx_rdr->n_parts;
mm_idx_reader_close(idx_rdr);
if (opt.split_prefix)
mm_split_merge(argc - (optind + 1), (const char**)&argv[optind + 1], &opt, n_parts);
if (fflush(stdout) == EOF) {
fprintf(stderr, "[ERROR] failed to write the results\n");
exit(EXIT_FAILURE);

216
map.c
View File

@ -11,6 +11,7 @@
struct mm_tbuf_s {
void *km;
int rep_len, frag_gap;
};
mm_tbuf_t *mm_tbuf_init(void)
@ -330,6 +331,8 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
}
}
b->frag_gap = max_chain_gap_ref;
b->rep_len = rep_len;
regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a);
@ -394,13 +397,17 @@ typedef struct {
mm_bseq_file_t **fp;
const mm_idx_t *mi;
kstring_t str;
int n_parts;
uint32_t *rid_shift;
FILE *fp_split, **fp_parts;
} pipeline_t;
typedef struct {
const pipeline_t *p;
int n_seq, n_frag;
mm_bseq1_t *seq;
int *n_reg, *seg_off, *n_seg;
int *n_reg, *seg_off, *n_seg, *rep_len, *frag_gap;
mm_reg1_t **reg;
mm_tbuf_t **buf;
} step_t;
@ -421,10 +428,17 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
qseqs[j] = s->seq[off + j].seq;
}
if (s->p->opt->flag & MM_F_INDEPEND_SEG) {
for (j = 0; j < s->n_seg[i]; ++j)
for (j = 0; j < s->n_seg[i]; ++j) {
mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name);
s->rep_len[off + j] = b->rep_len;
s->frag_gap[off + j] = b->frag_gap;
}
} else {
mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name);
for (j = 0; j < s->n_seg[i]; ++j) {
s->rep_len[off + j] = b->rep_len;
s->frag_gap[off + j] = b->frag_gap;
}
}
for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand
if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) {
@ -440,6 +454,63 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
}
}
static void merge_hits(step_t *s)
{
int f, i, k0, k, max_seg = 0, *n_reg_part, *rep_len_part, *frag_gap_part, *qlens;
void *km;
FILE **fp = s->p->fp_parts;
const mm_mapopt_t *opt = s->p->opt;
km = km_init();
for (f = 0; f < s->n_frag; ++f)
max_seg = max_seg > s->n_seg[f]? max_seg : s->n_seg[f];
qlens = CALLOC(int, max_seg + s->p->n_parts * 3);
n_reg_part = qlens + max_seg;
rep_len_part = n_reg_part + s->p->n_parts;
frag_gap_part = rep_len_part + s->p->n_parts;
for (f = 0, k = k0 = 0; f < s->n_frag; ++f) {
k0 = k;
for (i = 0; i < s->n_seg[f]; ++i, ++k) {
int j, l, t, rep_len = 0;
qlens[i] = s->seq[k].l_seq;
for (j = 0, s->n_reg[k] = 0; j < s->p->n_parts; ++j) {
mm_err_fread(&n_reg_part[j], sizeof(int), 1, fp[j]);
mm_err_fread(&rep_len_part[j], sizeof(int), 1, fp[j]);
mm_err_fread(&frag_gap_part[j], sizeof(int), 1, fp[j]);
s->n_reg[k] += n_reg_part[j];
if (rep_len < rep_len_part[j])
rep_len = rep_len_part[j];
}
s->reg[k] = CALLOC(mm_reg1_t, s->n_reg[k]);
for (j = 0, l = 0; j < s->p->n_parts; ++j) {
for (t = 0; t < n_reg_part[j]; ++t, ++l) {
mm_reg1_t *r = &s->reg[k][l];
uint32_t capacity;
mm_err_fread(r, sizeof(mm_reg1_t), 1, fp[j]);
r->rid += s->p->rid_shift[j];
if (opt->flag & MM_F_CIGAR) {
mm_err_fread(&capacity, 4, 1, fp[j]);
r->p = (mm_extra_t*)calloc(capacity, 4);
r->p->capacity = capacity;
mm_err_fread(r->p, r->p->capacity, 4, fp[j]);
}
}
}
mm_hit_sort(km, &s->n_reg[k], s->reg[k]);
mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b);
if (!(opt->flag & MM_F_ALL_CHAINS)) {
mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]);
mm_set_sam_pri(s->n_reg[k], s->reg[k]);
}
mm_set_mapq(km, s->n_reg[k], s->reg[k], opt->min_chain_score, opt->a, rep_len, !!(opt->flag & MM_F_SR));
}
if (s->n_seg[f] == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR))
mm_pair(km, frag_gap_part[0], opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, &s->n_reg[k0], &s->reg[k0]);
}
free(qlens);
km_destroy(km);
}
static void *worker_pipeline(void *shared, int step, void *in)
{
int i, j, k;
@ -459,9 +530,11 @@ static void *worker_pipeline(void *shared, int step, void *in)
s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*));
for (i = 0; i < p->n_threads; ++i)
s->buf[i] = mm_tbuf_init();
s->n_reg = (int*)calloc(3 * s->n_seq, sizeof(int));
s->seg_off = s->n_reg + s->n_seq; // seg_off and n_seg are allocated together with n_reg
s->n_reg = (int*)calloc(5 * s->n_seq, sizeof(int));
s->seg_off = s->n_reg + s->n_seq; // seg_off, n_seg, rep_len and frag_gap are allocated together with n_reg
s->n_seg = s->seg_off + s->n_seq;
s->rep_len = s->n_seg + s->n_seq;
s->frag_gap = s->rep_len + s->n_seq;
s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t*));
for (i = 1, j = 0; i <= s->n_seq; ++i)
if (i == s->n_seq || !frag_mode || !mm_qname_same(s->seq[i-1].name, s->seq[i].name)) {
@ -472,7 +545,8 @@ static void *worker_pipeline(void *shared, int step, void *in)
return s;
} else free(s);
} else if (step == 1) { // step 1: map
kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag);
if (p->n_parts > 0) merge_hits((step_t*)in);
else kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag);
return in;
} else if (step == 2) { // step 2: output
void *km = 0;
@ -485,25 +559,36 @@ static void *worker_pipeline(void *shared, int step, void *in)
int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k];
for (i = seg_st; i < seg_en; ++i) {
mm_bseq1_t *t = &s->seq[i];
for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j];
assert(!r->sam_pri || r->id == r->parent);
if ((p->opt->flag & MM_F_NO_PRINT_2ND) && r->id != r->parent)
continue;
if (p->opt->flag & MM_F_OUT_SAM)
mm_write_sam2(&p->str, mi, t, i - seg_st, j, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag);
else
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
mm_err_puts(p->str.s);
}
if (s->n_reg[i] == 0) {
if (p->opt->flag & MM_F_OUT_SAM) {
mm_write_sam2(&p->str, mi, t, i - seg_st, -1, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag);
mm_err_puts(p->str.s);
} else if (p->opt->flag & MM_F_PAF_NO_HIT) {
mm_write_paf(&p->str, mi, t, 0, 0, p->opt->flag);
if (p->opt->split_prefix && p->n_parts == 0) { // then write to temporary files
mm_err_fwrite(&s->n_reg[i], sizeof(int), 1, p->fp_split);
mm_err_fwrite(&s->rep_len[i], sizeof(int), 1, p->fp_split);
mm_err_fwrite(&s->frag_gap[i], sizeof(int), 1, p->fp_split);
for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j];
mm_err_fwrite(r, sizeof(mm_reg1_t), 1, p->fp_split);
if (p->opt->flag & MM_F_CIGAR) {
mm_err_fwrite(&r->p->capacity, 4, 1, p->fp_split);
mm_err_fwrite(r->p, r->p->capacity, 4, p->fp_split);
}
}
} else if (s->n_reg[i] > 0) { // the query has at least one hit
for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j];
assert(!r->sam_pri || r->id == r->parent);
if ((p->opt->flag & MM_F_NO_PRINT_2ND) && r->id != r->parent)
continue;
if (p->opt->flag & MM_F_OUT_SAM)
mm_write_sam2(&p->str, mi, t, i - seg_st, j, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag);
else
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
mm_err_puts(p->str.s);
}
} else if (p->opt->flag & (MM_F_OUT_SAM|MM_F_PAF_NO_HIT)) { // output an empty hit, if requested
if (p->opt->flag & MM_F_OUT_SAM)
mm_write_sam2(&p->str, mi, t, i - seg_st, -1, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag);
else
mm_write_paf(&p->str, mi, t, 0, 0, p->opt->flag);
mm_err_puts(p->str.s);
}
}
for (i = seg_st; i < seg_en; ++i) {
@ -513,7 +598,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
if (s->seq[i].qual) free(s->seq[i].qual);
}
}
free(s->reg); free(s->n_reg); free(s->seq); // seg_off and n_seg were allocated with reg; no memory leak here
free(s->reg); free(s->n_reg); free(s->seq); // seg_off, n_seg, rep_len and frag_gap were allocated with reg; no memory leak here
km_destroy(km);
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq);
@ -522,32 +607,44 @@ static void *worker_pipeline(void *shared, int step, void *in)
return 0;
}
static mm_bseq_file_t **open_bseqs(int n, const char **fn)
{
mm_bseq_file_t **fp;
int i, j;
fp = (mm_bseq_file_t**)calloc(n, sizeof(mm_bseq_file_t*));
for (i = 0; i < n; ++i) {
if ((fp[i] = mm_bseq_open(fn[i])) == 0) {
if (mm_verbose >= 1)
fprintf(stderr, "ERROR: failed to open file '%s'\n", fn[i]);
for (j = 0; j < i; ++j)
mm_bseq_close(fp[j]);
free(fp);
return 0;
}
}
return fp;
}
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads)
{
int i, j, pl_threads;
int i, pl_threads;
pipeline_t pl;
if (n_segs < 1) return -1;
memset(&pl, 0, sizeof(pipeline_t));
pl.n_fp = n_segs;
pl.fp = (mm_bseq_file_t**)calloc(n_segs, sizeof(mm_bseq_file_t*));
for (i = 0; i < n_segs; ++i) {
pl.fp[i] = mm_bseq_open(fn[i]);
if (pl.fp[i] == 0) {
if (mm_verbose >= 1)
fprintf(stderr, "ERROR: failed to open file '%s'\n", fn[i]);
for (j = 0; j < i; ++j)
mm_bseq_close(pl.fp[j]);
free(pl.fp);
return -1;
}
}
pl.fp = open_bseqs(pl.n_fp, fn);
if (pl.fp == 0) return -1;
pl.opt = opt, pl.mi = idx;
pl.n_threads = n_threads > 1? n_threads : 1;
pl.mini_batch_size = opt->mini_batch_size;
if (opt->split_prefix)
pl.fp_split = mm_split_init(opt->split_prefix, idx);
pl_threads = n_threads == 1? 1 : (opt->flag&MM_F_2_IO_THREADS)? 3 : 2;
kt_pipeline(pl_threads, worker_pipeline, &pl, 3);
free(pl.str.s);
for (i = 0; i < n_segs; ++i)
if (pl.fp_split) fclose(pl.fp_split);
for (i = 0; i < pl.n_fp; ++i)
mm_bseq_close(pl.fp[i]);
free(pl.fp);
return 0;
@ -557,3 +654,48 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
{
return mm_map_file_frag(idx, 1, &fn, opt, n_threads);
}
int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx)
{
int i;
pipeline_t pl;
mm_idx_t *mi;
if (n_segs < 1 || n_split_idx < 1) return -1;
memset(&pl, 0, sizeof(pipeline_t));
pl.n_fp = n_segs;
pl.fp = open_bseqs(pl.n_fp, fn);
if (pl.fp == 0) return -1;
pl.opt = opt;
pl.mini_batch_size = opt->mini_batch_size;
pl.n_parts = n_split_idx;
pl.fp_parts = CALLOC(FILE*, pl.n_parts);
pl.rid_shift = CALLOC(uint32_t, pl.n_parts);
pl.mi = mi = mm_split_merge_prep(opt->split_prefix, n_split_idx, pl.fp_parts, pl.rid_shift);
if (pl.mi == 0) {
free(pl.fp_parts);
free(pl.rid_shift);
return -1;
}
for (i = n_split_idx - 1; i > 0; --i)
pl.rid_shift[i] = pl.rid_shift[i - 1];
for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i)
pl.rid_shift[i] += pl.rid_shift[i - 1];
if (opt->flag & MM_F_OUT_SAM)
for (i = 0; i < pl.mi->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", pl.mi->seq[i].name, pl.mi->seq[i].len);
kt_pipeline(2, worker_pipeline, &pl, 3);
free(pl.str.s);
mm_idx_destroy(mi);
free(pl.rid_shift);
for (i = 0; i < n_split_idx; ++i)
fclose(pl.fp_parts[i]);
free(pl.fp_parts);
for (i = 0; i < pl.n_fp; ++i)
mm_bseq_close(pl.fp[i]);
free(pl.fp);
mm_split_rm_tmp(opt->split_prefix, n_split_idx);
return 0;
}

View File

@ -60,6 +60,7 @@ typedef struct {
typedef struct {
int32_t b, w, k, flag;
uint32_t n_seq; // number of reference sequences
int32_t index;
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
struct mm_idx_bucket_s *B; // index (hidden)
@ -136,6 +137,8 @@ typedef struct {
int32_t mid_occ; // ignore seeds with occurrences above this threshold
int32_t max_occ;
int mini_batch_size; // size of a batch of query bases to process in parallel
const char *split_prefix;
} mm_mapopt_t;
// index reader

View File

@ -239,7 +239,7 @@ Disable the long gap patching heuristic. When this option is applied, the
maximum alignment gap is mostly controlled by
.BR -r .
.TP
.B --lj-min-ratio \ FLOAT
.BI --lj-min-ratio \ FLOAT
Fraction of query sequence length required to bridge a long gap [0.5]. A
smaller value helps to recover longer gaps, at the cost of more false gaps.
.TP
@ -252,6 +252,9 @@ applies a second round of chaining with a higher minimizer occurrence threshold
if no good chain is found. In addition, minimap2 attempts to patch gaps between
seeds with ungapped alignment.
.TP
.BI --split-prefix \ STR
Prefix to create temporary files. Typically used for a multi-part index.
.TP
.BR --frag = no | yes
Whether to enable the fragment mode [no]
.TP

20
misc.c
View File

@ -131,6 +131,26 @@ void mm_err_puts(const char *str)
}
}
void mm_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp)
{
int ret;
ret = fwrite(p, size, nitems, fp);
if (ret == EOF) {
fprintf(stderr, "[ERROR] failed to write data\n");
exit(EXIT_FAILURE);
}
}
void mm_err_fread(void *p, size_t size, size_t nitems, FILE *fp)
{
int ret;
ret = fread(p, size, nitems, fp);
if (ret == EOF) {
fprintf(stderr, "[ERROR] failed to read data\n");
exit(EXIT_FAILURE);
}
}
#include "ksort.h"
#define sort_key_128x(a) ((a).x)

View File

@ -28,6 +28,9 @@
#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2))
#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf)
#define MALLOC(type, len) ((type*)malloc((len) * sizeof(type)))
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
#ifdef __cplusplus
extern "C" {
#endif
@ -77,7 +80,7 @@ void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_,
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r);
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r);
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
@ -86,7 +89,14 @@ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int
void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);
FILE *mm_split_init(const char *prefix, const mm_idx_t *mi);
mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part);
int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx);
void mm_split_rm_tmp(const char *prefix, int n_splits);
void mm_err_puts(const char *str);
void mm_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp);
void mm_err_fread(void *p, size_t size, size_t nitems, FILE *fp);
#ifdef __cplusplus
}

80
splitidx.c 100644
View File

@ -0,0 +1,80 @@
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include "mmpriv.h"
FILE *mm_split_init(const char *prefix, const mm_idx_t *mi)
{
char *fn;
FILE *fp;
uint32_t i, k = mi->k;
fn = (char*)calloc(strlen(prefix) + 10, 1);
sprintf(fn, "%s.%.4d.tmp", prefix, mi->index);
fp = fopen(fn, "wb");
assert(fp);
mm_err_fwrite(&k, 4, 1, fp);
mm_err_fwrite(&mi->n_seq, 4, 1, fp);
for (i = 0; i < mi->n_seq; ++i) {
uint8_t l;
l = strlen(mi->seq[i].name);
mm_err_fwrite(&l, 1, 1, fp);
mm_err_fwrite(mi->seq[i].name, 1, l, fp);
mm_err_fwrite(&mi->seq[i].len, 4, 1, fp);
}
free(fn);
return fp;
}
mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part)
{
mm_idx_t *mi = 0;
char *fn;
int i, j;
if (n_splits < 1) return 0;
fn = CALLOC(char, strlen(prefix) + 10);
for (i = 0; i < n_splits; ++i) {
sprintf(fn, "%s.%.4d.tmp", prefix, i);
if ((fp[i] = fopen(fn, "rb")) == 0) {
if (mm_verbose >= 1)
fprintf(stderr, "ERROR: failed to open temporary file '%s'\n", fn);
for (j = 0; j < i; ++j)
fclose(fp[j]);
free(fn);
return 0;
}
}
free(fn);
mi = CALLOC(mm_idx_t, 1);
for (i = 0; i < n_splits; ++i) {
mm_err_fread(&mi->k, 4, 1, fp[i]); // TODO: check if k is all the same
mm_err_fread(&n_seq_part[i], 4, 1, fp[i]);
mi->n_seq += n_seq_part[i];
}
mi->seq = CALLOC(mm_idx_seq_t, mi->n_seq);
for (i = j = 0; i < n_splits; ++i) {
uint32_t k;
for (k = 0; k < n_seq_part[i]; ++k, ++j) {
uint8_t l;
mm_err_fread(&l, 1, 1, fp[i]);
mi->seq[j].name = (char*)calloc(l + 1, 1);
mm_err_fread(mi->seq[j].name, 1, l, fp[i]);
mm_err_fread(&mi->seq[j].len, 4, 1, fp[i]);
}
}
return mi;
}
void mm_split_rm_tmp(const char *prefix, int n_splits)
{
int i;
char *fn;
fn = CALLOC(char, strlen(prefix) + 10);
for (i = 0; i < n_splits; ++i) {
sprintf(fn, "%s.%.4d.tmp", prefix, i);
remove(fn);
}
free(fn);
}