Merge branch 'split'

This commit is contained in:
Heng Li 2018-07-15 22:24:15 -04:00
commit 0e137670fc
11 changed files with 329 additions and 57 deletions

View File

@ -1,7 +1,7 @@
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
CPPFLAGS= -DHAVE_KALLOC CPPFLAGS= -DHAVE_KALLOC
INCLUDES= INCLUDES=
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o ksw2_ll_sse.o OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o
PROG= minimap2 PROG= minimap2
PROG_EXTRA= sdust minimap2-lite PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread LIBS= -lm -lz -lpthread

View File

@ -199,12 +199,12 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) //
mm_extra_t *p; mm_extra_t *p;
if (n_cigar == 0) return; if (n_cigar == 0) return;
if (r->p == 0) { if (r->p == 0) {
uint32_t capacity = n_cigar + sizeof(mm_extra_t); // TODO: should this be "n_cigar + sizeof(mm_extra_t)/4" instead? uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4;
kroundup32(capacity); kroundup32(capacity);
r->p = (mm_extra_t*)calloc(capacity, 4); r->p = (mm_extra_t*)calloc(capacity, 4);
r->p->capacity = capacity; r->p->capacity = capacity;
} else if (r->p->n_cigar + n_cigar + sizeof(mm_extra_t) > r->p->capacity) { } else if (r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4 > r->p->capacity) {
r->p->capacity = r->p->n_cigar + n_cigar + sizeof(mm_extra_t); r->p->capacity = r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4;
kroundup32(r->p->capacity); kroundup32(r->p->capacity);
r->p = (mm_extra_t*)realloc(r->p, r->p->capacity * 4); r->p = (mm_extra_t*)realloc(r->p, r->p->capacity * 4);
} }
@ -878,6 +878,6 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
kfree(km, qseq0[0]); kfree(km, qseq0[0]);
kfree(km, ez.cigar); kfree(km, ez.cigar);
mm_filter_regs(opt, qlen, n_regs_, regs); mm_filter_regs(opt, qlen, n_regs_, regs);
mm_hit_sort_by_dp(km, n_regs_, regs); mm_hit_sort(km, n_regs_, regs);
return regs; return regs;
} }

12
hit.c
View File

@ -164,9 +164,9 @@ set_parent_test:
kfree(km, w); kfree(km, w);
} }
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r) void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r)
{ {
int32_t i, n_aux, n = *n_regs; int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0;
mm128_t *aux; mm128_t *aux;
mm_reg1_t *t; mm_reg1_t *t;
@ -175,14 +175,20 @@ void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r)
t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t)); t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
for (i = n_aux = 0; i < n; ++i) { for (i = n_aux = 0; i < n; ++i) {
if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted) if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
assert(r[i].p); if (r[i].p) {
aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash; aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash;
has_cigar = 1;
} else {
aux[n_aux].x = (uint64_t)r[i].score << 32 | r[i].hash;
no_cigar = 1;
}
aux[n_aux++].y = i; aux[n_aux++].y = i;
} else if (r[i].p) { } else if (r[i].p) {
free(r[i].p); free(r[i].p);
r[i].p = 0; r[i].p = 0;
} }
} }
assert(has_cigar + no_cigar == 1);
radix_sort_128x(aux, aux + n_aux); radix_sort_128x(aux, aux + n_aux);
for (i = n_aux - 1; i >= 0; --i) for (i = n_aux - 1; i >= 0; --i)
t[n_aux - 1 - i] = r[aux[i].y]; t[n_aux - 1 - i] = r[aux[i].y];

View File

@ -48,11 +48,13 @@ void mm_idx_destroy(mm_idx_t *mi)
uint32_t i; uint32_t i;
if (mi == 0) return; if (mi == 0) return;
if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h); if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
if (mi->B) {
for (i = 0; i < 1U<<mi->b; ++i) { for (i = 0; i < 1U<<mi->b; ++i) {
free(mi->B[i].p); free(mi->B[i].p);
free(mi->B[i].a.a); free(mi->B[i].a.a);
kh_destroy(idx, (idxhash_t*)mi->B[i].h); kh_destroy(idx, (idxhash_t*)mi->B[i].h);
} }
}
if (!mi->km) { if (!mi->km) {
for (i = 0; i < mi->n_seq; ++i) for (i = 0; i < mi->n_seq; ++i)
free(mi->seq[i].name); free(mi->seq[i].name);
@ -563,7 +565,7 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size); mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
if (mi) { if (mi) {
if (r->fp_out) mm_idx_dump(r->fp_out, mi); if (r->fp_out) mm_idx_dump(r->fp_out, mi);
++r->n_parts; mi->index = r->n_parts++;
} }
return mi; return mi;
} }

14
main.c
View File

@ -10,7 +10,7 @@
#include "getopt.h" #include "getopt.h"
#endif #endif
#define MM_VERSION "2.11-r797" #define MM_VERSION "2.11-r815-dirty"
#ifdef __linux__ #ifdef __linux__
#include <sys/resource.h> #include <sys/resource.h>
@ -61,6 +61,7 @@ static struct option long_options[] = {
{ "score-N", required_argument, 0, 0 }, // 31 { "score-N", required_argument, 0, 0 }, // 31
{ "eqx", no_argument, 0, 0 }, // 32 { "eqx", no_argument, 0, 0 }, // 32
{ "paf-no-hit", no_argument, 0, 0 }, // 33 { "paf-no-hit", no_argument, 0, 0 }, // 33
{ "split-prefix", required_argument, 0, 0 }, // 34
{ "help", no_argument, 0, 'h' }, { "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' }, { "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' }, { "version", no_argument, 0, 'V' },
@ -101,7 +102,7 @@ int main(int argc, char *argv[])
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yY"; const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yY";
mm_mapopt_t opt; mm_mapopt_t opt;
mm_idxopt_t ipt; mm_idxopt_t ipt;
int i, c, n_threads = 3, long_idx; int i, c, n_threads = 3, n_parts, long_idx;
char *fnw = 0, *rg = 0, *s; char *fnw = 0, *rg = 0, *s;
FILE *fp_help = stderr; FILE *fp_help = stderr;
mm_idx_reader_t *idx_rdr; mm_idx_reader_t *idx_rdr;
@ -181,6 +182,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx ==31) opt.sc_ambi = atoi(optarg); // --score-N else if (c == 0 && long_idx ==31) opt.sc_ambi = atoi(optarg); // --score-N
else if (c == 0 && long_idx ==32) opt.flag |= MM_F_EQX; // --eqx else if (c == 0 && long_idx ==32) opt.flag |= MM_F_EQX; // --eqx
else if (c == 0 && long_idx ==33) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit else if (c == 0 && long_idx ==33) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit
else if (c == 0 && long_idx ==34) opt.split_prefix = optarg; // --split-prefix
else if (c == 0 && long_idx == 14) { // --frag else if (c == 0 && long_idx == 14) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1); yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1);
} else if (c == 0 && long_idx == 15) { // --secondary } else if (c == 0 && long_idx == 15) { // --secondary
@ -327,8 +329,8 @@ int main(int argc, char *argv[])
mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv); mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv);
} else { } else {
mm_write_sam_hdr(0, rg, MM_VERSION, argc, argv); mm_write_sam_hdr(0, rg, MM_VERSION, argc, argv);
if (mm_verbose >= 2) if (opt.split_prefix == 0 && mm_verbose >= 2)
fprintf(stderr, "[WARNING]\033[1;31m For a multi-part index, no @SQ lines will be outputted.\033[0m\n"); fprintf(stderr, "[WARNING]\033[1;31m For a multi-part index, no @SQ lines will be outputted. Please use --split-prefix.\033[0m\n");
} }
} }
if (mm_verbose >= 3) if (mm_verbose >= 3)
@ -344,8 +346,12 @@ int main(int argc, char *argv[])
} }
mm_idx_destroy(mi); mm_idx_destroy(mi);
} }
n_parts = idx_rdr->n_parts;
mm_idx_reader_close(idx_rdr); mm_idx_reader_close(idx_rdr);
if (opt.split_prefix)
mm_split_merge(argc - (optind + 1), (const char**)&argv[optind + 1], &opt, n_parts);
if (fflush(stdout) == EOF) { if (fflush(stdout) == EOF) {
fprintf(stderr, "[ERROR] failed to write the results\n"); fprintf(stderr, "[ERROR] failed to write the results\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);

192
map.c
View File

@ -11,6 +11,7 @@
struct mm_tbuf_s { struct mm_tbuf_s {
void *km; void *km;
int rep_len, frag_gap;
}; };
mm_tbuf_t *mm_tbuf_init(void) mm_tbuf_t *mm_tbuf_init(void)
@ -330,6 +331,8 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
} }
} }
b->frag_gap = max_chain_gap_ref;
b->rep_len = rep_len;
regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a); regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a);
@ -394,13 +397,17 @@ typedef struct {
mm_bseq_file_t **fp; mm_bseq_file_t **fp;
const mm_idx_t *mi; const mm_idx_t *mi;
kstring_t str; kstring_t str;
int n_parts;
uint32_t *rid_shift;
FILE *fp_split, **fp_parts;
} pipeline_t; } pipeline_t;
typedef struct { typedef struct {
const pipeline_t *p; const pipeline_t *p;
int n_seq, n_frag; int n_seq, n_frag;
mm_bseq1_t *seq; mm_bseq1_t *seq;
int *n_reg, *seg_off, *n_seg; int *n_reg, *seg_off, *n_seg, *rep_len, *frag_gap;
mm_reg1_t **reg; mm_reg1_t **reg;
mm_tbuf_t **buf; mm_tbuf_t **buf;
} step_t; } step_t;
@ -421,10 +428,17 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
qseqs[j] = s->seq[off + j].seq; qseqs[j] = s->seq[off + j].seq;
} }
if (s->p->opt->flag & MM_F_INDEPEND_SEG) { if (s->p->opt->flag & MM_F_INDEPEND_SEG) {
for (j = 0; j < s->n_seg[i]; ++j) for (j = 0; j < s->n_seg[i]; ++j) {
mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name); mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name);
s->rep_len[off + j] = b->rep_len;
s->frag_gap[off + j] = b->frag_gap;
}
} else { } else {
mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name); mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name);
for (j = 0; j < s->n_seg[i]; ++j) {
s->rep_len[off + j] = b->rep_len;
s->frag_gap[off + j] = b->frag_gap;
}
} }
for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand
if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) { if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) {
@ -440,6 +454,63 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
} }
} }
static void merge_hits(step_t *s)
{
int f, i, k0, k, max_seg = 0, *n_reg_part, *rep_len_part, *frag_gap_part, *qlens;
void *km;
FILE **fp = s->p->fp_parts;
const mm_mapopt_t *opt = s->p->opt;
km = km_init();
for (f = 0; f < s->n_frag; ++f)
max_seg = max_seg > s->n_seg[f]? max_seg : s->n_seg[f];
qlens = CALLOC(int, max_seg + s->p->n_parts * 3);
n_reg_part = qlens + max_seg;
rep_len_part = n_reg_part + s->p->n_parts;
frag_gap_part = rep_len_part + s->p->n_parts;
for (f = 0, k = k0 = 0; f < s->n_frag; ++f) {
k0 = k;
for (i = 0; i < s->n_seg[f]; ++i, ++k) {
int j, l, t, rep_len = 0;
qlens[i] = s->seq[k].l_seq;
for (j = 0, s->n_reg[k] = 0; j < s->p->n_parts; ++j) {
mm_err_fread(&n_reg_part[j], sizeof(int), 1, fp[j]);
mm_err_fread(&rep_len_part[j], sizeof(int), 1, fp[j]);
mm_err_fread(&frag_gap_part[j], sizeof(int), 1, fp[j]);
s->n_reg[k] += n_reg_part[j];
if (rep_len < rep_len_part[j])
rep_len = rep_len_part[j];
}
s->reg[k] = CALLOC(mm_reg1_t, s->n_reg[k]);
for (j = 0, l = 0; j < s->p->n_parts; ++j) {
for (t = 0; t < n_reg_part[j]; ++t, ++l) {
mm_reg1_t *r = &s->reg[k][l];
uint32_t capacity;
mm_err_fread(r, sizeof(mm_reg1_t), 1, fp[j]);
r->rid += s->p->rid_shift[j];
if (opt->flag & MM_F_CIGAR) {
mm_err_fread(&capacity, 4, 1, fp[j]);
r->p = (mm_extra_t*)calloc(capacity, 4);
r->p->capacity = capacity;
mm_err_fread(r->p, r->p->capacity, 4, fp[j]);
}
}
}
mm_hit_sort(km, &s->n_reg[k], s->reg[k]);
mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b);
if (!(opt->flag & MM_F_ALL_CHAINS)) {
mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]);
mm_set_sam_pri(s->n_reg[k], s->reg[k]);
}
mm_set_mapq(km, s->n_reg[k], s->reg[k], opt->min_chain_score, opt->a, rep_len, !!(opt->flag & MM_F_SR));
}
if (s->n_seg[f] == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR))
mm_pair(km, frag_gap_part[0], opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, &s->n_reg[k0], &s->reg[k0]);
}
free(qlens);
km_destroy(km);
}
static void *worker_pipeline(void *shared, int step, void *in) static void *worker_pipeline(void *shared, int step, void *in)
{ {
int i, j, k; int i, j, k;
@ -459,9 +530,11 @@ static void *worker_pipeline(void *shared, int step, void *in)
s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*)); s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*));
for (i = 0; i < p->n_threads; ++i) for (i = 0; i < p->n_threads; ++i)
s->buf[i] = mm_tbuf_init(); s->buf[i] = mm_tbuf_init();
s->n_reg = (int*)calloc(3 * s->n_seq, sizeof(int)); s->n_reg = (int*)calloc(5 * s->n_seq, sizeof(int));
s->seg_off = s->n_reg + s->n_seq; // seg_off and n_seg are allocated together with n_reg s->seg_off = s->n_reg + s->n_seq; // seg_off, n_seg, rep_len and frag_gap are allocated together with n_reg
s->n_seg = s->seg_off + s->n_seq; s->n_seg = s->seg_off + s->n_seq;
s->rep_len = s->n_seg + s->n_seq;
s->frag_gap = s->rep_len + s->n_seq;
s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t*)); s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t*));
for (i = 1, j = 0; i <= s->n_seq; ++i) for (i = 1, j = 0; i <= s->n_seq; ++i)
if (i == s->n_seq || !frag_mode || !mm_qname_same(s->seq[i-1].name, s->seq[i].name)) { if (i == s->n_seq || !frag_mode || !mm_qname_same(s->seq[i-1].name, s->seq[i].name)) {
@ -472,7 +545,8 @@ static void *worker_pipeline(void *shared, int step, void *in)
return s; return s;
} else free(s); } else free(s);
} else if (step == 1) { // step 1: map } else if (step == 1) { // step 1: map
kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag); if (p->n_parts > 0) merge_hits((step_t*)in);
else kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag);
return in; return in;
} else if (step == 2) { // step 2: output } else if (step == 2) { // step 2: output
void *km = 0; void *km = 0;
@ -485,6 +559,19 @@ static void *worker_pipeline(void *shared, int step, void *in)
int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k]; int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k];
for (i = seg_st; i < seg_en; ++i) { for (i = seg_st; i < seg_en; ++i) {
mm_bseq1_t *t = &s->seq[i]; mm_bseq1_t *t = &s->seq[i];
if (p->opt->split_prefix && p->n_parts == 0) { // then write to temporary files
mm_err_fwrite(&s->n_reg[i], sizeof(int), 1, p->fp_split);
mm_err_fwrite(&s->rep_len[i], sizeof(int), 1, p->fp_split);
mm_err_fwrite(&s->frag_gap[i], sizeof(int), 1, p->fp_split);
for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j];
mm_err_fwrite(r, sizeof(mm_reg1_t), 1, p->fp_split);
if (p->opt->flag & MM_F_CIGAR) {
mm_err_fwrite(&r->p->capacity, 4, 1, p->fp_split);
mm_err_fwrite(r->p, r->p->capacity, 4, p->fp_split);
}
}
} else if (s->n_reg[i] > 0) { // the query has at least one hit
for (j = 0; j < s->n_reg[i]; ++j) { for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j]; mm_reg1_t *r = &s->reg[i][j];
assert(!r->sam_pri || r->id == r->parent); assert(!r->sam_pri || r->id == r->parent);
@ -496,16 +583,14 @@ static void *worker_pipeline(void *shared, int step, void *in)
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag); mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
mm_err_puts(p->str.s); mm_err_puts(p->str.s);
} }
if (s->n_reg[i] == 0) { } else if (p->opt->flag & (MM_F_OUT_SAM|MM_F_PAF_NO_HIT)) { // output an empty hit, if requested
if (p->opt->flag & MM_F_OUT_SAM) { if (p->opt->flag & MM_F_OUT_SAM)
mm_write_sam2(&p->str, mi, t, i - seg_st, -1, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag); mm_write_sam2(&p->str, mi, t, i - seg_st, -1, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag);
mm_err_puts(p->str.s); else
} else if (p->opt->flag & MM_F_PAF_NO_HIT) {
mm_write_paf(&p->str, mi, t, 0, 0, p->opt->flag); mm_write_paf(&p->str, mi, t, 0, 0, p->opt->flag);
mm_err_puts(p->str.s); mm_err_puts(p->str.s);
} }
} }
}
for (i = seg_st; i < seg_en; ++i) { for (i = seg_st; i < seg_en; ++i) {
for (j = 0; j < s->n_reg[i]; ++j) free(s->reg[i][j].p); for (j = 0; j < s->n_reg[i]; ++j) free(s->reg[i][j].p);
free(s->reg[i]); free(s->reg[i]);
@ -513,7 +598,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
if (s->seq[i].qual) free(s->seq[i].qual); if (s->seq[i].qual) free(s->seq[i].qual);
} }
} }
free(s->reg); free(s->n_reg); free(s->seq); // seg_off and n_seg were allocated with reg; no memory leak here free(s->reg); free(s->n_reg); free(s->seq); // seg_off, n_seg, rep_len and frag_gap were allocated with reg; no memory leak here
km_destroy(km); km_destroy(km);
if (mm_verbose >= 3) if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq); fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq);
@ -522,32 +607,44 @@ static void *worker_pipeline(void *shared, int step, void *in)
return 0; return 0;
} }
static mm_bseq_file_t **open_bseqs(int n, const char **fn)
{
mm_bseq_file_t **fp;
int i, j;
fp = (mm_bseq_file_t**)calloc(n, sizeof(mm_bseq_file_t*));
for (i = 0; i < n; ++i) {
if ((fp[i] = mm_bseq_open(fn[i])) == 0) {
if (mm_verbose >= 1)
fprintf(stderr, "ERROR: failed to open file '%s'\n", fn[i]);
for (j = 0; j < i; ++j)
mm_bseq_close(fp[j]);
free(fp);
return 0;
}
}
return fp;
}
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads) int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads)
{ {
int i, j, pl_threads; int i, pl_threads;
pipeline_t pl; pipeline_t pl;
if (n_segs < 1) return -1; if (n_segs < 1) return -1;
memset(&pl, 0, sizeof(pipeline_t)); memset(&pl, 0, sizeof(pipeline_t));
pl.n_fp = n_segs; pl.n_fp = n_segs;
pl.fp = (mm_bseq_file_t**)calloc(n_segs, sizeof(mm_bseq_file_t*)); pl.fp = open_bseqs(pl.n_fp, fn);
for (i = 0; i < n_segs; ++i) { if (pl.fp == 0) return -1;
pl.fp[i] = mm_bseq_open(fn[i]);
if (pl.fp[i] == 0) {
if (mm_verbose >= 1)
fprintf(stderr, "ERROR: failed to open file '%s'\n", fn[i]);
for (j = 0; j < i; ++j)
mm_bseq_close(pl.fp[j]);
free(pl.fp);
return -1;
}
}
pl.opt = opt, pl.mi = idx; pl.opt = opt, pl.mi = idx;
pl.n_threads = n_threads > 1? n_threads : 1; pl.n_threads = n_threads > 1? n_threads : 1;
pl.mini_batch_size = opt->mini_batch_size; pl.mini_batch_size = opt->mini_batch_size;
if (opt->split_prefix)
pl.fp_split = mm_split_init(opt->split_prefix, idx);
pl_threads = n_threads == 1? 1 : (opt->flag&MM_F_2_IO_THREADS)? 3 : 2; pl_threads = n_threads == 1? 1 : (opt->flag&MM_F_2_IO_THREADS)? 3 : 2;
kt_pipeline(pl_threads, worker_pipeline, &pl, 3); kt_pipeline(pl_threads, worker_pipeline, &pl, 3);
free(pl.str.s); free(pl.str.s);
for (i = 0; i < n_segs; ++i) if (pl.fp_split) fclose(pl.fp_split);
for (i = 0; i < pl.n_fp; ++i)
mm_bseq_close(pl.fp[i]); mm_bseq_close(pl.fp[i]);
free(pl.fp); free(pl.fp);
return 0; return 0;
@ -557,3 +654,48 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
{ {
return mm_map_file_frag(idx, 1, &fn, opt, n_threads); return mm_map_file_frag(idx, 1, &fn, opt, n_threads);
} }
int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx)
{
int i;
pipeline_t pl;
mm_idx_t *mi;
if (n_segs < 1 || n_split_idx < 1) return -1;
memset(&pl, 0, sizeof(pipeline_t));
pl.n_fp = n_segs;
pl.fp = open_bseqs(pl.n_fp, fn);
if (pl.fp == 0) return -1;
pl.opt = opt;
pl.mini_batch_size = opt->mini_batch_size;
pl.n_parts = n_split_idx;
pl.fp_parts = CALLOC(FILE*, pl.n_parts);
pl.rid_shift = CALLOC(uint32_t, pl.n_parts);
pl.mi = mi = mm_split_merge_prep(opt->split_prefix, n_split_idx, pl.fp_parts, pl.rid_shift);
if (pl.mi == 0) {
free(pl.fp_parts);
free(pl.rid_shift);
return -1;
}
for (i = n_split_idx - 1; i > 0; --i)
pl.rid_shift[i] = pl.rid_shift[i - 1];
for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i)
pl.rid_shift[i] += pl.rid_shift[i - 1];
if (opt->flag & MM_F_OUT_SAM)
for (i = 0; i < pl.mi->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", pl.mi->seq[i].name, pl.mi->seq[i].len);
kt_pipeline(2, worker_pipeline, &pl, 3);
free(pl.str.s);
mm_idx_destroy(mi);
free(pl.rid_shift);
for (i = 0; i < n_split_idx; ++i)
fclose(pl.fp_parts[i]);
free(pl.fp_parts);
for (i = 0; i < pl.n_fp; ++i)
mm_bseq_close(pl.fp[i]);
free(pl.fp);
mm_split_rm_tmp(opt->split_prefix, n_split_idx);
return 0;
}

View File

@ -60,6 +60,7 @@ typedef struct {
typedef struct { typedef struct {
int32_t b, w, k, flag; int32_t b, w, k, flag;
uint32_t n_seq; // number of reference sequences uint32_t n_seq; // number of reference sequences
int32_t index;
mm_idx_seq_t *seq; // sequence name, length and offset mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence uint32_t *S; // 4-bit packed sequence
struct mm_idx_bucket_s *B; // index (hidden) struct mm_idx_bucket_s *B; // index (hidden)
@ -136,6 +137,8 @@ typedef struct {
int32_t mid_occ; // ignore seeds with occurrences above this threshold int32_t mid_occ; // ignore seeds with occurrences above this threshold
int32_t max_occ; int32_t max_occ;
int mini_batch_size; // size of a batch of query bases to process in parallel int mini_batch_size; // size of a batch of query bases to process in parallel
const char *split_prefix;
} mm_mapopt_t; } mm_mapopt_t;
// index reader // index reader

View File

@ -239,7 +239,7 @@ Disable the long gap patching heuristic. When this option is applied, the
maximum alignment gap is mostly controlled by maximum alignment gap is mostly controlled by
.BR -r . .BR -r .
.TP .TP
.B --lj-min-ratio \ FLOAT .BI --lj-min-ratio \ FLOAT
Fraction of query sequence length required to bridge a long gap [0.5]. A Fraction of query sequence length required to bridge a long gap [0.5]. A
smaller value helps to recover longer gaps, at the cost of more false gaps. smaller value helps to recover longer gaps, at the cost of more false gaps.
.TP .TP
@ -252,6 +252,9 @@ applies a second round of chaining with a higher minimizer occurrence threshold
if no good chain is found. In addition, minimap2 attempts to patch gaps between if no good chain is found. In addition, minimap2 attempts to patch gaps between
seeds with ungapped alignment. seeds with ungapped alignment.
.TP .TP
.BI --split-prefix \ STR
Prefix to create temporary files. Typically used for a multi-part index.
.TP
.BR --frag = no | yes .BR --frag = no | yes
Whether to enable the fragment mode [no] Whether to enable the fragment mode [no]
.TP .TP

20
misc.c
View File

@ -131,6 +131,26 @@ void mm_err_puts(const char *str)
} }
} }
void mm_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp)
{
int ret;
ret = fwrite(p, size, nitems, fp);
if (ret == EOF) {
fprintf(stderr, "[ERROR] failed to write data\n");
exit(EXIT_FAILURE);
}
}
void mm_err_fread(void *p, size_t size, size_t nitems, FILE *fp)
{
int ret;
ret = fread(p, size, nitems, fp);
if (ret == EOF) {
fprintf(stderr, "[ERROR] failed to read data\n");
exit(EXIT_FAILURE);
}
}
#include "ksort.h" #include "ksort.h"
#define sort_key_128x(a) ((a).x) #define sort_key_128x(a) ((a).x)

View File

@ -28,6 +28,9 @@
#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2)) #define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2))
#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf) #define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf)
#define MALLOC(type, len) ((type*)malloc((len) * sizeof(type)))
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -77,7 +80,7 @@ void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_,
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r); void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs); void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a); void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r); void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r);
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr); void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos); void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
@ -86,7 +89,14 @@ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int
void mm_seg_free(void *km, int n_segs, mm_seg_t *segs); void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs); void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);
FILE *mm_split_init(const char *prefix, const mm_idx_t *mi);
mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part);
int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx);
void mm_split_rm_tmp(const char *prefix, int n_splits);
void mm_err_puts(const char *str); void mm_err_puts(const char *str);
void mm_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp);
void mm_err_fread(void *p, size_t size, size_t nitems, FILE *fp);
#ifdef __cplusplus #ifdef __cplusplus
} }

80
splitidx.c 100644
View File

@ -0,0 +1,80 @@
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include "mmpriv.h"
FILE *mm_split_init(const char *prefix, const mm_idx_t *mi)
{
char *fn;
FILE *fp;
uint32_t i, k = mi->k;
fn = (char*)calloc(strlen(prefix) + 10, 1);
sprintf(fn, "%s.%.4d.tmp", prefix, mi->index);
fp = fopen(fn, "wb");
assert(fp);
mm_err_fwrite(&k, 4, 1, fp);
mm_err_fwrite(&mi->n_seq, 4, 1, fp);
for (i = 0; i < mi->n_seq; ++i) {
uint8_t l;
l = strlen(mi->seq[i].name);
mm_err_fwrite(&l, 1, 1, fp);
mm_err_fwrite(mi->seq[i].name, 1, l, fp);
mm_err_fwrite(&mi->seq[i].len, 4, 1, fp);
}
free(fn);
return fp;
}
mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part)
{
mm_idx_t *mi = 0;
char *fn;
int i, j;
if (n_splits < 1) return 0;
fn = CALLOC(char, strlen(prefix) + 10);
for (i = 0; i < n_splits; ++i) {
sprintf(fn, "%s.%.4d.tmp", prefix, i);
if ((fp[i] = fopen(fn, "rb")) == 0) {
if (mm_verbose >= 1)
fprintf(stderr, "ERROR: failed to open temporary file '%s'\n", fn);
for (j = 0; j < i; ++j)
fclose(fp[j]);
free(fn);
return 0;
}
}
free(fn);
mi = CALLOC(mm_idx_t, 1);
for (i = 0; i < n_splits; ++i) {
mm_err_fread(&mi->k, 4, 1, fp[i]); // TODO: check if k is all the same
mm_err_fread(&n_seq_part[i], 4, 1, fp[i]);
mi->n_seq += n_seq_part[i];
}
mi->seq = CALLOC(mm_idx_seq_t, mi->n_seq);
for (i = j = 0; i < n_splits; ++i) {
uint32_t k;
for (k = 0; k < n_seq_part[i]; ++k, ++j) {
uint8_t l;
mm_err_fread(&l, 1, 1, fp[i]);
mi->seq[j].name = (char*)calloc(l + 1, 1);
mm_err_fread(mi->seq[j].name, 1, l, fp[i]);
mm_err_fread(&mi->seq[j].len, 4, 1, fp[i]);
}
}
return mi;
}
void mm_split_rm_tmp(const char *prefix, int n_splits)
{
int i;
char *fn;
fn = CALLOC(char, strlen(prefix) + 10);
for (i = 0; i < n_splits; ++i) {
sprintf(fn, "%s.%.4d.tmp", prefix, i);
remove(fn);
}
free(fn);
}