r574: build index without sequences

This commit is contained in:
Heng Li 2017-11-11 21:38:38 -05:00
parent 2f463b1db0
commit 131cfc6938
2 changed files with 34 additions and 16 deletions

37
index.c
View File

@ -246,7 +246,6 @@ static void *worker_pipeline(void *shared, int step, void *in)
s->seq = mm_bseq_read(p->fp, p->mini_batch_size, 0, &s->n_seq); // read a mini-batch
if (s->seq) {
uint32_t old_m, m;
uint64_t sum_len, old_max_len, max_len;
assert((uint64_t)p->mi->n_seq + s->n_seq <= UINT32_MAX); // to prevent integer overflow
// make room for p->mi->seq
old_m = p->mi->n_seq, m = p->mi->n_seq + s->n_seq;
@ -254,13 +253,16 @@ static void *worker_pipeline(void *shared, int step, void *in)
if (old_m != m)
p->mi->seq = (mm_idx_seq_t*)krealloc(p->mi->km, p->mi->seq, m * sizeof(mm_idx_seq_t));
// make room for p->mi->S
for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq;
old_max_len = (p->sum_len + 7) / 8;
max_len = (p->sum_len + sum_len + 7) / 8;
kroundup64(old_max_len); kroundup64(max_len);
if (old_max_len != max_len) {
p->mi->S = (uint32_t*)realloc(p->mi->S, max_len * 4);
memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
if (!(p->mi->flag & MM_I_NO_SEQ)) {
uint64_t sum_len, old_max_len, max_len;
for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq;
old_max_len = (p->sum_len + 7) / 8;
max_len = (p->sum_len + sum_len + 7) / 8;
kroundup64(old_max_len); kroundup64(max_len);
if (old_max_len != max_len) {
p->mi->S = (uint32_t*)realloc(p->mi->S, max_len * 4);
memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
}
}
// populate p->mi->seq
for (i = 0; i < s->n_seq; ++i) {
@ -273,10 +275,12 @@ static void *worker_pipeline(void *shared, int step, void *in)
seq->len = s->seq[i].l_seq;
seq->offset = p->sum_len;
// copy the sequence
for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
uint64_t o = p->sum_len + j;
int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
mm_seq4_set(p->mi->S, o, c);
if (!(p->mi->flag & MM_I_NO_SEQ)) {
for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
uint64_t o = p->sum_len + j;
int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
mm_seq4_set(p->mi->S, o, c);
}
}
// update p->sum_len and p->mi->n_seq
p->sum_len += seq->len;
@ -370,7 +374,8 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
fwrite(x, 8, 2, fp);
}
}
fwrite(mi->S, 4, (sum_len + 7) / 8, fp);
if (!(mi->flag & MM_I_NO_SEQ))
fwrite(mi->S, 4, (sum_len + 7) / 8, fp);
fflush(fp);
}
@ -420,8 +425,10 @@ mm_idx_t *mm_idx_load(FILE *fp)
kh_val(h, k) = x[1];
}
}
mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4);
fread(mi->S, 4, (sum_len + 7) / 8, fp);
if (!(mi->flag & MM_I_NO_SEQ)) {
mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4);
fread(mi->S, 4, (sum_len + 7) / 8, fp);
}
return mi;
}

13
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.5-r573-dirty"
#define MM_VERSION "2.5-r574-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -43,6 +43,7 @@ static struct option long_options[] = {
{ "end-bonus", required_argument, 0, 0 },
{ "no-pairing", no_argument, 0, 0 },
{ "splice-flank", optional_argument, 0, 0 },
{ "idx-no-seq", no_argument, 0, 0 },
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
@ -138,6 +139,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr
else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus
else if (c == 0 && long_idx ==18) opt.flag |= MM_F_INDEPEND_SEG; // --no-pairing
else if (c == 0 && long_idx ==20) ipt.flag |= MM_I_NO_SEQ; // --idx-no-seq
else if (c == 0 && long_idx == 14) { // --frag
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
opt.flag |= MM_F_FRAG_MODE;
@ -196,6 +198,8 @@ int main(int argc, char *argv[])
fprintf(stderr, "[ERROR]\033[1;31m --splice and --frag should not be specified at the same time.\033[0m\n");
return 1;
}
if (!fnw && !(opt.flag&MM_F_CIGAR))
ipt.flag |= MM_I_NO_SEQ;
if (argc == optind || fp_help == stdout) {
fprintf(fp_help, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n");
@ -259,11 +263,18 @@ int main(int argc, char *argv[])
}
if (!idx_rdr->is_idx && fnw == 0 && argc - optind < 2) {
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
mm_idx_reader_close(idx_rdr);
return 1;
}
if (opt.best_n == 0 && (opt.flag&MM_F_CIGAR) && mm_verbose >= 2)
fprintf(stderr, "[WARNING]\033[1;31m `-N 0' reduces alignment accuracy. Please use --secondary=no to suppress secondary alignments.\033[0m\n");
while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) {
if ((opt.flag & MM_F_CIGAR) && (mi->flag & MM_I_NO_SEQ)) {
fprintf(stderr, "[ERROR] the prebuilt index doesn't contain sequences.\n");
mm_idx_destroy(mi);
mm_idx_reader_close(idx_rdr);
return 1;
}
if ((opt.flag & MM_F_OUT_SAM) && idx_rdr->n_parts == 1) {
if (mm_idx_reader_eof(idx_rdr)) {
mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv);