r466: detect multi-part index more smartly

though it might not work in an extremely rare case: the end of a sequence ends
at X*16384 and it is the last sequence in a batch. This can be resolved by
never letting the kstream_t buffer empty.
This commit is contained in:
Heng Li 2017-10-04 17:32:58 -04:00
parent 1554149158
commit 7d50e646dd
7 changed files with 40 additions and 39 deletions

View File

@ -105,10 +105,15 @@ err_set_rg:
free(rg_line);
}
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[])
void mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int argc, char *argv[])
{
kstring_t str = {0,0,0};
sam_write_rg_line(&str, rg);
if (idx) {
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
}
if (rg) sam_write_rg_line(&str, rg);
mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2");
if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver);
if (argc > 1) {
@ -213,13 +218,6 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
write_cs(km, s, mi, t, r);
}
void mm_write_sam_SQ(const mm_idx_t *idx)
{
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
}
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
{
extern unsigned char seq_comp_table[256];

21
index.c
View File

@ -427,28 +427,28 @@ mm_idx_t *mm_idx_load(FILE *fp)
return mi;
}
int mm_idx_is_idx(const char *fn)
int64_t mm_idx_is_idx(const char *fn)
{
int fd, is_idx = 0;
off_t ret;
off_t ret, off_end;
char magic[4];
if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index
fd = open(fn, O_RDONLY);
if (fd < 0) return -1; // error
if ((ret = lseek(fd, 0, SEEK_END)) >= 4) {
if ((off_end = lseek(fd, 0, SEEK_END)) >= 4) {
lseek(fd, 0, SEEK_SET);
ret = read(fd, magic, 4);
if (ret == 4 && strncmp(magic, MM_IDX_MAGIC, 4) == 0)
is_idx = 1;
}
close(fd);
return is_idx;
return is_idx? off_end : 0;
}
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
{
int is_idx;
int64_t is_idx;
mm_idx_reader_t *r;
is_idx = mm_idx_is_idx(fn);
if (is_idx < 0) return 0; // failed to open the index
@ -456,8 +456,10 @@ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, cons
r->is_idx = is_idx;
if (opt) r->opt = *opt;
else mm_idxopt_init(&r->opt);
if (r->is_idx) r->fp.idx = fopen(fn, "rb");
else r->fp.seq = mm_bseq_open(fn);
if (r->is_idx) {
r->fp.idx = fopen(fn, "rb");
r->idx_size = is_idx;
} else r->fp.seq = mm_bseq_open(fn);
if (fn_out) r->fp_out = fopen(fn_out, "wb");
return r;
}
@ -485,3 +487,8 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
}
return mi;
}
int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases, mm_bseq_eof() might not work
{
return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq);
}

28
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.2-r465-dirty"
#define MM_VERSION "2.2-r466-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -25,7 +25,7 @@ void liftrlimit() {}
static struct option long_options[] = {
{ "bucket-bits", required_argument, 0, 0 },
{ "mb-size", required_argument, 0, 'K' },
{ "int-rname", no_argument, 0, 0 }, // obsolete; kept as a placeholder
{ "seed", required_argument, 0, 0 },
{ "no-kalloc", no_argument, 0, 0 },
{ "print-qname", no_argument, 0, 0 },
{ "no-self", no_argument, 0, 0 },
@ -35,11 +35,9 @@ static struct option long_options[] = {
{ "print-aln-seq", no_argument, 0, 0 },
{ "splice", no_argument, 0, 0 },
{ "cost-non-gt-ag", required_argument, 0, 0 },
{ "no-sam-sq", no_argument, 0, 0 },
{ "no-long-join", no_argument, 0, 0 },
{ "sr", no_argument, 0, 0 },
{ "multi", optional_argument, 0, 0 },
{ "no-long-join", no_argument, 0, 0 },
{ "seed", required_argument, 0, 0 },
{ "print-2nd", optional_argument, 0, 0 },
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
@ -119,6 +117,7 @@ int main(int argc, char *argv[])
else if (c == 'R') rg = optarg;
else if (c == 'h') fp_help = stdout;
else if (c == 0 && long_idx == 0) ipt.bucket_bits = atoi(optarg); // --bucket-bits
else if (c == 0 && long_idx == 2) opt.seed = atoi(optarg); // --seed
else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc
else if (c == 0 && long_idx == 4) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname
else if (c == 0 && long_idx == 5) opt.flag |= MM_F_NO_SELF; // --no-self
@ -128,15 +127,13 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr
else if (c == 0 && long_idx ==15) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
else if (c == 0 && long_idx ==16) opt.seed = atoi(optarg); // --seed
else if (c == 0 && long_idx ==14) { // --multi
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
opt.flag |= MM_F_MULTI_SEG;
else opt.flag &= ~MM_F_MULTI_SEG;
} else if (c == 0 && long_idx ==17) { // --print-2nd
} else if (c == 0 && long_idx ==15) { // --print-2nd
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
opt.flag &= ~MM_F_NO_PRINT_2ND;
else opt.flag |= MM_F_NO_PRINT_2ND;
@ -235,11 +232,16 @@ int main(int argc, char *argv[])
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
return 1;
}
if (opt.flag & MM_F_OUT_SAM)
mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) {
if (mm_verbose >= 2 && idx_rdr->n_parts > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ))
fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n");
if ((opt.flag & MM_F_OUT_SAM) && idx_rdr->n_parts == 1) {
if (mm_idx_reader_eof(idx_rdr)) {
mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv);
} else {
mm_write_sam_hdr(0, rg, MM_VERSION, argc, argv);
if (mm_verbose >= 2)
fprintf(stderr, "[WARNING] \033[1;31mFor a multi-part index, no @SQ lines will be outputted.\033[0m\n");
}
}
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);

2
map.c
View File

@ -500,8 +500,6 @@ int mm_map_file_multi_seg(const mm_idx_t *idx, int n_segs, const char **fn, cons
pl.opt = opt, pl.mi = idx;
pl.n_threads = n_threads > 1? n_threads : 1;
pl.mini_batch_size = opt->mini_batch_size;
if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ))
mm_write_sam_SQ(idx);
kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
free(pl.str.s);
for (i = 0; i < n_segs; ++i)

View File

@ -16,7 +16,6 @@
#define MM_F_SPLICE_FOR 0x100 // match GT-AG
#define MM_F_SPLICE_REV 0x200 // match CT-AC, the reverse complement of GT-AG
#define MM_F_NO_LJOIN 0x400
#define MM_F_NO_SAM_SQ 0x800
#define MM_F_SR 0x1000
#define MM_F_MULTI_SEG 0x2000
#define MM_F_NO_PRINT_2ND 0x4000
@ -116,6 +115,7 @@ typedef struct {
// index reader
typedef struct {
int is_idx, n_parts;
int64_t idx_size;
mm_idxopt_t opt;
FILE *fp_out;
union {
@ -189,6 +189,8 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
*/
void mm_idx_reader_close(mm_idx_reader_t *r);
int mm_idx_reader_eof(const mm_idx_reader_t *r);
/**
* Print index statistics to stderr
*

View File

@ -257,11 +257,6 @@ use
.TP
.B --version
Print version number to stdout
.TP
.B --no-sam-hdr
Don't output SAM header lines. Use this option if the index consists of
multiple parts; otherwise the SAM output is malformated due to internal header
lines.
.SS Preset options
.TP 10
.BI -x \ STR

View File

@ -54,8 +54,7 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
void mm_write_sam_SQ(const mm_idx_t *idx);
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]);
void mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);