r466: detect multi-part index more smartly
though it might not work in an extremely rare case: the end of a sequence ends at X*16384 and it is the last sequence in a batch. This can be resolved by never letting the kstream_t buffer empty.
This commit is contained in:
parent
1554149158
commit
7d50e646dd
16
format.c
16
format.c
|
|
@ -105,10 +105,15 @@ err_set_rg:
|
|||
free(rg_line);
|
||||
}
|
||||
|
||||
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[])
|
||||
void mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int argc, char *argv[])
|
||||
{
|
||||
kstring_t str = {0,0,0};
|
||||
sam_write_rg_line(&str, rg);
|
||||
if (idx) {
|
||||
uint32_t i;
|
||||
for (i = 0; i < idx->n_seq; ++i)
|
||||
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
|
||||
}
|
||||
if (rg) sam_write_rg_line(&str, rg);
|
||||
mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2");
|
||||
if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver);
|
||||
if (argc > 1) {
|
||||
|
|
@ -213,13 +218,6 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
write_cs(km, s, mi, t, r);
|
||||
}
|
||||
|
||||
void mm_write_sam_SQ(const mm_idx_t *idx)
|
||||
{
|
||||
uint32_t i;
|
||||
for (i = 0; i < idx->n_seq; ++i)
|
||||
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
|
||||
}
|
||||
|
||||
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
|
||||
{
|
||||
extern unsigned char seq_comp_table[256];
|
||||
|
|
|
|||
21
index.c
21
index.c
|
|
@ -427,28 +427,28 @@ mm_idx_t *mm_idx_load(FILE *fp)
|
|||
return mi;
|
||||
}
|
||||
|
||||
int mm_idx_is_idx(const char *fn)
|
||||
int64_t mm_idx_is_idx(const char *fn)
|
||||
{
|
||||
int fd, is_idx = 0;
|
||||
off_t ret;
|
||||
off_t ret, off_end;
|
||||
char magic[4];
|
||||
|
||||
if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index
|
||||
fd = open(fn, O_RDONLY);
|
||||
if (fd < 0) return -1; // error
|
||||
if ((ret = lseek(fd, 0, SEEK_END)) >= 4) {
|
||||
if ((off_end = lseek(fd, 0, SEEK_END)) >= 4) {
|
||||
lseek(fd, 0, SEEK_SET);
|
||||
ret = read(fd, magic, 4);
|
||||
if (ret == 4 && strncmp(magic, MM_IDX_MAGIC, 4) == 0)
|
||||
is_idx = 1;
|
||||
}
|
||||
close(fd);
|
||||
return is_idx;
|
||||
return is_idx? off_end : 0;
|
||||
}
|
||||
|
||||
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
|
||||
{
|
||||
int is_idx;
|
||||
int64_t is_idx;
|
||||
mm_idx_reader_t *r;
|
||||
is_idx = mm_idx_is_idx(fn);
|
||||
if (is_idx < 0) return 0; // failed to open the index
|
||||
|
|
@ -456,8 +456,10 @@ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, cons
|
|||
r->is_idx = is_idx;
|
||||
if (opt) r->opt = *opt;
|
||||
else mm_idxopt_init(&r->opt);
|
||||
if (r->is_idx) r->fp.idx = fopen(fn, "rb");
|
||||
else r->fp.seq = mm_bseq_open(fn);
|
||||
if (r->is_idx) {
|
||||
r->fp.idx = fopen(fn, "rb");
|
||||
r->idx_size = is_idx;
|
||||
} else r->fp.seq = mm_bseq_open(fn);
|
||||
if (fn_out) r->fp_out = fopen(fn_out, "wb");
|
||||
return r;
|
||||
}
|
||||
|
|
@ -485,3 +487,8 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
|
|||
}
|
||||
return mi;
|
||||
}
|
||||
|
||||
int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases, mm_bseq_eof() might not work
|
||||
{
|
||||
return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq);
|
||||
}
|
||||
|
|
|
|||
28
main.c
28
main.c
|
|
@ -6,7 +6,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "getopt.h"
|
||||
|
||||
#define MM_VERSION "2.2-r465-dirty"
|
||||
#define MM_VERSION "2.2-r466-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -25,7 +25,7 @@ void liftrlimit() {}
|
|||
static struct option long_options[] = {
|
||||
{ "bucket-bits", required_argument, 0, 0 },
|
||||
{ "mb-size", required_argument, 0, 'K' },
|
||||
{ "int-rname", no_argument, 0, 0 }, // obsolete; kept as a placeholder
|
||||
{ "seed", required_argument, 0, 0 },
|
||||
{ "no-kalloc", no_argument, 0, 0 },
|
||||
{ "print-qname", no_argument, 0, 0 },
|
||||
{ "no-self", no_argument, 0, 0 },
|
||||
|
|
@ -35,11 +35,9 @@ static struct option long_options[] = {
|
|||
{ "print-aln-seq", no_argument, 0, 0 },
|
||||
{ "splice", no_argument, 0, 0 },
|
||||
{ "cost-non-gt-ag", required_argument, 0, 0 },
|
||||
{ "no-sam-sq", no_argument, 0, 0 },
|
||||
{ "no-long-join", no_argument, 0, 0 },
|
||||
{ "sr", no_argument, 0, 0 },
|
||||
{ "multi", optional_argument, 0, 0 },
|
||||
{ "no-long-join", no_argument, 0, 0 },
|
||||
{ "seed", required_argument, 0, 0 },
|
||||
{ "print-2nd", optional_argument, 0, 0 },
|
||||
{ "help", no_argument, 0, 'h' },
|
||||
{ "max-intron-len", required_argument, 0, 'G' },
|
||||
|
|
@ -119,6 +117,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'R') rg = optarg;
|
||||
else if (c == 'h') fp_help = stdout;
|
||||
else if (c == 0 && long_idx == 0) ipt.bucket_bits = atoi(optarg); // --bucket-bits
|
||||
else if (c == 0 && long_idx == 2) opt.seed = atoi(optarg); // --seed
|
||||
else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc
|
||||
else if (c == 0 && long_idx == 4) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname
|
||||
else if (c == 0 && long_idx == 5) opt.flag |= MM_F_NO_SELF; // --no-self
|
||||
|
|
@ -128,15 +127,13 @@ int main(int argc, char *argv[])
|
|||
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
|
||||
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
|
||||
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
|
||||
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq
|
||||
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
|
||||
else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr
|
||||
else if (c == 0 && long_idx ==15) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
|
||||
else if (c == 0 && long_idx ==16) opt.seed = atoi(optarg); // --seed
|
||||
else if (c == 0 && long_idx ==14) { // --multi
|
||||
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
|
||||
opt.flag |= MM_F_MULTI_SEG;
|
||||
else opt.flag &= ~MM_F_MULTI_SEG;
|
||||
} else if (c == 0 && long_idx ==17) { // --print-2nd
|
||||
} else if (c == 0 && long_idx ==15) { // --print-2nd
|
||||
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
|
||||
opt.flag &= ~MM_F_NO_PRINT_2ND;
|
||||
else opt.flag |= MM_F_NO_PRINT_2ND;
|
||||
|
|
@ -235,11 +232,16 @@ int main(int argc, char *argv[])
|
|||
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
|
||||
return 1;
|
||||
}
|
||||
if (opt.flag & MM_F_OUT_SAM)
|
||||
mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
|
||||
while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) {
|
||||
if (mm_verbose >= 2 && idx_rdr->n_parts > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ))
|
||||
fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n");
|
||||
if ((opt.flag & MM_F_OUT_SAM) && idx_rdr->n_parts == 1) {
|
||||
if (mm_idx_reader_eof(idx_rdr)) {
|
||||
mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv);
|
||||
} else {
|
||||
mm_write_sam_hdr(0, rg, MM_VERSION, argc, argv);
|
||||
if (mm_verbose >= 2)
|
||||
fprintf(stderr, "[WARNING] \033[1;31mFor a multi-part index, no @SQ lines will be outputted.\033[0m\n");
|
||||
}
|
||||
}
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
|
||||
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
|
||||
|
|
|
|||
2
map.c
2
map.c
|
|
@ -500,8 +500,6 @@ int mm_map_file_multi_seg(const mm_idx_t *idx, int n_segs, const char **fn, cons
|
|||
pl.opt = opt, pl.mi = idx;
|
||||
pl.n_threads = n_threads > 1? n_threads : 1;
|
||||
pl.mini_batch_size = opt->mini_batch_size;
|
||||
if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ))
|
||||
mm_write_sam_SQ(idx);
|
||||
kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
|
||||
free(pl.str.s);
|
||||
for (i = 0; i < n_segs; ++i)
|
||||
|
|
|
|||
|
|
@ -16,7 +16,6 @@
|
|||
#define MM_F_SPLICE_FOR 0x100 // match GT-AG
|
||||
#define MM_F_SPLICE_REV 0x200 // match CT-AC, the reverse complement of GT-AG
|
||||
#define MM_F_NO_LJOIN 0x400
|
||||
#define MM_F_NO_SAM_SQ 0x800
|
||||
#define MM_F_SR 0x1000
|
||||
#define MM_F_MULTI_SEG 0x2000
|
||||
#define MM_F_NO_PRINT_2ND 0x4000
|
||||
|
|
@ -116,6 +115,7 @@ typedef struct {
|
|||
// index reader
|
||||
typedef struct {
|
||||
int is_idx, n_parts;
|
||||
int64_t idx_size;
|
||||
mm_idxopt_t opt;
|
||||
FILE *fp_out;
|
||||
union {
|
||||
|
|
@ -189,6 +189,8 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
|
|||
*/
|
||||
void mm_idx_reader_close(mm_idx_reader_t *r);
|
||||
|
||||
int mm_idx_reader_eof(const mm_idx_reader_t *r);
|
||||
|
||||
/**
|
||||
* Print index statistics to stderr
|
||||
*
|
||||
|
|
|
|||
|
|
@ -257,11 +257,6 @@ use
|
|||
.TP
|
||||
.B --version
|
||||
Print version number to stdout
|
||||
.TP
|
||||
.B --no-sam-hdr
|
||||
Don't output SAM header lines. Use this option if the index consists of
|
||||
multiple parts; otherwise the SAM output is malformated due to internal header
|
||||
lines.
|
||||
.SS Preset options
|
||||
.TP 10
|
||||
.BI -x \ STR
|
||||
|
|
|
|||
3
mmpriv.h
3
mmpriv.h
|
|
@ -54,8 +54,7 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
|||
|
||||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
|
||||
|
||||
void mm_write_sam_SQ(const mm_idx_t *idx);
|
||||
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]);
|
||||
void mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue