diff --git a/format.c b/format.c index deadbe1..c6b7272 100644 --- a/format.c +++ b/format.c @@ -105,10 +105,15 @@ err_set_rg: free(rg_line); } -void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]) +void mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int argc, char *argv[]) { kstring_t str = {0,0,0}; - sam_write_rg_line(&str, rg); + if (idx) { + uint32_t i; + for (i = 0; i < idx->n_seq; ++i) + printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len); + } + if (rg) sam_write_rg_line(&str, rg); mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2"); if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver); if (argc > 1) { @@ -213,13 +218,6 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m write_cs(km, s, mi, t, r); } -void mm_write_sam_SQ(const mm_idx_t *idx) -{ - uint32_t i; - for (i = 0; i < idx->n_seq; ++i) - printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len); -} - static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) { extern unsigned char seq_comp_table[256]; diff --git a/index.c b/index.c index 4bc4c35..00f212c 100644 --- a/index.c +++ b/index.c @@ -427,28 +427,28 @@ mm_idx_t *mm_idx_load(FILE *fp) return mi; } -int mm_idx_is_idx(const char *fn) +int64_t mm_idx_is_idx(const char *fn) { int fd, is_idx = 0; - off_t ret; + off_t ret, off_end; char magic[4]; if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index fd = open(fn, O_RDONLY); if (fd < 0) return -1; // error - if ((ret = lseek(fd, 0, SEEK_END)) >= 4) { + if ((off_end = lseek(fd, 0, SEEK_END)) >= 4) { lseek(fd, 0, SEEK_SET); ret = read(fd, magic, 4); if (ret == 4 && strncmp(magic, MM_IDX_MAGIC, 4) == 0) is_idx = 1; } close(fd); - return is_idx; + return is_idx? off_end : 0; } mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out) { - int is_idx; + int64_t is_idx; mm_idx_reader_t *r; is_idx = mm_idx_is_idx(fn); if (is_idx < 0) return 0; // failed to open the index @@ -456,8 +456,10 @@ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, cons r->is_idx = is_idx; if (opt) r->opt = *opt; else mm_idxopt_init(&r->opt); - if (r->is_idx) r->fp.idx = fopen(fn, "rb"); - else r->fp.seq = mm_bseq_open(fn); + if (r->is_idx) { + r->fp.idx = fopen(fn, "rb"); + r->idx_size = is_idx; + } else r->fp.seq = mm_bseq_open(fn); if (fn_out) r->fp_out = fopen(fn_out, "wb"); return r; } @@ -485,3 +487,8 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads) } return mi; } + +int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases, mm_bseq_eof() might not work +{ + return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq); +} diff --git a/main.c b/main.c index f56327c..43d6ee3 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.2-r465-dirty" +#define MM_VERSION "2.2-r466-dirty" #ifdef __linux__ #include @@ -25,7 +25,7 @@ void liftrlimit() {} static struct option long_options[] = { { "bucket-bits", required_argument, 0, 0 }, { "mb-size", required_argument, 0, 'K' }, - { "int-rname", no_argument, 0, 0 }, // obsolete; kept as a placeholder + { "seed", required_argument, 0, 0 }, { "no-kalloc", no_argument, 0, 0 }, { "print-qname", no_argument, 0, 0 }, { "no-self", no_argument, 0, 0 }, @@ -35,11 +35,9 @@ static struct option long_options[] = { { "print-aln-seq", no_argument, 0, 0 }, { "splice", no_argument, 0, 0 }, { "cost-non-gt-ag", required_argument, 0, 0 }, - { "no-sam-sq", no_argument, 0, 0 }, + { "no-long-join", no_argument, 0, 0 }, { "sr", no_argument, 0, 0 }, { "multi", optional_argument, 0, 0 }, - { "no-long-join", no_argument, 0, 0 }, - { "seed", required_argument, 0, 0 }, { "print-2nd", optional_argument, 0, 0 }, { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, @@ -119,6 +117,7 @@ int main(int argc, char *argv[]) else if (c == 'R') rg = optarg; else if (c == 'h') fp_help = stdout; else if (c == 0 && long_idx == 0) ipt.bucket_bits = atoi(optarg); // --bucket-bits + else if (c == 0 && long_idx == 2) opt.seed = atoi(optarg); // --seed else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc else if (c == 0 && long_idx == 4) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname else if (c == 0 && long_idx == 5) opt.flag |= MM_F_NO_SELF; // --no-self @@ -128,15 +127,13 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag - else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq + else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_LJOIN; // --no-long-join else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr - else if (c == 0 && long_idx ==15) opt.flag |= MM_F_NO_LJOIN; // --no-long-join - else if (c == 0 && long_idx ==16) opt.seed = atoi(optarg); // --seed else if (c == 0 && long_idx ==14) { // --multi if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) opt.flag |= MM_F_MULTI_SEG; else opt.flag &= ~MM_F_MULTI_SEG; - } else if (c == 0 && long_idx ==17) { // --print-2nd + } else if (c == 0 && long_idx ==15) { // --print-2nd if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) opt.flag &= ~MM_F_NO_PRINT_2ND; else opt.flag |= MM_F_NO_PRINT_2ND; @@ -235,11 +232,16 @@ int main(int argc, char *argv[]) fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n"); return 1; } - if (opt.flag & MM_F_OUT_SAM) - mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv); while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) { - if (mm_verbose >= 2 && idx_rdr->n_parts > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ)) - fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n"); + if ((opt.flag & MM_F_OUT_SAM) && idx_rdr->n_parts == 1) { + if (mm_idx_reader_eof(idx_rdr)) { + mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv); + } else { + mm_write_sam_hdr(0, rg, MM_VERSION, argc, argv); + if (mm_verbose >= 2) + fprintf(stderr, "[WARNING] \033[1;31mFor a multi-part index, no @SQ lines will be outputted.\033[0m\n"); + } + } if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq); diff --git a/map.c b/map.c index 38cbf15..8b5a9aa 100644 --- a/map.c +++ b/map.c @@ -500,8 +500,6 @@ int mm_map_file_multi_seg(const mm_idx_t *idx, int n_segs, const char **fn, cons pl.opt = opt, pl.mi = idx; pl.n_threads = n_threads > 1? n_threads : 1; pl.mini_batch_size = opt->mini_batch_size; - if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ)) - mm_write_sam_SQ(idx); kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); free(pl.str.s); for (i = 0; i < n_segs; ++i) diff --git a/minimap.h b/minimap.h index 51a0742..47ae44f 100644 --- a/minimap.h +++ b/minimap.h @@ -16,7 +16,6 @@ #define MM_F_SPLICE_FOR 0x100 // match GT-AG #define MM_F_SPLICE_REV 0x200 // match CT-AC, the reverse complement of GT-AG #define MM_F_NO_LJOIN 0x400 -#define MM_F_NO_SAM_SQ 0x800 #define MM_F_SR 0x1000 #define MM_F_MULTI_SEG 0x2000 #define MM_F_NO_PRINT_2ND 0x4000 @@ -116,6 +115,7 @@ typedef struct { // index reader typedef struct { int is_idx, n_parts; + int64_t idx_size; mm_idxopt_t opt; FILE *fp_out; union { @@ -189,6 +189,8 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads); */ void mm_idx_reader_close(mm_idx_reader_t *r); +int mm_idx_reader_eof(const mm_idx_reader_t *r); + /** * Print index statistics to stderr * diff --git a/minimap2.1 b/minimap2.1 index 3bd5191..6ab8112 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -257,11 +257,6 @@ use .TP .B --version Print version number to stdout -.TP -.B --no-sam-hdr -Don't output SAM header lines. Use this option if the index consists of -multiple parts; otherwise the SAM output is malformated due to internal header -lines. .SS Preset options .TP 10 .BI -x \ STR diff --git a/mmpriv.h b/mmpriv.h index 6806a6b..a48a480 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -54,8 +54,7 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); -void mm_write_sam_SQ(const mm_idx_t *idx); -void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]); +void mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);