diff --git a/align.c b/align.c index ea7bd29..84a7336 100644 --- a/align.c +++ b/align.c @@ -61,7 +61,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c return 0; } -static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron) +static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e) { uint32_t k, l, toff = 0, qoff = 0; int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0; @@ -374,7 +374,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int assert(re1 - rs1 <= re0 - rs0); if (r->p) { mm_idx_getseq(mi, rid, rs1, re1, tseq); - mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0); + mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e); if (rev && r->p->trans_strand) r->p->trans_strand ^= 3; // flip to the read strand } @@ -417,7 +417,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i if (ez->n_cigar == 0) goto end_align1_inv; // should never be here mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); r_inv->p->dp_score = ez->max; - mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0); + mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e); r_inv->id = -1; r_inv->parent = MM_PARENT_UNSET; r_inv->inv = 1; diff --git a/bseq.c b/bseq.c index e3e576f..30d285a 100644 --- a/bseq.c +++ b/bseq.c @@ -8,7 +8,6 @@ KSEQ_INIT(gzFile, gzread) struct mm_bseq_file_s { - int is_eof; gzFile fp; kseq_t *ks; }; @@ -53,12 +52,11 @@ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int size += seqs[n++].l_seq; if (size >= chunk_size) break; } - if (size < chunk_size) fp->is_eof = 1; *n_ = n; return seqs; } int mm_bseq_eof(mm_bseq_file_t *fp) { - return fp->is_eof; + return ks_eof(fp->ks->f); } diff --git a/format.c b/format.c index cf81291..949c0cd 100644 --- a/format.c +++ b/format.c @@ -6,7 +6,7 @@ #include "kalloc.h" #include "mmpriv.h" -static char *mm_rg_line, mm_rg_id[256]; +static char mm_rg_id[256]; static inline void str_enlarge(kstring_t *s, int l) { @@ -58,15 +58,13 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...) s->s[s->l] = 0; } -static inline char *mm_escape(char *s) +static char *mm_escape(char *s) { char *p, *q; for (p = q = s; *p; ++p) { if (*p == '\\') { ++p; if (*p == 't') *q++ = '\t'; - else if (*p == 'n') *q++ = '\n'; - else if (*p == 'r') *q++ = '\r'; else if (*p == '\\') *q++ = '\\'; } else *q++ = *p; } @@ -74,44 +72,56 @@ static inline char *mm_escape(char *s) return s; } -void mm_set_rg(const char *s) +static void sam_write_rg_line(kstring_t *str, const char *s) { char *p, *q, *r, *rg_line = 0; memset(mm_rg_id, 0, 256); - if (mm_rg_line) { - free(mm_rg_line); - mm_rg_line = 0; - } if (s == 0) return; if (strstr(s, "@RG") != s) { - if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__); + if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line is not started with @RG\n"); goto err_set_rg; } if (strstr(s, "\t") != NULL) { - if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal characters -- replace with escaped tabs: \\t\n", __func__); + if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line contained literal characters -- replace with escaped tabs: \\t\n"); goto err_set_rg; } rg_line = strdup(s); mm_escape(rg_line); if ((p = strstr(rg_line, "\tID:")) == 0) { - if (mm_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__); + if (mm_verbose >= 1) fprintf(stderr, "[ERROR] no ID within the read group line\n"); goto err_set_rg; } p += 4; for (q = p; *q && *q != '\t' && *q != '\n'; ++q); if (q - p + 1 > 256) { - if (mm_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__); + if (mm_verbose >= 1) fprintf(stderr, "[ERROR] @RG:ID is longer than 255 characters\n"); goto err_set_rg; } for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q) *r++ = *q; - mm_rg_line = rg_line; - return; + mm_sprintf_lite(str, "%s\n", rg_line); err_set_rg: free(rg_line); } +void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]) +{ + kstring_t str = {0,0,0}; + sam_write_rg_line(&str, rg); + mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2"); + if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver); + if (argc > 1) { + int i; + mm_sprintf_lite(&str, "\tCL:minimap2"); + for (i = 1; i < argc; ++i) + mm_sprintf_lite(&str, " %s", argv[i]); + } + mm_sprintf_lite(&str, "\n"); + fputs(str.s, stdout); + free(str.s); +} + static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) { extern unsigned char seq_nt4_table[256]; @@ -214,12 +224,11 @@ static char comp_tab[] = { 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 }; -void sam_write_sam_header(const mm_idx_t *idx) +void mm_write_sam_SQ(const mm_idx_t *idx) { uint32_t i; for (i = 0; i < idx->n_seq; ++i) printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len); - if (mm_rg_line) puts(mm_rg_line); } static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) @@ -274,8 +283,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m else mm_sprintf_lite(s, "*"); } write_tags(s, r); - if (mm_rg_line && mm_rg_id[0]) - mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id); + if (mm_rg_id[0]) mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id); if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist int i, n_sa = 0; // n_sa: number of SA fields for (i = 0; i < n_regs; ++i) diff --git a/index.c b/index.c index c139ec5..dc58231 100644 --- a/index.c +++ b/index.c @@ -285,12 +285,12 @@ static void *worker_pipeline(void *shared, int step, void *in) mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name) { pipeline_t pl; + if (fp == 0 || mm_bseq_eof(fp)) return 0; memset(&pl, 0, sizeof(pipeline_t)); pl.mini_batch_size = mini_batch_size < batch_size? mini_batch_size : batch_size; pl.keep_name = keep_name; pl.batch_size = batch_size; pl.fp = fp; - if (pl.fp == 0) return 0; pl.mi = mm_idx_init(w, k, b, is_hpc); kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3); diff --git a/main.c b/main.c index 0735b91..b040520 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r301-dirty" +#define MM_VERSION "2.0-r309-dirty" void liftrlimit() { @@ -33,6 +33,7 @@ static struct option long_options[] = { { "print-aln-seq", no_argument, 0, 0 }, { "splice", no_argument, 0, 0 }, { "cost-non-gt-ag", required_argument, 0, 0 }, + { "no-sam-sq", no_argument, 0, 0 }, { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -58,11 +59,11 @@ static inline int64_t mm_parse_num(const char *str) int main(int argc, char *argv[]) { mm_mapopt_t opt; - int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0; + int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0, n_idx_part = 0; int minibatch_size = 200000000; uint64_t batch_size = 4000000000ULL; mm_bseq_file_t *fp = 0; - char *fnw = 0, *s; + char *fnw = 0, *rg = 0, *s; FILE *fpr = 0, *fpw = 0, *fp_help = stderr; liftrlimit(); @@ -97,7 +98,7 @@ int main(int argc, char *argv[]) else if (c == 's') opt.min_dp_max = atoi(optarg); else if (c == 'I') batch_size = mm_parse_num(optarg); else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg); - else if (c == 'R') mm_set_rg(optarg); // WARNING: this modifies global variables in format.c + else if (c == 'R') rg = optarg; else if (c == 'h') fp_help = stdout; else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname @@ -110,6 +111,7 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag + else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq else if (c == 'V') { puts(MM_VERSION); return 0; @@ -222,27 +224,31 @@ int main(int argc, char *argv[]) is_idx = mm_idx_is_idx(argv[optind]); if (is_idx < 0) { - fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]); + fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]); return 1; } if (!is_idx && fnw == 0 && argc - optind < 2) { - fprintf(stderr, "[E::%s] missing input: please specify a query file or option -d\n", __func__); + fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n"); return 1; } if (is_idx) fpr = fopen(argv[optind], "rb"); else fp = mm_bseq_open(argv[optind]); if (fnw) fpw = fopen(fnw, "wb"); + if (opt.flag & MM_F_OUT_SAM) + mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv); for (;;) { - mm_idx_t *mi = 0; + mm_idx_t *mi; if (fpr) { mi = mm_idx_load(fpr); if (idx_par_set && mm_verbose >= 2 && (mi->k != k || mi->w != w || mi->is_hpc != is_hpc)) - fprintf(stderr, "[W::%s::%.3f*%.2f] Indexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\n", - __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); - } else if (!mm_bseq_eof(fp)) { + fprintf(stderr, "[WARNING] \033[1;31mIndexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\033[0m\n"); + } else { mi = mm_idx_gen(fp, w, k, bucket_bits, is_hpc, minibatch_size, n_threads, batch_size, keep_name); } if (mi == 0) break; + ++n_idx_part; + if (mm_verbose >= 2 && n_idx_part > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ)) + fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n"); if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq); diff --git a/map.c b/map.c index 5af4936..f531efa 100644 --- a/map.c +++ b/map.c @@ -385,7 +385,8 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int if (pl.fp == 0) return -1; pl.opt = opt, pl.mi = idx; pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size; - if (opt->flag & MM_F_OUT_SAM) sam_write_sam_header(idx); + if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ)) + mm_write_sam_SQ(idx); kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); free(pl.str.s); mm_bseq_close(pl.fp); diff --git a/minimap.h b/minimap.h index c09b846..e345915 100644 --- a/minimap.h +++ b/minimap.h @@ -18,6 +18,7 @@ #define MM_F_SPLICE_FOR 0x100 #define MM_F_SPLICE_REV 0x200 #define MM_F_SPLICE_BOTH 0x400 +#define MM_F_NO_SAM_SQ 0x800 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index d2d2530..d5535c0 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "13 August 2017" "minimap2-2.0-r296-dirty" "Bioinformatics tools" +.TH minimap2 1 "25 August 2017" "minimap2-2.0-r309-dirty" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -255,8 +255,13 @@ and use .BR -K500m . .TP -.B -V +.B --version Print version number to stdout +.TP +.B --no-sam-hdr +Don't output SAM header lines. Use this option if the index consists of +multiple parts; otherwise the SAM output is malformated due to internal header +lines. .SS Preset options .TP 10 .BI -x \ STR @@ -320,7 +325,7 @@ is that this preset is not using HPC minimizers. Long-read spliced alignment .RB ( -k15 .B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag -.BR 4 ). +.BR 5 ). In the splice mode, 1) long deletions are taken as introns and represented as the .RB ` N ' diff --git a/mmpriv.h b/mmpriv.h index 874f9e2..3c40c28 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -40,8 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); -void mm_set_rg(const char *s); -void sam_write_sam_header(const mm_idx_t *idx); +void mm_write_sam_SQ(const mm_idx_t *idx); +void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);