diff --git a/format.c b/format.c index a8d681e..4ca7de6 100644 --- a/format.c +++ b/format.c @@ -1,7 +1,9 @@ #include #include #include +#include #include +#include "kalloc.h" #include "mmpriv.h" static inline void str_enlarge(kstring_t *s, int l) @@ -54,6 +56,60 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...) s->s[s->l] = 0; } +static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) +{ + extern unsigned char seq_nt4_table[256]; + int i, q_off, t_off; + uint8_t *qseq, *tseq; + char *tmp; + if (r->p == 0) return; + mm_sprintf_lite(s, "\tcs:Z:"); + qseq = (uint8_t*)kmalloc(km, r->qe - r->qs); + tseq = (uint8_t*)kmalloc(km, r->re - r->rs); + tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1); + mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq); + if (!r->rev) { + for (i = r->qs; i < r->qe; ++i) + qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]]; + } else { + for (i = r->qs; i < r->qe; ++i) + qseq[r->qe - i - 1] = seq_nt4_table[(uint8_t)t->seq[i]]; + } + for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) { + int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; + assert(op >= 0 && op <= 2); + if (op == 0) { + int l_tmp = 0; + for (j = 0; j < len; ++j) { + if (qseq[q_off + j] != tseq[t_off + j]) { + if (l_tmp > 0) { + tmp[l_tmp] = 0; + mm_sprintf_lite(s, "=%s", tmp); + } + mm_sprintf_lite(s, "*%c%c", "ACGTN"[tseq[t_off + j]], "ACGTN"[qseq[q_off + j]]); + } else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]]; + } + if (l_tmp > 0) { + tmp[l_tmp] = 0; + mm_sprintf_lite(s, "=%s", tmp); + } + q_off += len, t_off += len; + } else if (op == 1) { + for (j = 0, tmp[len] = 0; j < len; ++j) + tmp[j] = "ACGTN"[qseq[q_off + j]]; + mm_sprintf_lite(s, "+%s", tmp); + q_off += len; + } else if (op == 2) { + for (j = 0, tmp[len] = 0; j < len; ++j) + tmp[j] = "ACGTN"[tseq[t_off + j]]; + mm_sprintf_lite(s, "-%s", tmp); + t_off += len; + } + } + assert(t_off == r->re - r->rs && q_off == r->qe - r->qs); + kfree(km, qseq); kfree(km, tseq); kfree(km, tmp); +} + static inline void write_tags(kstring_t *s, const mm_reg1_t *r) { int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S'; @@ -63,7 +119,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); } -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag) { s->l = 0; mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]); @@ -74,12 +130,14 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m else mm_sprintf_lite(s, "\t%d\t%d", r->fuzzy_mlen, r->fuzzy_blen); mm_sprintf_lite(s, "\t%d", r->mapq); write_tags(s, r); - if (r->p) { + if (r->p && (opt_flag & MM_F_OUT_CG)) { uint32_t k; mm_sprintf_lite(s, "\tcg:Z:"); for (k = 0; k < r->p->n_cigar; ++k) mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]); } + if (r->p && (opt_flag & MM_F_OUT_CS)) + write_cs(km, s, mi, t, r); } static char comp_tab[] = { diff --git a/main.c b/main.c index c8650d0..e536bbc 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0rc1-r232" +#define MM_VERSION "2.0rc1-r235-dirty" void liftrlimit() { @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt_long(argc, argv, "aw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { + while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { if (c == 'w') w = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; @@ -67,7 +67,8 @@ int main(int argc, char *argv[]) else if (c == 'N') opt.best_n = atoi(optarg); else if (c == 'p') opt.pri_ratio = atof(optarg); else if (c == 'M') opt.mask_level = atof(optarg); - else if (c == 'c') opt.flag |= MM_F_CIGAR; + else if (c == 'c') opt.flag |= MM_F_OUT_CG | MM_F_CIGAR; + else if (c == 'S') opt.flag |= MM_F_OUT_CS | MM_F_CIGAR; else if (c == 'X') opt.flag |= MM_F_AVA | MM_F_NO_SELF; else if (c == 'a') opt.flag |= MM_F_OUT_SAM | MM_F_CIGAR; else if (c == 'Q') opt.flag |= MM_F_NO_QUAL; @@ -166,6 +167,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -Q ignore base quality in the input\n"); fprintf(stderr, " -a output in the SAM format (PAF by default)\n"); fprintf(stderr, " -c output CIGAR in PAF\n"); + fprintf(stderr, " -S output the cs tag in PAF\n"); fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); fprintf(stderr, " -K NUM minibatch size [200M]\n"); // fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); diff --git a/map.c b/map.c index bfb5d3d..4ea111e 100644 --- a/map.c +++ b/map.c @@ -338,16 +338,18 @@ static void *worker_pipeline(void *shared, int step, void *in) kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq); return in; } else if (step == 2) { // step 2: output + void *km = 0; step_t *s = (step_t*)in; const mm_idx_t *mi = p->mi; for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); + if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; for (j = 0; j < s->n_reg[i]; ++j) { mm_reg1_t *r = &s->reg[i][j]; if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]); - else mm_write_paf(&p->str, mi, t, r); + else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag); puts(p->str.s); } if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) { @@ -360,6 +362,7 @@ static void *worker_pipeline(void *shared, int step, void *in) if (s->seq[i].qual) free(s->seq[i].qual); } free(s->reg); free(s->n_reg); free(s->seq); + km_destroy(km); if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq); free(s); diff --git a/minimap.h b/minimap.h index 27eb3b4..076017f 100644 --- a/minimap.h +++ b/minimap.h @@ -12,6 +12,8 @@ #define MM_F_CIGAR 0x04 #define MM_F_OUT_SAM 0x08 #define MM_F_NO_QUAL 0x10 +#define MM_F_OUT_CG 0x20 +#define MM_F_OUT_CS 0x40 #define MM_IDX_MAGIC "MMI\2" diff --git a/mmpriv.h b/mmpriv.h index dcb78d6..32b3fd1 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -40,7 +40,7 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r); +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);