diff --git a/format.c b/format.c index 37355e1..56267f6 100644 --- a/format.c +++ b/format.c @@ -58,9 +58,9 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, bseq1_t *t, int which, mm_re { s->l = 0; mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev], mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re); - if (r->p) mm_sprintf_lite(s, "\t%d\t%d\t255", r->p->blen - r->p->n_ambi - r->p->n_diff, r->p->blen); - else mm_sprintf_lite(s, "\t%d\t%d\t255", r->score, r->re - r->rs > r->qe - r->qs? r->re - r->rs : r->qe - r->qs); - mm_sprintf_lite(s, "\tcm:i:%d", r->cnt); + if (r->p) mm_sprintf_lite(s, "\t%d\t%d", r->p->blen - r->p->n_ambi - r->p->n_diff, r->p->blen); + else mm_sprintf_lite(s, "\t%d\t%d", r->score, r->re - r->rs > r->qe - r->qs? r->re - r->rs : r->qe - r->qs); + mm_sprintf_lite(s, "\t%d\tcm:i:%d", r->mapq, r->cnt); if (r->p) mm_sprintf_lite(s, "\ts1:i:%d", r->score); if (r->parent == which) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc); if (r->p) { @@ -70,3 +70,50 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, bseq1_t *t, int which, mm_re mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]); } } + +static char comp_tab[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O', + 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95, + 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o', + 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 +}; + +static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) +{ + if (rev) { + int i; + str_enlarge(s, l); + for (i = 0; i < l; ++i) { + int c = seq[l - 1 - i]; + s->s[s->l + i] = c < 128 && comp? comp_tab[c] : c; + } + s->l += l; + } else str_copy(s, seq, seq + l); +} + +void mm_write_sam(kstring_t *s, const mm_idx_t *mi, bseq1_t *t, int which, mm_reg1_t *r) +{ + int flag = 0; + s->l = 0; + if (r->rev) flag |= 0x10; + if (r->parent != which) flag |= 0x80; + mm_sprintf_lite(s, "%s\t%d\t%s\t%d\t%d\t", t->name, flag, mi->seq[r->rid].name, r->rs+1, r->mapq); + if (r->p) { // TODO: using hard clippings + uint32_t k, clip_len = r->rev? t->l_seq - r->qe : r->qs; + if (clip_len) mm_sprintf_lite(s, "%dS", clip_len); + for (k = 0; k < r->p->n_cigar; ++k) + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]); + clip_len = r->rev? r->qs : t->l_seq - r->qe; + if (clip_len) mm_sprintf_lite(s, "%dS", clip_len); + } else mm_sprintf_lite(s, "*"); + mm_sprintf_lite(s, "\t*\t0\t0\t"); + sam_write_sq(s, t->seq, t->l_seq, r->rev, r->rev); + mm_sprintf_lite(s, "\t*\tcm:i:%d", r->cnt); + if (r->p) mm_sprintf_lite(s, "\ts1:i:%d", r->score); + if (r->parent == which) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc); + if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->score, r->p->n_ambi); +} diff --git a/main.c b/main.c index 9ef7119..6764255 100644 --- a/main.c +++ b/main.c @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt(argc, argv, "w:k:B:b:t:r:f:Vv:Ng:I:d:ST:s:Dx:Hp:m:z:")) >= 0) { + while ((c = getopt(argc, argv, "w:k:B:b:t:r:f:Vv:Ng:I:d:ST:s:Dx:Hp:m:z:F:")) >= 0) { if (c == 'w') w = atoi(optarg); else if (c == 'k') k = atoi(optarg); else if (c == 'b') b = atoi(optarg); @@ -85,6 +85,13 @@ int main(int argc, char *argv[]) else if (*p == 'K' || *p == 'k') x *= 1e3; if (c == 'B') mini_batch_size = (uint64_t)(x + .499); else batch_size = (uint64_t)(x + .499); + } else if (c == 'F') { + if (strcmp(optarg, "sam") == 0) opt.flag |= MM_F_OUT_SAM; + else if (strcmp(optarg, "paf") == 0) opt.flag &= ~MM_F_OUT_SAM; + else { + fprintf(stderr, "[E::%s] unknown output format '%s'\n", __func__, optarg); + return 0; + } } else if (c == 'x') { if (strcmp(optarg, "ava10k") == 0) { opt.flag |= MM_F_AVA | MM_F_NO_SELF; diff --git a/map.c b/map.c index cad8a5c..037bc65 100644 --- a/map.c +++ b/map.c @@ -306,7 +306,6 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, m if (opt->sdust_thres > 0) mm_dust_minier(&b->mini, l_seq, seq, opt->sdust_thres, b->sdb); regs = mm_map_frag(opt, mi, b, 0, b->mini.n, qname, l_seq, seq, n_regs); -// mm_select_sub(opt->mask_level, opt->pri_ratio, n_regs, regs, b->km); return regs; } @@ -368,7 +367,8 @@ static void *worker_pipeline(void *shared, int step, void *in) bseq1_t *t = &s->seq[i]; for (j = 0; j < s->n_reg[i]; ++j) { mm_reg1_t *r = &s->reg[i][j]; - mm_write_paf(&p->str, mi, t, j, r); + if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, j, r); + else mm_write_paf(&p->str, mi, t, j, r); puts(p->str.s); free(r->p); } @@ -389,7 +389,13 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int if (pl.fp == 0) return -1; pl.opt = opt, pl.mi = idx; pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size; + if (opt->flag & MM_F_OUT_SAM) { + uint32_t i; + for (i = 0; i < idx->n_seq; ++i) + printf("@SQ\tID:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len); + } kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); + free(pl.str.s); bseq_close(pl.fp); return 0; } diff --git a/minimap.h b/minimap.h index e9dafb1..ec54832 100644 --- a/minimap.h +++ b/minimap.h @@ -7,8 +7,9 @@ #define MM_IDX_DEF_B 14 -#define MM_F_NO_SELF 0x2 -#define MM_F_AVA 0x8 +#define MM_F_NO_SELF 0x01 +#define MM_F_AVA 0x02 +#define MM_F_OUT_SAM 0x04 #define MM_IDX_MAGIC "MMI\2" @@ -57,6 +58,7 @@ typedef struct { int32_t qs, qe, rs, re; int32_t parent, subsc; int32_t as; + int32_t mapq, n_sub; // TODO: n_sub is not used for now mm_extra_t *p; } mm_reg1_t; diff --git a/mmpriv.h b/mmpriv.h index 8527284..b74072b 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -28,6 +28,7 @@ void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, bseq1_t *t, int which, mm_reg1_t *r); +void mm_write_sam(kstring_t *s, const mm_idx_t *mi, bseq1_t *t, int which, mm_reg1_t *r); int mm_chain_dp(int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);