From 39083be9abdf6bfd2ca19ff661379420c4d84715 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 25 Jun 2017 16:13:54 -0400 Subject: [PATCH] separated formating printing for SAM in future; and for performance --- Makefile | 9 +++---- align.c | 4 ---- format.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ map.c | 20 +++------------- mmpriv.h | 15 +++++++++++- 5 files changed, 94 insertions(+), 26 deletions(-) create mode 100644 format.c diff --git a/Makefile b/Makefile index b0d1f2a..a75bd1d 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2 -Wc++-compat -Wno-unused-function CPPFLAGS= INCLUDES= -I. OBJS= kalloc.o kthread.o misc.o bseq.o sketch.o chain.o align.o sdust.o \ - index.o map.o ksw2_extz2_sse.o + index.o format.o map.o ksw2_extz2_sse.o PROG= minimap2 PROG_EXTRA= sdust LIBS= -lm -lz -lpthread @@ -34,14 +34,15 @@ depend: # DO NOT DELETE -align.o: minimap.h ksw2.h +align.o: minimap.h mmpriv.h bseq.h ksw2.h bseq.o: bseq.h kseq.h -chain.o: minimap.h mmpriv.h kalloc.h +chain.o: minimap.h mmpriv.h bseq.h kalloc.h +format.o: mmpriv.h minimap.h bseq.h index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h kalloc.o: kalloc.h ksw2_extz2_sse.o: ksw2.h main.o: bseq.h minimap.h mmpriv.h -map.o: kthread.h kvec.h kalloc.h sdust.h minimap.h mmpriv.h bseq.h +map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h misc.o: minimap.h ksort.h sdust.o: kalloc.h kdq.h kvec.h sdust.h sketch.o: kvec.h kalloc.h minimap.h diff --git a/align.c b/align.c index fdc3ac7..993792c 100644 --- a/align.c +++ b/align.c @@ -4,10 +4,6 @@ #include "mmpriv.h" #include "ksw2.h" -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b) { int i, j; diff --git a/format.c b/format.c new file mode 100644 index 0000000..37355e1 --- /dev/null +++ b/format.c @@ -0,0 +1,72 @@ +#include +#include +#include +#include +#include "mmpriv.h" + +static inline void str_enlarge(kstring_t *s, int l) +{ + if (s->l + l + 1 > s->m) { + s->m = s->l + l + 1; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } +} + +static inline void str_copy(kstring_t *s, const char *st, const char *en) +{ + str_enlarge(s, en - st); + memcpy(&s->s[s->l], st, en - st); + s->l += en - st; +} + +static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...) +{ + char buf[16]; // for integer to string conversion + const char *p, *q; + va_list ap; + va_start(ap, fmt); + for (q = p = fmt; *p; ++p) { + if (*p == '%') { + if (p > q) str_copy(s, q, p); + ++p; + if (*p == 'd') { + int c, i, l = 0; + unsigned int x; + c = va_arg(ap, int); + x = c >= 0? c : -c; + do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); + if (c < 0) buf[l++] = '-'; + str_enlarge(s, l); + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + } else if (*p == 's') { + char *r = va_arg(ap, char*); + str_copy(s, r, r + strlen(r)); + } else if (*p == 'c') { + str_enlarge(s, 1); + s->s[s->l++] = va_arg(ap, int); + } else abort(); + q = p + 1; + } + } + if (p > q) str_copy(s, q, p); + va_end(ap); + s->s[s->l] = 0; +} + +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, bseq1_t *t, int which, mm_reg1_t *r) +{ + s->l = 0; + mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev], mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re); + if (r->p) mm_sprintf_lite(s, "\t%d\t%d\t255", r->p->blen - r->p->n_ambi - r->p->n_diff, r->p->blen); + else mm_sprintf_lite(s, "\t%d\t%d\t255", r->score, r->re - r->rs > r->qe - r->qs? r->re - r->rs : r->qe - r->qs); + mm_sprintf_lite(s, "\tcm:i:%d", r->cnt); + if (r->p) mm_sprintf_lite(s, "\ts1:i:%d", r->score); + if (r->parent == which) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc); + if (r->p) { + uint32_t k; + mm_sprintf_lite(s, "\tNM:i:%d\tAS:i:%d\tnn:i:%d\tcg:Z:", r->p->n_diff, r->p->score, r->p->n_ambi); + for (k = 0; k < r->p->n_cigar; ++k) + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]); + } +} diff --git a/map.c b/map.c index a43c0f0..cad8a5c 100644 --- a/map.c +++ b/map.c @@ -4,7 +4,6 @@ #include "kvec.h" #include "kalloc.h" #include "sdust.h" -#include "minimap.h" #include "mmpriv.h" #include "bseq.h" @@ -320,6 +319,7 @@ typedef struct { const mm_mapopt_t *opt; bseq_file_t *fp; const mm_idx_t *mi; + kstring_t str; } pipeline_t; typedef struct { @@ -368,22 +368,8 @@ static void *worker_pipeline(void *shared, int step, void *in) bseq1_t *t = &s->seq[i]; for (j = 0; j < s->n_reg[i]; ++j) { mm_reg1_t *r = &s->reg[i][j]; - printf("%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]); - if (mi->seq[r->rid].name) fputs(mi->seq[r->rid].name, stdout); - else printf("%d", r->rid + 1); - printf("\t%d\t%d\t%d", mi->seq[r->rid].len, r->rs, r->re); - if (r->p) printf("\t%d\t%d\t255", r->p->blen - r->p->n_ambi - r->p->n_diff, r->p->blen); - else printf("\t%d\t%d\t255", r->score, r->re - r->rs > r->qe - r->qs? r->re - r->rs : r->qe - r->qs); - printf("\tcm:i:%d", r->cnt); - if (r->p) printf("\ts1:i:%d", r->score); - if (r->parent == j) printf("\ts2:i:%d", r->subsc); - if (r->p) { - uint32_t k; - printf("\tNM:i:%d\tAS:i:%d\tnn:i:%d\tcg:Z:", r->p->n_diff, r->p->score, r->p->n_ambi); - for (k = 0; k < r->p->n_cigar; ++k) - printf("%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]); - } - putchar('\n'); + mm_write_paf(&p->str, mi, t, j, r); + puts(p->str.s); free(r->p); } free(s->reg[i]); diff --git a/mmpriv.h b/mmpriv.h index c5456cd..8527284 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -2,6 +2,19 @@ #define MMPRIV2_H #include "minimap.h" +#include "bseq.h" + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + unsigned l, m; + char *s; +} kstring_t; +#endif #ifdef __cplusplus extern "C" { @@ -14,8 +27,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, bseq1_t *t, int which, mm_reg1_t *r); int mm_chain_dp(int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, void *km); - mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); #ifdef __cplusplus