From a742f10164327305175ef4defb72def6421f9730 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Sep 2017 15:17:17 -0400 Subject: [PATCH] get multi-seg code ready; probably not working yet --- bseq.c | 12 ------------ bseq.h | 12 ++++++++++++ map.c | 36 ++++++++++++++++++++++++++++-------- 3 files changed, 40 insertions(+), 20 deletions(-) diff --git a/bseq.c b/bseq.c index 87da0c3..2ee29b2 100644 --- a/bseq.c +++ b/bseq.c @@ -1,7 +1,6 @@ #include #include #include -#include #include #include "bseq.h" #include "kvec.h" @@ -35,17 +34,6 @@ void mm_bseq_close(mm_bseq_file_t *fp) free(fp); } -static inline int mm_qname_same(const char *s1, const char *s2) -{ - int l1, l2; - l1 = strlen(s1); - l2 = strlen(s2); - if (l1 != l2 || l1 < 3) return 0; - if (!(s1[l1-1] >= '1' && s1[l1-1] <= '2' && s1[l1-2] == '/')) return 0; - if (!(s2[l2-1] >= '1' && s2[l2-1] <= '2' && s2[l2-2] == '/')) return 0; - return (strncmp(s1, s2, l1 - 2) == 0); -} - static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual) { s->name = strdup(ks->name.s); diff --git a/bseq.h b/bseq.h index 31ed256..9124273 100644 --- a/bseq.h +++ b/bseq.h @@ -2,6 +2,7 @@ #define MM_BSEQ_H #include +#include #ifdef __cplusplus extern "C" { @@ -24,6 +25,17 @@ int mm_bseq_eof(mm_bseq_file_t *fp); extern unsigned char seq_nt4_table[256]; +static inline int mm_qname_same(const char *s1, const char *s2) +{ + int l1, l2; + l1 = strlen(s1); + l2 = strlen(s2); + if (l1 != l2 || l1 < 3) return 0; + if (!(s1[l1-1] >= '1' && s1[l1-1] <= '2' && s1[l1-2] == '/')) return 0; + if (!(s2[l2-1] >= '1' && s2[l2-1] <= '2' && s2[l2-2] == '/')) return 0; + return (strncmp(s1, s2, l1 - 2) == 0); +} + #ifdef __cplusplus } #endif diff --git a/map.c b/map.c index 37e8c22..50dcb6d 100644 --- a/map.c +++ b/map.c @@ -75,7 +75,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->min_dp_max = 200; } else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) { io->is_hpc = 0, io->k = 21, io->w = 11; - mo->flag |= MM_F_SR; + mo->flag |= MM_F_SR | MM_F_MULTI_SEG; mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 32, mo->e2 = 1; mo->max_gap = 100; mo->pri_ratio = 0.5f; @@ -328,17 +328,28 @@ typedef struct { const pipeline_t *p; int n_seq, n_frag; mm_bseq1_t *seq; - int *n_reg, *n_seg; + int *n_reg, *seg_off, *n_seg; mm_reg1_t **reg; mm_tbuf_t **buf; } step_t; static void worker_for(void *_data, long i, int tid) // kt_for() callback { - step_t *step = (step_t*)_data; + step_t *s = (step_t*)_data; + int *qlens, j, off = s->seg_off[i]; + const char **qseqs; + mm_tbuf_t *b = s->buf[tid]; if (mm_dbg_flag & MM_DBG_PRINT_QNAME) - fprintf(stderr, "QR\t%s\t%d\n", step->seq[i].name, tid); - step->reg[i] = mm_map(step->p->mi, step->seq[i].l_seq, step->seq[i].seq, &step->n_reg[i], step->buf[tid], step->p->opt, step->seq[i].name); + fprintf(stderr, "QR\t%s\t%d\n", s->seq[off].name, tid); + qlens = (int*)kmalloc(b->km, s->n_seg[i] * sizeof(int)); + qseqs = (const char**)kmalloc(b->km, s->n_seg[i] * sizeof(const char**)); + for (j = 0; j < s->n_seg[i]; ++j) { + qlens[j] = s->seq[off + j].l_seq; + qseqs[j] = s->seq[off + j].seq; + } + mm_map_multi(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name); + kfree(b->km, qlens); + kfree(b->km, qseqs); } static void *worker_pipeline(void *shared, int step, void *in) @@ -347,10 +358,11 @@ static void *worker_pipeline(void *shared, int step, void *in) pipeline_t *p = (pipeline_t*)shared; if (step == 0) { // step 0: read sequences int with_qual = (!!(p->opt->flag & MM_F_OUT_SAM) && !(p->opt->flag & MM_F_NO_QUAL)); + int multi_seg = (p->n_fp > 1 || !!(p->opt->flag & MM_F_MULTI_SEG)); step_t *s; s = (step_t*)calloc(1, sizeof(step_t)); if (p->n_fp > 1) s->seq = mm_bseq_read_multi(p->n_fp, p->fp, p->mini_batch_size, with_qual, &s->n_seq); - else s->seq = mm_bseq_read2(p->fp[0], p->mini_batch_size, with_qual, !!(p->opt->flag&MM_F_MULTI_SEG), &s->n_seq); + else s->seq = mm_bseq_read2(p->fp[0], p->mini_batch_size, with_qual, multi_seg, &s->n_seq); if (s->seq) { s->p = p; for (i = 0; i < s->n_seq; ++i) @@ -358,12 +370,20 @@ static void *worker_pipeline(void *shared, int step, void *in) s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*)); for (i = 0; i < p->n_threads; ++i) s->buf[i] = mm_tbuf_init(); - s->n_reg = (int*)calloc(s->n_seq, sizeof(int)); + s->n_reg = (int*)calloc(3 * s->n_seq, sizeof(int)); + s->seg_off = s->n_reg + s->n_seq; + s->n_seg = s->seg_off + s->n_seq; s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t*)); + for (i = 1, j = 0; i <= s->n_seq; ++i) + if (i == s->n_seq || !multi_seg || !mm_qname_same(s->seq[i-1].name, s->seq[i].name)) { + s->n_seg[s->n_frag] = i - j; + s->seg_off[s->n_frag++] = j; + j = i; + } return s; } else free(s); } else if (step == 1) { // step 1: map - kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq); + kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag); return in; } else if (step == 2) { // step 2: output void *km = 0;