diff --git a/bseq.c b/bseq.c index 94b7fca..95f063d 100644 --- a/bseq.c +++ b/bseq.c @@ -4,12 +4,16 @@ #include #include #include "bseq.h" +#include "kvec.h" #include "kseq.h" KSEQ_INIT2(, gzFile, gzread) +#define CHECK_PAIR_THRES 1000000 + struct mm_bseq_file_s { gzFile fp; kseq_t *ks; + mm_bseq1_t s; }; mm_bseq_file_t *mm_bseq_open(const char *fn) @@ -31,32 +35,61 @@ void mm_bseq_close(mm_bseq_file_t *fp) free(fp); } +static inline int mm_qname_same(const char *s1, const char *s2) +{ + int l1, l2; + l1 = strlen(s1); + l2 = strlen(s2); + if (l1 != l2 || l1 < 3) return 0; + if (!(s1[l1-1] >= '1' && s1[l1-1] <= '2' && s1[l1-2] == '/')) return 0; + if (!(s2[l2-1] >= '1' && s2[l2-1] <= '2' && s2[l2-2] == '/')) return 0; + return (strncmp(s1, s2, l1 - 2) == 0); +} + +static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual) +{ + s->name = strdup(ks->name.s); + s->seq = strdup(ks->seq.s); + s->qual = with_qual && ks->qual.l? strdup(ks->qual.s) : 0; + s->l_seq = ks->seq.l; +} + mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int *n_) { - int size = 0, m, n; - mm_bseq1_t *seqs; + int64_t size = 0; + kvec_t(mm_bseq1_t) a = {0,0,0}; kseq_t *ks = fp->ks; - m = n = 0; seqs = 0; + if (fp->s.seq) { + kv_resize(mm_bseq1_t, 0, a, 256); + kv_push(mm_bseq1_t, 0, a, fp->s); + size = fp->s.l_seq; + memset(&fp->s, 0, sizeof(mm_bseq1_t)); + } while (kseq_read(ks) >= 0) { mm_bseq1_t *s; assert(ks->seq.l <= INT32_MAX); - if (n >= m) { - m = m? m<<1 : 256; - seqs = (mm_bseq1_t*)realloc(seqs, m * sizeof(mm_bseq1_t)); + if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256); + kv_pushp(mm_bseq1_t, 0, a, &s); + kseq2bseq(ks, s, with_qual); + size += s->l_seq; + if (size >= chunk_size) { + if (a.a[a.n-1].l_seq < CHECK_PAIR_THRES) { + while (kseq_read(ks) >= 0) { + kseq2bseq(ks, &fp->s, with_qual); + if (mm_qname_same(fp->s.name, a.a[a.n-1].name)) { + kv_push(mm_bseq1_t, 0, a, fp->s); + memset(&fp->s, 0, sizeof(mm_bseq1_t)); + } else break; + } + } + break; } - s = &seqs[n]; - s->name = strdup(ks->name.s); - s->seq = strdup(ks->seq.s); - s->qual = with_qual && ks->qual.l? strdup(ks->qual.s) : 0; - s->l_seq = ks->seq.l; - size += seqs[n++].l_seq; - if (size >= chunk_size) break; } - *n_ = n; - return seqs; + *n_ = a.n; + return a.a; } int mm_bseq_eof(mm_bseq_file_t *fp) { - return ks_eof(fp->ks->f); + return (ks_eof(fp->ks->f) && fp->s.seq == 0); }