#include #include #include #include #include "bseq.h" #include "kseq.h" KSEQ_INIT2(, gzFile, gzread) static inline void trim_readno(kstring_t *s) { if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) s->l -= 2, s->s[s->l] = 0; } static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) { // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice s->name = strdup(ks->name.s); s->comment = ks->comment.l? strdup(s->comment) : 0; s->seq = strdup(ks->seq.s); s->qual = ks->qual.l? strdup(ks->qual.s) : 0; s->l_seq = strlen(s->seq); } bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) { kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; int size = 0, m, n; bseq1_t *seqs; m = n = 0; seqs = 0; while (kseq_read(ks) >= 0) { if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); break; } if (n >= m) { m = m? m<<1 : 256; seqs = realloc(seqs, m * sizeof(bseq1_t)); } trim_readno(&ks->name); kseq2bseq1(ks, &seqs[n]); size += seqs[n++].l_seq; if (ks2) { trim_readno(&ks2->name); kseq2bseq1(ks2, &seqs[n++]); size += seqs[n++].l_seq; } if (size >= chunk_size) break; } *n_ = n; if (size < chunk_size) { // test if the 2nd file is finished if (kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } return seqs; }