diff --git a/bseq.c b/bseq.c index 0889851..54a25f6 100644 --- a/bseq.c +++ b/bseq.c @@ -48,7 +48,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) } *n_ = n; if (size < chunk_size) { // test if the 2nd file is finished - if (kseq_read(ks2) >= 0) + if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } return seqs; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 619930b..a18ffc8 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -13,6 +13,7 @@ #include "bwtsw2.h" #include "stdaln.h" #include "kstring.h" +#include "bseq.h" #include "kseq.h" KSEQ_DECLARE(gzFile) @@ -756,24 +757,14 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * _seq->n = 0; } -static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p) -{ - p->tid = -1; - p->l = ks->seq.l; - p->name = strdup(ks->name.s); - p->seq = strdup(ks->seq.s); - p->qual = ks->qual.l? strdup(ks->qual.s) : 0; - p->comment = ks->comment.l? strdup(ks->comment.s) : 0; - p->sam = 0; -} - void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) { gzFile fp, fp2; kseq_t *ks, *ks2; - int l, size = 0, is_pe = 0; + int l, is_pe = 0, i, n; uint8_t *pac; bsw2seq_t *_seq; + bseq1_t *bseq; pac = calloc(bns->l_pac/4+1, 1); if (pac == 0) { @@ -791,34 +782,25 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c ks2 = kseq_init(fp2); is_pe = 1; } else fp2 = 0, ks2 = 0, is_pe = 0; - while (kseq_read(ks) >= 0) { - if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/') - ks->name.l -= 2, ks->name.s[ks->name.l] = 0; - if (_seq->n == _seq->max) { - _seq->max = _seq->max? _seq->max<<1 : 1024; + while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + int size = 0; + if (n > _seq->max) { + _seq->max = n; + kroundup32(_seq->max); _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); } - kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]); - size += ks->seq.l; - if (ks2) { - if (kseq_read(ks2) >= 0) { - if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/') - ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0; - kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge - size += ks->seq.l; - } else { - fprintf(stderr, "[%s] The second query file has fewer reads. Switched to the single-end mode for the following batches.\n", __func__); - is_pe = 0; - } - } - if (size > opt->chunk_size * opt->n_threads) { - fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target, is_pe); - size = 0; + _seq->n = n; + for (i = 0; i < n; ++i) { + bseq1_t *b = &bseq[i]; + bsw2seq1_t *p = &_seq->seq[i]; + p->tid = -1; p->l = b->l_seq; + p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0; + size += p->l; } + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); + free(bseq); + process_seqs(_seq, opt, bns, pac, target, is_pe); } - fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target, is_pe); // free free(pac); free(_seq->seq); free(_seq);