In bwtsw, replace the batch seq-reader with bseq

This commit is contained in:
Heng Li 2013-02-06 17:12:27 -05:00
parent 901d28d5f5
commit a09db69037
2 changed files with 19 additions and 37 deletions

2
bseq.c
View File

@ -48,7 +48,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
}
*n_ = n;
if (size < chunk_size) { // test if the 2nd file is finished
if (kseq_read(ks2) >= 0)
if (ks2 && kseq_read(ks2) >= 0)
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
}
return seqs;

View File

@ -13,6 +13,7 @@
#include "bwtsw2.h"
#include "stdaln.h"
#include "kstring.h"
#include "bseq.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
@ -756,24 +757,14 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
_seq->n = 0;
}
static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p)
{
p->tid = -1;
p->l = ks->seq.l;
p->name = strdup(ks->name.s);
p->seq = strdup(ks->seq.s);
p->qual = ks->qual.l? strdup(ks->qual.s) : 0;
p->comment = ks->comment.l? strdup(ks->comment.s) : 0;
p->sam = 0;
}
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2)
{
gzFile fp, fp2;
kseq_t *ks, *ks2;
int l, size = 0, is_pe = 0;
int l, is_pe = 0, i, n;
uint8_t *pac;
bsw2seq_t *_seq;
bseq1_t *bseq;
pac = calloc(bns->l_pac/4+1, 1);
if (pac == 0) {
@ -791,34 +782,25 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
ks2 = kseq_init(fp2);
is_pe = 1;
} else fp2 = 0, ks2 = 0, is_pe = 0;
while (kseq_read(ks) >= 0) {
if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/')
ks->name.l -= 2, ks->name.s[ks->name.l] = 0;
if (_seq->n == _seq->max) {
_seq->max = _seq->max? _seq->max<<1 : 1024;
while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
int size = 0;
if (n > _seq->max) {
_seq->max = n;
kroundup32(_seq->max);
_seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
}
kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]);
size += ks->seq.l;
if (ks2) {
if (kseq_read(ks2) >= 0) {
if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/')
ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0;
kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge
size += ks->seq.l;
} else {
fprintf(stderr, "[%s] The second query file has fewer reads. Switched to the single-end mode for the following batches.\n", __func__);
is_pe = 0;
}
}
if (size > opt->chunk_size * opt->n_threads) {
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
process_seqs(_seq, opt, bns, pac, target, is_pe);
size = 0;
_seq->n = n;
for (i = 0; i < n; ++i) {
bseq1_t *b = &bseq[i];
bsw2seq1_t *p = &_seq->seq[i];
p->tid = -1; p->l = b->l_seq;
p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0;
size += p->l;
}
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size);
free(bseq);
process_seqs(_seq, opt, bns, pac, target, is_pe);
}
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
process_seqs(_seq, opt, bns, pac, target, is_pe);
// free
free(pac);
free(_seq->seq); free(_seq);