get batch sequence reader ready for paired-end
This commit is contained in:
parent
ef84e8b4e7
commit
5400191097
65
bseq.c
65
bseq.c
|
|
@ -4,12 +4,16 @@
|
|||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "bseq.h"
|
||||
#include "kvec.h"
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT2(, gzFile, gzread)
|
||||
|
||||
#define CHECK_PAIR_THRES 1000000
|
||||
|
||||
struct mm_bseq_file_s {
|
||||
gzFile fp;
|
||||
kseq_t *ks;
|
||||
mm_bseq1_t s;
|
||||
};
|
||||
|
||||
mm_bseq_file_t *mm_bseq_open(const char *fn)
|
||||
|
|
@ -31,32 +35,61 @@ void mm_bseq_close(mm_bseq_file_t *fp)
|
|||
free(fp);
|
||||
}
|
||||
|
||||
static inline int mm_qname_same(const char *s1, const char *s2)
|
||||
{
|
||||
int l1, l2;
|
||||
l1 = strlen(s1);
|
||||
l2 = strlen(s2);
|
||||
if (l1 != l2 || l1 < 3) return 0;
|
||||
if (!(s1[l1-1] >= '1' && s1[l1-1] <= '2' && s1[l1-2] == '/')) return 0;
|
||||
if (!(s2[l2-1] >= '1' && s2[l2-1] <= '2' && s2[l2-2] == '/')) return 0;
|
||||
return (strncmp(s1, s2, l1 - 2) == 0);
|
||||
}
|
||||
|
||||
static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual)
|
||||
{
|
||||
s->name = strdup(ks->name.s);
|
||||
s->seq = strdup(ks->seq.s);
|
||||
s->qual = with_qual && ks->qual.l? strdup(ks->qual.s) : 0;
|
||||
s->l_seq = ks->seq.l;
|
||||
}
|
||||
|
||||
mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int *n_)
|
||||
{
|
||||
int size = 0, m, n;
|
||||
mm_bseq1_t *seqs;
|
||||
int64_t size = 0;
|
||||
kvec_t(mm_bseq1_t) a = {0,0,0};
|
||||
kseq_t *ks = fp->ks;
|
||||
m = n = 0; seqs = 0;
|
||||
if (fp->s.seq) {
|
||||
kv_resize(mm_bseq1_t, 0, a, 256);
|
||||
kv_push(mm_bseq1_t, 0, a, fp->s);
|
||||
size = fp->s.l_seq;
|
||||
memset(&fp->s, 0, sizeof(mm_bseq1_t));
|
||||
}
|
||||
while (kseq_read(ks) >= 0) {
|
||||
mm_bseq1_t *s;
|
||||
assert(ks->seq.l <= INT32_MAX);
|
||||
if (n >= m) {
|
||||
m = m? m<<1 : 256;
|
||||
seqs = (mm_bseq1_t*)realloc(seqs, m * sizeof(mm_bseq1_t));
|
||||
if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256);
|
||||
kv_pushp(mm_bseq1_t, 0, a, &s);
|
||||
kseq2bseq(ks, s, with_qual);
|
||||
size += s->l_seq;
|
||||
if (size >= chunk_size) {
|
||||
if (a.a[a.n-1].l_seq < CHECK_PAIR_THRES) {
|
||||
while (kseq_read(ks) >= 0) {
|
||||
kseq2bseq(ks, &fp->s, with_qual);
|
||||
if (mm_qname_same(fp->s.name, a.a[a.n-1].name)) {
|
||||
kv_push(mm_bseq1_t, 0, a, fp->s);
|
||||
memset(&fp->s, 0, sizeof(mm_bseq1_t));
|
||||
} else break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
s = &seqs[n];
|
||||
s->name = strdup(ks->name.s);
|
||||
s->seq = strdup(ks->seq.s);
|
||||
s->qual = with_qual && ks->qual.l? strdup(ks->qual.s) : 0;
|
||||
s->l_seq = ks->seq.l;
|
||||
size += seqs[n++].l_seq;
|
||||
if (size >= chunk_size) break;
|
||||
}
|
||||
*n_ = n;
|
||||
return seqs;
|
||||
*n_ = a.n;
|
||||
return a.a;
|
||||
}
|
||||
|
||||
int mm_bseq_eof(mm_bseq_file_t *fp)
|
||||
{
|
||||
return ks_eof(fp->ks->f);
|
||||
return (ks_eof(fp->ks->f) && fp->s.seq == 0);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue