bwasw: read and align two fastq at a time
This commit is contained in:
parent
a26096dd75
commit
06687a33b9
2
bwtsw2.h
2
bwtsw2.h
|
|
@ -39,7 +39,7 @@ extern "C" {
|
|||
|
||||
bsw2opt_t *bsw2_init_opt();
|
||||
bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool);
|
||||
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn);
|
||||
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2);
|
||||
void bsw2_destroy(bwtsw2_t *b);
|
||||
|
||||
bsw2global_t *bsw2_global_init();
|
||||
|
|
|
|||
76
bwtsw2_aux.c
76
bwtsw2_aux.c
|
|
@ -465,7 +465,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
|
|||
|
||||
/* Core routine to align reads in _seq. It is separated from
|
||||
* process_seqs() to realize multi-threading */
|
||||
static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target)
|
||||
static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe)
|
||||
{
|
||||
int x;
|
||||
bsw2opt_t opt = *_opt;
|
||||
|
|
@ -543,32 +543,32 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const
|
|||
|
||||
#ifdef HAVE_PTHREAD
|
||||
typedef struct {
|
||||
int tid;
|
||||
int tid, is_pe;
|
||||
bsw2seq_t *_seq;
|
||||
const bsw2opt_t *_opt;
|
||||
const bntseq_t *bns;
|
||||
uint8_t *pac;
|
||||
bwt_t *target;
|
||||
const bwt_t *target;
|
||||
} thread_aux_t;
|
||||
|
||||
/* another interface to bsw2_aln_core() to facilitate pthread_create() */
|
||||
static void *worker(void *data)
|
||||
{
|
||||
thread_aux_t *p = (thread_aux_t*)data;
|
||||
bsw2_aln_core(p->tid, p->_seq, p->_opt, p->bns, p->pac, p->target);
|
||||
bsw2_aln_core(p->tid, p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* process sequences stored in _seq, generate SAM lines for these
|
||||
* sequences and reset _seq afterwards. */
|
||||
static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target)
|
||||
static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe)
|
||||
{
|
||||
int i;
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
if (opt->n_threads <= 1) {
|
||||
bsw2_aln_core(0, _seq, opt, bns, pac, target);
|
||||
bsw2_aln_core(0, _seq, opt, bns, pac, target, is_pe);
|
||||
} else {
|
||||
pthread_t *tid;
|
||||
pthread_attr_t attr;
|
||||
|
|
@ -580,7 +580,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
|
|||
tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
|
||||
for (j = 0; j < opt->n_threads; ++j) {
|
||||
thread_aux_t *p = data + j;
|
||||
p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns;
|
||||
p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns; p->is_pe = is_pe;
|
||||
p->pac = pac; p->target = target;
|
||||
pthread_create(&tid[j], &attr, worker, p);
|
||||
}
|
||||
|
|
@ -588,7 +588,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
|
|||
free(data); free(tid);
|
||||
}
|
||||
#else
|
||||
bsw2_aln_core(0, _seq, opt, bns, pac, target);
|
||||
bsw2_aln_core(0, _seq, opt, bns, pac, target, is_pe);
|
||||
#endif
|
||||
|
||||
// print and reset
|
||||
|
|
@ -603,11 +603,21 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
|
|||
_seq->n = 0;
|
||||
}
|
||||
|
||||
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn)
|
||||
static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p)
|
||||
{
|
||||
gzFile fp;
|
||||
kseq_t *ks;
|
||||
int l, size = 0;
|
||||
p->tid = -1;
|
||||
p->l = ks->seq.l;
|
||||
p->name = strdup(ks->name.s);
|
||||
p->seq = strdup(ks->seq.s);
|
||||
p->qual = ks->qual.l? strdup(ks->qual.s) : 0;
|
||||
p->sam = 0;
|
||||
}
|
||||
|
||||
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2)
|
||||
{
|
||||
gzFile fp, fp2;
|
||||
kseq_t *ks, *ks2;
|
||||
int l, size = 0, is_pe = 0;
|
||||
uint8_t *pac;
|
||||
bsw2seq_t *_seq;
|
||||
|
||||
|
|
@ -622,30 +632,42 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
|
|||
fp = xzopen(fn, "r");
|
||||
ks = kseq_init(fp);
|
||||
_seq = calloc(1, sizeof(bsw2seq_t));
|
||||
while ((l = kseq_read(ks)) >= 0) {
|
||||
bsw2seq1_t *p;
|
||||
if (fn2) {
|
||||
fp2 = xzopen(fn2, "r");
|
||||
ks2 = kseq_init(fp2);
|
||||
is_pe = 1;
|
||||
} else fp2 = 0, ks2 = 0, is_pe = 0;
|
||||
while (kseq_read(ks) >= 0) {
|
||||
if (_seq->n == _seq->max) {
|
||||
_seq->max = _seq->max? _seq->max<<1 : 1024;
|
||||
_seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
|
||||
}
|
||||
p = &_seq->seq[_seq->n++];
|
||||
p->tid = -1;
|
||||
p->l = l;
|
||||
p->name = strdup(ks->name.s);
|
||||
p->seq = strdup(ks->seq.s);
|
||||
p->qual = ks->qual.l? strdup(ks->qual.s) : 0;
|
||||
p->sam = 0;
|
||||
size += l;
|
||||
kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]);
|
||||
size += ks->seq.l;
|
||||
if (ks2) {
|
||||
if (kseq_read(ks2) >= 0) {
|
||||
kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge
|
||||
size += ks->seq.l;
|
||||
} else {
|
||||
fprintf(stderr, "[%s] The second query file has fewer reads. Switched to the single-end mode for the following batches.\n", __func__);
|
||||
is_pe = 0;
|
||||
}
|
||||
}
|
||||
if (size > opt->chunk_size) {
|
||||
fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size);
|
||||
process_seqs(_seq, opt, bns, pac, target);
|
||||
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
|
||||
process_seqs(_seq, opt, bns, pac, target, is_pe);
|
||||
size = 0;
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size);
|
||||
process_seqs(_seq, opt, bns, pac, target);
|
||||
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
|
||||
process_seqs(_seq, opt, bns, pac, target, is_pe);
|
||||
// free
|
||||
free(pac);
|
||||
free(_seq->seq); free(_seq);
|
||||
kseq_destroy(ks);
|
||||
gzclose(fp);
|
||||
free(pac);
|
||||
if (fn2) {
|
||||
kseq_destroy(ks2);
|
||||
gzclose(fp2);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ int bwa_bwtsw2(int argc, char *argv[])
|
|||
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa bwasw [options] <target.prefix> <query.fa>\n\n");
|
||||
fprintf(stderr, "Usage: bwa bwasw [options] <target.prefix> <query.fa> [query2.fa]\n\n");
|
||||
fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a);
|
||||
fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b);
|
||||
fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q);
|
||||
|
|
@ -65,15 +65,6 @@ int bwa_bwtsw2(int argc, char *argv[])
|
|||
fprintf(stderr, " increase '-z' for better sensitivity.\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
if (0) {
|
||||
double c, theta, eps, delta;
|
||||
c = opt->a / log(opt->yita);
|
||||
theta = exp(-opt->b / c) / opt->yita;
|
||||
eps = exp(-opt->q / c);
|
||||
delta = exp(-opt->r / c);
|
||||
fprintf(stderr, "mismatch: %lf, gap_open: %lf, gap_ext: %lf\n\n",
|
||||
theta, eps, delta);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -85,7 +76,7 @@ int bwa_bwtsw2(int argc, char *argv[])
|
|||
strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target);
|
||||
bns = bns_restore(argv[optind]);
|
||||
|
||||
bsw2_aln(opt, bns, target, argv[optind+1]);
|
||||
bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
|
||||
|
||||
bns_destroy(bns);
|
||||
bwt_destroy(target);
|
||||
|
|
|
|||
Loading…
Reference in New Issue