From e613195e172cea20b903ae848fc4bbf238e0a4c9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 15:30:46 -0500 Subject: [PATCH] moved some common code to bwa.{c,h} --- Makefile | 14 ++++---- bwa.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++ bwa.h | 23 +++++++++++++ bwamem.c | 35 ------------------- bwamem.h | 2 +- bwtsw2_aux.c | 1 + utils.c | 58 ++----------------------------- utils.h | 7 ---- 8 files changed, 132 insertions(+), 104 deletions(-) create mode 100644 bwa.c create mode 100644 bwa.h diff --git a/Makefile b/Makefile index bfed694..2029dc1 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o +LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ @@ -28,14 +28,16 @@ bwa:libbwa.a $(AOBJS) main.o libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) +QSufSort.o:QSufSort.h +bwt_gen.o:QSufSort.h + +ksw.o:ksw.h +utils.o:utils.h ksort.h kseq.h +bntseq.o:bntseq.h +bwt.o:bwt.h utils.h bwa.o:bwa.h -QSufSort.o:QSufSort.h - -bwt.o:bwt.h -bwtio.o:bwt.h bwtaln.o:bwt.h bwtaln.h kseq.h -bntseq.o:bntseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h diff --git a/bwa.c b/bwa.c new file mode 100644 index 0000000..eca721e --- /dev/null +++ b/bwa.c @@ -0,0 +1,96 @@ +#include +#include +#include "bntseq.h" +#include "bwa.h" +#include "ksw.h" + +/************************ + * Batch FASTA/Q reader * + ************************/ + +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(s->comment) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? strdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n]); + size += seqs[n++].l_seq; + } + if (size >= chunk_size) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; + return seqs; +} + +// Generate CIGAR when the alignment end points are known +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +{ + uint32_t *cigar = 0; + uint8_t tmp, *rseq; + int i, w; + int64_t rlen; + *n_cigar = 0; + if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range + if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + for (i = 0; i < rlen>>1; ++i) + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; + } + //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + // set the band-width + w = (int)((double)(l_query * mat[0] - q) / r + 1.); + w = w < 1? w : 1; + w = w < w_? w : w_; + w += abs(rlen - l_query); + // NW alignment + *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + if (rb >= l_pac) // reverse back query + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + +ret_gen_cigar: + free(rseq); + return cigar; +} + diff --git a/bwa.h b/bwa.h new file mode 100644 index 0000000..022b784 --- /dev/null +++ b/bwa.h @@ -0,0 +1,23 @@ +#ifndef BWA_H_ +#define BWA_H_ + +#include + +typedef struct { + int l_seq; + char *name, *comment, *seq, *qual, *sam; +} bseq1_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + + uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwamem.c b/bwamem.c index 5202fb4..43f9f2f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -485,41 +485,6 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int * Basic hit->SAM conversion * *****************************/ -uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) -{ - uint32_t *cigar = 0; - uint8_t tmp, *rseq; - int i, w; - int64_t rlen; - *n_cigar = 0; - if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand - rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); - if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range - if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position - for (i = 0; i < l_query>>1; ++i) - tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; - for (i = 0; i < rlen>>1; ++i) - tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; - } - //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); - // set the band-width - w = (int)((double)(l_query * mat[0] - q) / r + 1.); - w = w < 1? w : 1; - w = w < w_? w : w_; - w += abs(rlen - l_query); - // NW alignment - *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); - if (rb >= l_pac) // reverse back query - for (i = 0; i < l_query>>1; ++i) - tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; - -ret_gen_cigar: - free(rseq); - return cigar; -} - - void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) diff --git a/bwamem.h b/bwamem.h index 4319911..ce27c6e 100644 --- a/bwamem.h +++ b/bwamem.h @@ -3,7 +3,7 @@ #include "bwt.h" #include "bntseq.h" -#include "utils.h" +#include "bwa.h" #define MEM_MAPQ_COEF 40.0 #define MEM_MAPQ_MAX 60 diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index c727984..bc12d20 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -13,6 +13,7 @@ #include "bwtsw2.h" #include "stdaln.h" #include "kstring.h" +#include "bwa.h" #include "kseq.h" KSEQ_DECLARE(gzFile) diff --git a/utils.c b/utils.c index 1cebaab..20b09ee 100644 --- a/utils.c +++ b/utils.c @@ -40,6 +40,9 @@ KSORT_INIT(128, pair64_t, pair64_lt) KSORT_INIT(64, uint64_t, ks_lt_generic) +#include "kseq.h" +KSEQ_INIT2(, gzFile, gzread) + /******************** * System utilities * ********************/ @@ -160,58 +163,3 @@ double realtime() gettimeofday(&tp, &tzp); return tp.tv_sec + tp.tv_usec * 1e-6; } - -/************************ - * Batch FASTA/Q reader * - ************************/ - -#include "kseq.h" -KSEQ_INIT2(, gzFile, gzread) - -static inline void trim_readno(kstring_t *s) -{ - if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) - s->l -= 2, s->s[s->l] = 0; -} - -static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) -{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = strdup(ks->name.s); - s->comment = ks->comment.l? strdup(s->comment) : 0; - s->seq = strdup(ks->seq.s); - s->qual = ks->qual.l? strdup(ks->qual.s) : 0; - s->l_seq = strlen(s->seq); -} - -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) -{ - kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; - int size = 0, m, n; - bseq1_t *seqs; - m = n = 0; seqs = 0; - while (kseq_read(ks) >= 0) { - if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads - fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); - break; - } - if (n >= m) { - m = m? m<<1 : 256; - seqs = realloc(seqs, m * sizeof(bseq1_t)); - } - trim_readno(&ks->name); - kseq2bseq1(ks, &seqs[n]); - size += seqs[n++].l_seq; - if (ks2) { - trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n]); - size += seqs[n++].l_seq; - } - if (size >= chunk_size) break; - } - if (size == 0) { // test if the 2nd file is finished - if (ks2 && kseq_read(ks2) >= 0) - fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); - } - *n_ = n; - return seqs; -} diff --git a/utils.h b/utils.h index 70f4e11..a3db251 100644 --- a/utils.h +++ b/utils.h @@ -52,11 +52,6 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; -typedef struct { - int l_seq; - char *name, *comment, *seq, *qual, *sam; -} bseq1_t; - #ifdef __cplusplus extern "C" { #endif @@ -80,8 +75,6 @@ extern "C" { void ks_introsort_64 (size_t n, uint64_t *a); void ks_introsort_128(size_t n, pair64_t *a); - bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - #ifdef __cplusplus } #endif