moved some common code to bwa.{c,h}

2013-02-23 15:30:46 -05:00 · 2013-02-23 15:30:46 -05:00 · e613195e17
parent 3c330d5049
commit e613195e17
8 changed files with 132 additions and 104 deletions
--- a/14
+++ b/14
@ -3,7 +3,7 @@ CFLAGS=		-g -Wall -O2
 CXXFLAGS=	$(CFLAGS)
 AR=			ar
 DFLAGS=		-DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
-LOBJS=		utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o
+LOBJS=		utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
 AOBJS=		QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
 			is.o bwtindex.o bwape.o \
 			bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
@ -28,14 +28,16 @@ bwa:libbwa.a $(AOBJS) main.o
 libbwa.a:$(LOBJS)
 		$(AR) -csru $@ $(LOBJS)

+QSufSort.o:QSufSort.h
+bwt_gen.o:QSufSort.h
+
+ksw.o:ksw.h
+utils.o:utils.h ksort.h kseq.h
+bntseq.o:bntseq.h
+bwt.o:bwt.h utils.h
 bwa.o:bwa.h

-QSufSort.o:QSufSort.h
-
-bwt.o:bwt.h
-bwtio.o:bwt.h
 bwtaln.o:bwt.h bwtaln.h kseq.h
-bntseq.o:bntseq.h
 bwtgap.o:bwtgap.h bwtaln.h bwt.h

 bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
--- a/bwa.c
+++ b/bwa.c
@ -0,0 +1,96 @@
+#include <stdio.h>
+#include <zlib.h>
+#include "bntseq.h"
+#include "bwa.h"
+#include "ksw.h"
+
+/************************
+ * Batch FASTA/Q reader *
+ ************************/
+
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
+
+static inline void trim_readno(kstring_t *s)
+{
+	if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
+		s->l -= 2, s->s[s->l] = 0;
+}
+
+static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
+{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
+	s->name = strdup(ks->name.s);
+	s->comment = ks->comment.l? strdup(s->comment) : 0;
+	s->seq = strdup(ks->seq.s);
+	s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
+	s->l_seq = strlen(s->seq);
+}
+
+bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
+{
+	kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
+	int size = 0, m, n;
+	bseq1_t *seqs;
+	m = n = 0; seqs = 0;
+	while (kseq_read(ks) >= 0) {
+		if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
+			fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
+			break;
+		}
+		if (n >= m) {
+			m = m? m<<1 : 256;
+			seqs = realloc(seqs, m * sizeof(bseq1_t));
+		}
+		trim_readno(&ks->name);
+		kseq2bseq1(ks, &seqs[n]);
+		size += seqs[n++].l_seq;
+		if (ks2) {
+			trim_readno(&ks2->name);
+			kseq2bseq1(ks2, &seqs[n]);
+			size += seqs[n++].l_seq;
+		}
+		if (size >= chunk_size) break;
+	}
+	if (size == 0) { // test if the 2nd file is finished
+		if (ks2 && kseq_read(ks2) >= 0)
+			fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
+	}
+	*n_ = n;
+	return seqs;
+}
+
+// Generate CIGAR when the alignment end points are known
+uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar)
+{
+	uint32_t *cigar = 0;
+	uint8_t tmp, *rseq;
+	int i, w;
+	int64_t rlen;
+	*n_cigar = 0;
+	if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
+	rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
+	if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
+	if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
+		for (i = 0; i < l_query>>1; ++i)
+			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
+		for (i = 0; i < rlen>>1; ++i)
+			tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
+	}
+	//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
+	//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
+	// set the band-width
+	w = (int)((double)(l_query * mat[0] - q) / r + 1.);
+	w = w < 1? w : 1;
+	w = w < w_? w : w_;
+	w += abs(rlen - l_query);
+	// NW alignment
+	*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
+	if (rb >= l_pac) // reverse back query
+		for (i = 0; i < l_query>>1; ++i)
+			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
+
+ret_gen_cigar:
+	free(rseq);
+	return cigar;
+}
+
--- a/bwa.h
+++ b/bwa.h
@ -0,0 +1,23 @@
+#ifndef BWA_H_
+#define BWA_H_
+
+#include <stdint.h>
+
+typedef struct {
+	int l_seq;
+	char *name, *comment, *seq, *qual, *sam;
+} bseq1_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
+
+	uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/bwamem.c
+++ b/bwamem.c
@ -485,41 +485,6 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
 * Basic hit->SAM conversion *
 *****************************/

-uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar)
-{
-	uint32_t *cigar = 0;
-	uint8_t tmp, *rseq;
-	int i, w;
-	int64_t rlen;
-	*n_cigar = 0;
-	if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
-	rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
-	if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
-	if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
-		for (i = 0; i < l_query>>1; ++i)
-			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
-		for (i = 0; i < rlen>>1; ++i)
-			tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
-	}
-	//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
-	//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
-	// set the band-width
-	w = (int)((double)(l_query * mat[0] - q) / r + 1.);
-	w = w < 1? w : 1;
-	w = w < w_? w : w_;
-	w += abs(rlen - l_query);
-	// NW alignment
-	*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
-	if (rb >= l_pac) // reverse back query
-		for (i = 0; i < l_query>>1; ++i)
-			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
-
-ret_gen_cigar:
-	free(rseq);
-	return cigar;
-}
-
-
 void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
 {
 #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
--- a/bwamem.h
+++ b/bwamem.h
@ -3,7 +3,7 @@

 #include "bwt.h"
 #include "bntseq.h"
-#include "utils.h"
+#include "bwa.h"

 #define MEM_MAPQ_COEF 40.0
 #define MEM_MAPQ_MAX  60
--- a/bwtsw2_aux.c
+++ b/bwtsw2_aux.c
@ -13,6 +13,7 @@
 #include "bwtsw2.h"
 #include "stdaln.h"
 #include "kstring.h"
+#include "bwa.h"

 #include "kseq.h"
 KSEQ_DECLARE(gzFile)
--- a/utils.c
+++ b/utils.c
@ -40,6 +40,9 @@
 KSORT_INIT(128, pair64_t, pair64_lt)
 KSORT_INIT(64,  uint64_t, ks_lt_generic)

+#include "kseq.h"
+KSEQ_INIT2(, gzFile, gzread)
+
 /********************
 * System utilities *
 ********************/
@ -160,58 +163,3 @@ double realtime()
 	gettimeofday(&tp, &tzp);
 	return tp.tv_sec + tp.tv_usec * 1e-6;
 }
-
-/************************
- * Batch FASTA/Q reader *
- ************************/
-
-#include "kseq.h"
-KSEQ_INIT2(, gzFile, gzread)
-
-static inline void trim_readno(kstring_t *s)
-{
-	if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
-		s->l -= 2, s->s[s->l] = 0;
-}
-
-static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
-{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
-	s->name = strdup(ks->name.s);
-	s->comment = ks->comment.l? strdup(s->comment) : 0;
-	s->seq = strdup(ks->seq.s);
-	s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
-	s->l_seq = strlen(s->seq);
-}
-
-bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
-{
-	kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
-	int size = 0, m, n;
-	bseq1_t *seqs;
-	m = n = 0; seqs = 0;
-	while (kseq_read(ks) >= 0) {
-		if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
-			fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
-			break;
-		}
-		if (n >= m) {
-			m = m? m<<1 : 256;
-			seqs = realloc(seqs, m * sizeof(bseq1_t));
-		}
-		trim_readno(&ks->name);
-		kseq2bseq1(ks, &seqs[n]);
-		size += seqs[n++].l_seq;
-		if (ks2) {
-			trim_readno(&ks2->name);
-			kseq2bseq1(ks2, &seqs[n]);
-			size += seqs[n++].l_seq;
-		}
-		if (size >= chunk_size) break;
-	}
-	if (size == 0) { // test if the 2nd file is finished
-		if (ks2 && kseq_read(ks2) >= 0)
-			fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
-	}
-	*n_ = n;
-	return seqs;
-}
--- a/utils.h
+++ b/utils.h
@ -52,11 +52,6 @@ typedef struct {
 typedef struct { size_t n, m; uint64_t *a; } uint64_v;
 typedef struct { size_t n, m; pair64_t *a; } pair64_v;

-typedef struct {
-	int l_seq;
-	char *name, *comment, *seq, *qual, *sam;
-} bseq1_t;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -80,8 +75,6 @@ extern "C" {
 	void ks_introsort_64 (size_t n, uint64_t *a);
 	void ks_introsort_128(size_t n, pair64_t *a);

-	bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
-
 #ifdef __cplusplus
 }
 #endif