diff --git a/Makefile b/Makefile index 60c2104..0f9cae0 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,10 @@ CC= gcc #CC= clang --analyze CFLAGS= -g -Wall -Wno-unused-function -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS +# set to 1 if you wish to have bam support, type 'make clean; make all' +USE_HTSLIB=0 AR= ar -DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) +DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ @@ -11,21 +13,33 @@ AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa INCLUDES= -LIBS= -lm -lz -lpthread +LIBS= -lm -lz -lpthread SUBDIRS= . .SUFFIXES:.c .o .cc .c.o: - $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ +ifeq ($(USE_HTSLIB),1) + $(CC) -c $(CFLAGS) $(DFLAGS) -DUSE_HTSLIB $(INCLUDES) -I ../htslib $< -o $@ +else + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ +endif all:$(PROG) bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) +ifeq ($(USE_HTSLIB),1) + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ ../htslib/libhts.a -L. -L../htslib -lbwa $(LIBS) +else + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) +endif bwamem-lite:libbwa.a example.o - $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS) +ifeq ($(USE_HTSLIB),1) + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ ../htslib/libhts.a -L. -L../htslib -lbwa $(LIBS) +else + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS) +endif libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) diff --git a/bamlite.c b/bamlite.c index 3704beb..f1c2e62 100644 --- a/bamlite.c +++ b/bamlite.c @@ -1,3 +1,4 @@ +#ifndef USE_HTSLIB #include #include #include @@ -208,3 +209,4 @@ int bamlite_gzclose(gzFile file) { return ret; } #endif /* USE_VERBOSE_ZLIB_WRAPPERS */ +#endif diff --git a/bamlite.h b/bamlite.h index efab7ac..443926d 100644 --- a/bamlite.h +++ b/bamlite.h @@ -1,5 +1,6 @@ #ifndef BAMLITE_H_ #define BAMLITE_H_ +#ifndef USE_HTSLIB #include #include @@ -112,3 +113,4 @@ extern "C" { #endif #endif +#endif diff --git a/bwa.c b/bwa.c index 43d8a58..b6493b2 100644 --- a/bwa.c +++ b/bwa.c @@ -7,6 +7,9 @@ #include "ksw.h" #include "utils.h" #include "kstring.h" +#ifdef USE_HTSLIB +#include +#endif #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -274,6 +277,19 @@ void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) err_printf("%s\n", bwa_pg); } +#ifdef USE_HTSLIB +void bwa_format_sam_hdr(const bntseq_t *bns, const char *rg_line, kstring_t *str) +{ + int i; + extern char *bwa_pg; + str->l = 0; str->s = 0; + for (i = 0; i < bns->n_seqs; ++i) + ksprintf(str, "@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + if (rg_line) ksprintf(str, "%s\n", rg_line); + ksprintf(str, "%s\n", bwa_pg); +} +#endif + static char *bwa_escape(char *s) { char *p, *q; @@ -319,3 +335,26 @@ err_set_rg: return 0; } +#ifdef USE_HTSLIB +bams_t *bams_init() { + return calloc(1, sizeof(bams_t)); +} + +void bams_add(bams_t *bams, bam1_t *b) { + if (bams->m == bams->l) { + bams->m = bams->m << 2; + bams->bams = realloc(bams->bams, sizeof(bam1_t) * bams->m); + } + bams->bams[bams->l] = b; + bams->l++; +} + +void bams_destroy(bams_t *bams) { + int i; + for (i = 0; i < bams->l; i++) { + bam_destroy1(bams->bams[i]); + } + free(bams->bams); + free(bams); +} +#endif diff --git a/bwa.h b/bwa.h index bbc2525..3da8698 100644 --- a/bwa.h +++ b/bwa.h @@ -4,6 +4,9 @@ #include #include "bntseq.h" #include "bwt.h" +#ifdef USE_HTSLIB +#include +#endif #define BWA_IDX_BWT 0x1 #define BWA_IDX_BNS 0x2 @@ -16,11 +19,31 @@ typedef struct { uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base } bwaidx_t; +#ifdef USE_HTSLIB +typedef struct { + int l, m; + bam1_t **bams; +} bams_t; +#endif + typedef struct { int l_seq; +#ifdef USE_HTSLIB + char *name, *comment, *seq, *qual; + bams_t *bams; +#else char *name, *comment, *seq, *qual, *sam; +#endif } bseq1_t; +// This is here to faciliate passing around HTSLIB's bam_hdr_t structure when we are not compiling with HTSLIB +#ifndef USE_HTSLIB +typedef struct { + void *ptr; +} bam_hdr_t; // DO NOT USE +#endif + + extern int bwa_verbose; extern char bwa_rg_id[256]; @@ -41,8 +64,17 @@ extern "C" { void bwa_idx_destroy(bwaidx_t *idx); void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); +#ifdef USE_HTSLIB + void bwa_format_sam_hdr(const bntseq_t *bns, const char *rg_line, kstring_t *str); +#endif char *bwa_set_rg(const char *s); +#ifdef USE_HTSLIB + bams_t *bams_init(); + void bams_add(bams_t *bams, bam1_t *b); + void bams_destroy(bams_t *bams); +#endif + #ifdef __cplusplus } #endif diff --git a/bwamem.c b/bwamem.c index 76871e0..46b53e1 100644 --- a/bwamem.c +++ b/bwamem.c @@ -15,6 +15,10 @@ #include "ksort.h" #include "utils.h" +#ifdef USE_HTSLIB +#include +#endif + #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif @@ -805,7 +809,7 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar) return l; } -void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all) +void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all, bam_hdr_t *h) { int i, l_name; mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert @@ -922,6 +926,15 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); + +#ifdef USE_HTSLIB + bam1_t *b = bam_init1(); + if (sam_parse1(str, h, b) < 0) { + // TODO: error! + } + bams_add(s->bams, b); + str->l = 0; str->s = 0; +#endif } /************************ @@ -955,7 +968,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) } // TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible -void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) +void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h) { extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); kstring_t str; @@ -987,14 +1000,16 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa mem_aln_t t; t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); t.flag |= extra_flag; - mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP); + mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP, h); } else { for (k = 0; k < aa.n; ++k) - mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP); + mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP, h); for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); free(aa.a); } +#ifndef USE_HTSLIB s->sam = str.s; +#endif if (XA) { for (k = 0; k < a->n; ++k) free(XA[k]); free(XA); @@ -1120,6 +1135,7 @@ typedef struct { bseq1_t *seqs; mem_alnreg_v *regs; int64_t n_processed; + bam_hdr_t *h; } worker_t; static void worker1(void *data, int i, int tid) @@ -1138,26 +1154,33 @@ static void worker1(void *data, int i, int tid) static void worker2(void *data, int i, int tid) { - extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], bam_hdr_t *h); extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); worker_t *w = (worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { +#ifdef USE_HTSLIB + w->seqs[i].bams = bams_init(); +#endif if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); if (w->opt->flag & MEM_F_ALN_REG) { mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); } else { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); - mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, w->h); } free(w->regs[i].a); } else { if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name); - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]); +#ifdef USE_HTSLIB + w->seqs[i<<1].bams = bams_init(); + w->seqs[1+(i<<1)].bams = bams_init(); +#endif + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1], w->h); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } } -void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) +void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, bam_hdr_t *h) { extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); worker_t w; @@ -1169,12 +1192,13 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn ctime = cputime(); rtime = realtime(); global_bns = bns; regs = malloc(n * sizeof(mem_alnreg_v)); - w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; + w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; w.seqs = seqs; w.regs = regs; w.n_processed = n_processed; w.pes = &pes[0]; w.aux = malloc(opt->n_threads * sizeof(smem_aux_t)); for (i = 0; i < opt->n_threads; ++i) w.aux[i] = smem_aux_init(); + w.h = h; kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions for (i = 0; i < opt->n_threads; ++i) smem_aux_destroy(w.aux[i]); diff --git a/bwamem.h b/bwamem.h index 7b14ca8..b822664 100644 --- a/bwamem.h +++ b/bwamem.h @@ -50,6 +50,9 @@ typedef struct { int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end int max_hits; // if there are max_hits or fewer, output them all int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset +#ifdef USE_HTSLIB + int bam_output; +#endif } mem_opt_t; typedef struct { @@ -123,8 +126,9 @@ extern "C" { * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. + * @param h the BAM header, NULL if not using HTSLIB */ - void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); + void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, bam_hdr_t *h); /** * Find the aligned regions for one query sequence diff --git a/bwamem_extra.c b/bwamem_extra.c index 59bb68e..ff28555 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -80,7 +80,11 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * return ar; } +#ifdef USE_HTSLIB +void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, bam_hdr_t *h) +#else void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) +#endif { int i; kstring_t str = {0,0,0}; @@ -102,7 +106,15 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb)); kputc('\n', &str); } +#ifdef USE_HTSLIB + bam1_t *b = bam_init1(); + if (sam_parse1(&str, h, b) < 0) { + // TODO: error! + } + bams_add(s->bams, b); +#else s->sam = str.s; +#endif } // Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. diff --git a/bwamem_pair.c b/bwamem_pair.c index a3aeb80..80e98ac 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -242,12 +242,12 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons #define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) -int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], bam_hdr_t *header) { extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); - extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); - extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, int softclip_all); + extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h); + extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, int softclip_all, bam_hdr_t *h); extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query); int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; @@ -327,8 +327,14 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // write SAM h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0; h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0; - mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0; - mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s; + mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP, header); +#ifndef USE_HTSLIB + s[0].sam = strdup(str.s); str.l = 0; /// not using HTSLIB +#endif + mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP, header); +#ifndef USE_HTSLIB + s[1].sam = str.s; // not using HTSLIB +#endif if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); free(h[0].cigar); // h[0].XA will be freed in the following block free(h[1].cigar); @@ -354,8 +360,8 @@ no_pairing: d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; } - mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); - mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); + mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1], header); + mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0], header); if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); free(h[0].cigar); free(h[1].cigar); return n; diff --git a/bwaseqio.c b/bwaseqio.c index d850307..3512d7d 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -2,7 +2,11 @@ #include #include "bwtaln.h" #include "utils.h" +#ifdef USE_HTSLIB +#include +#else #include "bamlite.h" +#endif #include "kseq.h" KSEQ_DECLARE(gzFile) @@ -17,7 +21,12 @@ static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4 struct __bwa_seqio_t { // for BAM input int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE +#ifdef USE_HTSLIB + samFile *fp; + bam_hdr_t *h; +#else bamFile fp; +#endif // for fastq input kseq_t *ks; }; @@ -25,14 +34,24 @@ struct __bwa_seqio_t { bwa_seqio_t *bwa_bam_open(const char *fn, int which) { bwa_seqio_t *bs; +#ifndef USE_HTSLIB bam_header_t *h; +#endif bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); bs->is_bam = 1; bs->which = which; +#ifdef USE_HTSLIB + bs->fp = sam_open(fn, "rb"); +#else bs->fp = bam_open(fn, "r"); +#endif if (0 == bs->fp) err_fatal_simple("Couldn't open bam file"); +#ifdef USE_HTSLIB + bs->h = sam_hdr_read(bs->fp); +#else h = bam_header_read(bs->fp); bam_header_destroy(h); +#endif return bs; } @@ -50,7 +69,12 @@ void bwa_seq_close(bwa_seqio_t *bs) { if (bs == 0) return; if (bs->is_bam) { +#ifdef USE_HTSLIB + if (0 != sam_close(bs->fp)) err_fatal_simple("Error closing sam/bam file"); + bam_hdr_destroy(bs->h); +#else if (0 != bam_close(bs->fp)) err_fatal_simple("Error closing bam file"); +#endif } else { err_gzclose(bs->ks->f->f); kseq_destroy(bs->ks); @@ -101,7 +125,11 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com b = bam_init1(); n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); +#ifdef USE_HTSLIB + while ((res = sam_read1(bs->fp, bs->h, b)) >= 0) { +#else while ((res = bam_read1(bs->fp, b)) >= 0) { +#endif uint8_t *s, *q; int go = 0; if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; @@ -114,14 +142,26 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; +#ifdef USE_HTSLIB + s = bam_get_seq(b); q = bam_get_qual(b); +#else s = bam1_seq(b); q = bam1_qual(b); +#endif p->seq = (ubyte_t*)calloc(p->len + 1, 1); p->qual = (ubyte_t*)calloc(p->len + 1, 1); for (i = 0; i != p->full_len; ++i) { +#ifdef USE_HTSLIB + p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)]; +#else p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; +#endif p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126; } +#ifdef USE_HTSLIB + if (bam_is_rev(b)) { // then reverse +#else if (bam1_strand(b)) { // then reverse +#endif seq_reverse(p->len, p->seq, 1); seq_reverse(p->len, p->qual, 0); } @@ -130,7 +170,11 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); +#ifdef USE_HTSLIB + p->name = strdup((const char*)bam_get_qname(b)); +#else p->name = strdup((const char*)bam1_qname(b)); +#endif if (n_seqs == n_needed) break; } if (res < 0 && res != -1) err_fatal_simple("Error reading bam file"); diff --git a/fastmap.c b/fastmap.c index 479e878..ebcb826 100644 --- a/fastmap.c +++ b/fastmap.c @@ -53,7 +53,11 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); +#ifdef USE_HTSLIB + while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:o:")) >= 0) { +#else while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:")) >= 0) { +#endif if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -120,7 +124,13 @@ int main_mem(int argc, char *argv[]) if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n", __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); +#ifdef USE_HTSLIB + } else if (c == 'o') { + opt->bam_output = atoi(optarg); } +#else + } +#endif else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; @@ -165,6 +175,9 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); fprintf(stderr, " FR orientation only. [inferred]\n"); +#ifdef USE_HTSLIB + fprintf(stderr, " -o INT 0 - BAM (compressed), 1 - BAM (uncompressed), 2 - SAM [%d]\n", opt->bam_output); +#endif fprintf(stderr, "\n"); fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); @@ -226,8 +239,33 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } } - if (!(opt->flag & MEM_F_ALN_REG)) + bam_hdr_t *h = NULL; // TODO +#ifdef USE_HTSLIB + samFile *out = NULL; + char *modes[] = {"wb", "wbu", "w"}; + switch (opt->bam_output) { + case 0: // BAM compressed + case 1: // BAM uncompressed + case 2: // SAM + out = sam_open("-", modes[opt->bam_output]); + break; + default: + fprintf(stderr, "Error: output format was out of range [%d]\n", opt->bam_output); + return 1; + } +#endif + if (!(opt->flag & MEM_F_ALN_REG)) { +#ifdef USE_HTSLIB + kstring_t str; + str.l = str.m = 0; str.s = 0; + bwa_format_sam_hdr(idx->bns, rg_line, &str); + h = sam_hdr_parse(str.l, str.s); + h->l_text = str.l; h->text = str.s; + sam_hdr_write(out, h); +#else bwa_print_sam_hdr(idx->bns, rg_line); +#endif + } while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; if ((opt->flag & MEM_F_PE) && (n&1) == 1) { @@ -242,11 +280,22 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < n; ++i) size += seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); - mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0); + mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0, h); n_processed += n; for (i = 0; i < n; ++i) { +#ifdef USE_HTSLIB + if (seqs[i].bams) { + int j; + for (j = 0; j < seqs[i].bams->l; j++) { + sam_write1(out, h, seqs[i].bams->bams[j]); + } + } + bams_destroy(seqs[i].bams); seqs[i].bams = NULL; +#else if (seqs[i].sam) err_fputs(seqs[i].sam, stdout); - free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); + free(seqs[i].sam); +#endif + free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); } free(seqs); } @@ -255,6 +304,10 @@ int main_mem(int argc, char *argv[]) bwa_idx_destroy(idx); kseq_destroy(ks); err_gzclose(fp); kclose(ko); +#ifdef USE_HTSLIB + sam_close(out); + bam_hdr_destroy(h); +#endif if (ks2) { kseq_destroy(ks2); err_gzclose(fp2); kclose(ko2); diff --git a/main.c b/main.c index b3ce197..3947d6e 100644 --- a/main.c +++ b/main.c @@ -86,8 +86,15 @@ int main(int argc, char *argv[]) fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } +#ifdef USE_HTSLIB + if (strcmp(argv[1], "mem") != 0) { + err_fflush(stdout); + err_fclose(stdout); + } +#else err_fflush(stdout); err_fclose(stdout); +#endif if (ret == 0) { fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); fprintf(stderr, "[%s] CMD:", __func__);