diff --git a/Makefile b/Makefile index 98d0eb6..eab4198 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamli is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o -PROG= bwa +PROG= bwa bwamem-lite INCLUDES= LIBS= -lm -lz -lpthread SUBDIRS= . @@ -25,6 +25,9 @@ all:$(PROG) bwa:libbwa.a $(AOBJS) main.o $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa +bwamem-lite:libbwa.a example.o + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa + libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) diff --git a/bwamem.c b/bwamem.c index f5173d3..9950097 100644 --- a/bwamem.c +++ b/bwamem.c @@ -679,7 +679,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } -mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) +mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { int i; mem_chain_v chn; @@ -703,6 +703,32 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * return regs; } +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) +{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se() + mem_alnreg_v ar; + ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); + mem_mark_primary_se(opt, ar.n, ar.a); + return ar; +} + +// This routine is only used for the API purpose +mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *ar) +{ + mem_aln_t a; + int qb = ar->qb, qe = ar->qe, NM, score, is_rev; + int64_t pos, rb = ar->rb, re = ar->re; + memset(&a, 0, sizeof(mem_aln_t)); + a.mapq = mem_approx_mapq_se(opt, ar); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); + a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, opt->w, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); + a.NM = NM; + pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); + a.is_rev = is_rev; + a.rid = bns_pos2rid(bns, pos); + a.pos = pos - bns->anns[a.rid].offset; + return a; +} + typedef struct { int start, step, n; const mem_opt_t *opt; @@ -720,11 +746,11 @@ static void *worker1(void *data) int i; if (!(w->opt->flag&MEM_F_PE)) { for (i = w->start; i < w->n; i += w->step) - w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); + w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); } else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower for (i = w->start; i < w->n>>1; i += w->step) { - w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); - w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); + w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); + w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); } } return 0; diff --git a/bwamem.h b/bwamem.h index 5c63402..9996f6b 100644 --- a/bwamem.h +++ b/bwamem.h @@ -49,19 +49,27 @@ typedef struct { int secondary; // index of the parent hit shadowing the current hit; <0 if primary } mem_alnreg_t; +typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; + typedef struct { int low, high, failed; double avg, std; } mem_pestat_t; -typedef struct { +typedef struct { // TODO: This is an intermediate struct only. Better get rid of it. int64_t rb, re; int qb, qe, flag, qual; // optional info int score, sub; } bwahit_t; -typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; +typedef struct { // This struct is only used for the convenience of API. + int rid; + int pos; + uint32_t is_rev:1, mapq:8, NM:23; + int n_cigar; + uint32_t *cigar; +} mem_aln_t; #ifdef __cplusplus extern "C" { @@ -114,6 +122,8 @@ extern "C" { */ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *ar); + /** * Infer the insert size distribution from interleaved alignment regions * diff --git a/example.c b/example.c new file mode 100644 index 0000000..2fefde4 --- /dev/null +++ b/example.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include "bwamem.h" +#include "kseq.h" // for the FASTA/Q parser +KSEQ_DECLARE(gzFile) + +int main(int argc, char *argv[]) +{ + bwaidx_t *idx; + gzFile fp; + kseq_t *ks; + mem_opt_t *opt; + + if (argc < 3) { + fprintf(stderr, "Usage: bwamem-lite \n"); + return 1; + } + + idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index + assert(idx); + fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + assert(fp); + ks = kseq_init(fp); // initialize the FASTA/Q parser + opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values + + while (kseq_read(ks) >= 0) { // read one sequence + mem_alnreg_v ar; + int i, k; + ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits + for (i = 0; i < ar.n; ++i) { // traverse each hit + mem_aln_t a; + if (ar.a[i].secondary >= 0) continue; // skip secondary alignments + a = mem_reg2aln(opt, idx->bns, idx->pac, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR + printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq); + for (k = 0; k < a.n_cigar; ++k) + printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); + printf("\t%d\n", a.NM); + free(a.cigar); // don't forget to deallocate CIGAR + } + free(ar.a); // and deallocate the hit list + } + + free(opt); + kseq_destroy(ks); + gzclose(fp); + bwa_idx_destroy(idx); + return 0; +} diff --git a/main.c b/main.c index 0590e63..0009fc6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r306-beta" +#define PACKAGE_VERSION "0.6.2-r308-beta" #endif static int usage()