r308: added a new API to convert region to CIGAR
and an example program demonstrating how to do single-end alignment in <50 lines of C code.
This commit is contained in:
parent
64d92d26df
commit
df7c3f0000
5
Makefile
5
Makefile
|
|
@ -8,7 +8,7 @@ AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamli
|
||||||
is.o bwtindex.o bwape.o \
|
is.o bwtindex.o bwape.o \
|
||||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||||
PROG= bwa
|
PROG= bwa bwamem-lite
|
||||||
INCLUDES=
|
INCLUDES=
|
||||||
LIBS= -lm -lz -lpthread
|
LIBS= -lm -lz -lpthread
|
||||||
SUBDIRS= .
|
SUBDIRS= .
|
||||||
|
|
@ -25,6 +25,9 @@ all:$(PROG)
|
||||||
bwa:libbwa.a $(AOBJS) main.o
|
bwa:libbwa.a $(AOBJS) main.o
|
||||||
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
|
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
|
||||||
|
|
||||||
|
bwamem-lite:libbwa.a example.o
|
||||||
|
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
|
||||||
|
|
||||||
libbwa.a:$(LOBJS)
|
libbwa.a:$(LOBJS)
|
||||||
$(AR) -csru $@ $(LOBJS)
|
$(AR) -csru $@ $(LOBJS)
|
||||||
|
|
||||||
|
|
|
||||||
34
bwamem.c
34
bwamem.c
|
|
@ -679,7 +679,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
|
||||||
s->sam = str.s;
|
s->sam = str.s;
|
||||||
}
|
}
|
||||||
|
|
||||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
mem_chain_v chn;
|
mem_chain_v chn;
|
||||||
|
|
@ -703,6 +703,32 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
|
||||||
return regs;
|
return regs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||||
|
{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se()
|
||||||
|
mem_alnreg_v ar;
|
||||||
|
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq);
|
||||||
|
mem_mark_primary_se(opt, ar.n, ar.a);
|
||||||
|
return ar;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This routine is only used for the API purpose
|
||||||
|
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *ar)
|
||||||
|
{
|
||||||
|
mem_aln_t a;
|
||||||
|
int qb = ar->qb, qe = ar->qe, NM, score, is_rev;
|
||||||
|
int64_t pos, rb = ar->rb, re = ar->re;
|
||||||
|
memset(&a, 0, sizeof(mem_aln_t));
|
||||||
|
a.mapq = mem_approx_mapq_se(opt, ar);
|
||||||
|
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
|
||||||
|
a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, opt->w, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
|
||||||
|
a.NM = NM;
|
||||||
|
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
||||||
|
a.is_rev = is_rev;
|
||||||
|
a.rid = bns_pos2rid(bns, pos);
|
||||||
|
a.pos = pos - bns->anns[a.rid].offset;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int start, step, n;
|
int start, step, n;
|
||||||
const mem_opt_t *opt;
|
const mem_opt_t *opt;
|
||||||
|
|
@ -720,11 +746,11 @@ static void *worker1(void *data)
|
||||||
int i;
|
int i;
|
||||||
if (!(w->opt->flag&MEM_F_PE)) {
|
if (!(w->opt->flag&MEM_F_PE)) {
|
||||||
for (i = w->start; i < w->n; i += w->step)
|
for (i = w->start; i < w->n; i += w->step)
|
||||||
w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
|
w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
|
||||||
} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
|
} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
|
||||||
for (i = w->start; i < w->n>>1; i += w->step) {
|
for (i = w->start; i < w->n>>1; i += w->step) {
|
||||||
w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
|
w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
|
||||||
w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
|
w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
||||||
14
bwamem.h
14
bwamem.h
|
|
@ -49,19 +49,27 @@ typedef struct {
|
||||||
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
||||||
} mem_alnreg_t;
|
} mem_alnreg_t;
|
||||||
|
|
||||||
|
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int low, high, failed;
|
int low, high, failed;
|
||||||
double avg, std;
|
double avg, std;
|
||||||
} mem_pestat_t;
|
} mem_pestat_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct { // TODO: This is an intermediate struct only. Better get rid of it.
|
||||||
int64_t rb, re;
|
int64_t rb, re;
|
||||||
int qb, qe, flag, qual;
|
int qb, qe, flag, qual;
|
||||||
// optional info
|
// optional info
|
||||||
int score, sub;
|
int score, sub;
|
||||||
} bwahit_t;
|
} bwahit_t;
|
||||||
|
|
||||||
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
|
typedef struct { // This struct is only used for the convenience of API.
|
||||||
|
int rid;
|
||||||
|
int pos;
|
||||||
|
uint32_t is_rev:1, mapq:8, NM:23;
|
||||||
|
int n_cigar;
|
||||||
|
uint32_t *cigar;
|
||||||
|
} mem_aln_t;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
@ -114,6 +122,8 @@ extern "C" {
|
||||||
*/
|
*/
|
||||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
|
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
|
||||||
|
|
||||||
|
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *ar);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Infer the insert size distribution from interleaved alignment regions
|
* Infer the insert size distribution from interleaved alignment regions
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,50 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <zlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include "bwamem.h"
|
||||||
|
#include "kseq.h" // for the FASTA/Q parser
|
||||||
|
KSEQ_DECLARE(gzFile)
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
bwaidx_t *idx;
|
||||||
|
gzFile fp;
|
||||||
|
kseq_t *ks;
|
||||||
|
mem_opt_t *opt;
|
||||||
|
|
||||||
|
if (argc < 3) {
|
||||||
|
fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
|
||||||
|
assert(idx);
|
||||||
|
fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
|
||||||
|
assert(fp);
|
||||||
|
ks = kseq_init(fp); // initialize the FASTA/Q parser
|
||||||
|
opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values
|
||||||
|
|
||||||
|
while (kseq_read(ks) >= 0) { // read one sequence
|
||||||
|
mem_alnreg_v ar;
|
||||||
|
int i, k;
|
||||||
|
ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
|
||||||
|
for (i = 0; i < ar.n; ++i) { // traverse each hit
|
||||||
|
mem_aln_t a;
|
||||||
|
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
|
||||||
|
a = mem_reg2aln(opt, idx->bns, idx->pac, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
|
||||||
|
printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq);
|
||||||
|
for (k = 0; k < a.n_cigar; ++k)
|
||||||
|
printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
|
||||||
|
printf("\t%d\n", a.NM);
|
||||||
|
free(a.cigar); // don't forget to deallocate CIGAR
|
||||||
|
}
|
||||||
|
free(ar.a); // and deallocate the hit list
|
||||||
|
}
|
||||||
|
|
||||||
|
free(opt);
|
||||||
|
kseq_destroy(ks);
|
||||||
|
gzclose(fp);
|
||||||
|
bwa_idx_destroy(idx);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue