r308: added a new API to convert region to CIGAR

and an example program demonstrating how to do single-end alignment in <50
lines of C code.
This commit is contained in:
Heng Li 2013-02-27 22:28:29 -05:00
parent 64d92d26df
commit df7c3f0000
5 changed files with 97 additions and 8 deletions

View File

@ -8,7 +8,7 @@ AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamli
is.o bwtindex.o bwape.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa
PROG= bwa bwamem-lite
INCLUDES=
LIBS= -lm -lz -lpthread
SUBDIRS= .
@ -25,6 +25,9 @@ all:$(PROG)
bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
bwamem-lite:libbwa.a example.o
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)

View File

@ -679,7 +679,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
s->sam = str.s;
}
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
{
int i;
mem_chain_v chn;
@ -703,6 +703,32 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
return regs;
}
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se()
mem_alnreg_v ar;
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq);
mem_mark_primary_se(opt, ar.n, ar.a);
return ar;
}
// This routine is only used for the API purpose
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *ar)
{
mem_aln_t a;
int qb = ar->qb, qe = ar->qe, NM, score, is_rev;
int64_t pos, rb = ar->rb, re = ar->re;
memset(&a, 0, sizeof(mem_aln_t));
a.mapq = mem_approx_mapq_se(opt, ar);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, opt->w, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
a.NM = NM;
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
a.is_rev = is_rev;
a.rid = bns_pos2rid(bns, pos);
a.pos = pos - bns->anns[a.rid].offset;
return a;
}
typedef struct {
int start, step, n;
const mem_opt_t *opt;
@ -720,11 +746,11 @@ static void *worker1(void *data)
int i;
if (!(w->opt->flag&MEM_F_PE)) {
for (i = w->start; i < w->n; i += w->step)
w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
for (i = w->start; i < w->n>>1; i += w->step) {
w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
}
}
return 0;

View File

@ -49,19 +49,27 @@ typedef struct {
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
} mem_alnreg_t;
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
typedef struct {
int low, high, failed;
double avg, std;
} mem_pestat_t;
typedef struct {
typedef struct { // TODO: This is an intermediate struct only. Better get rid of it.
int64_t rb, re;
int qb, qe, flag, qual;
// optional info
int score, sub;
} bwahit_t;
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
typedef struct { // This struct is only used for the convenience of API.
int rid;
int pos;
uint32_t is_rev:1, mapq:8, NM:23;
int n_cigar;
uint32_t *cigar;
} mem_aln_t;
#ifdef __cplusplus
extern "C" {
@ -114,6 +122,8 @@ extern "C" {
*/
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *ar);
/**
* Infer the insert size distribution from interleaved alignment regions
*

50
example.c 100644
View File

@ -0,0 +1,50 @@
#include <stdio.h>
#include <zlib.h>
#include <string.h>
#include <assert.h>
#include "bwamem.h"
#include "kseq.h" // for the FASTA/Q parser
KSEQ_DECLARE(gzFile)
int main(int argc, char *argv[])
{
bwaidx_t *idx;
gzFile fp;
kseq_t *ks;
mem_opt_t *opt;
if (argc < 3) {
fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
return 1;
}
idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
assert(idx);
fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
assert(fp);
ks = kseq_init(fp); // initialize the FASTA/Q parser
opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values
while (kseq_read(ks) >= 0) { // read one sequence
mem_alnreg_v ar;
int i, k;
ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
for (i = 0; i < ar.n; ++i) { // traverse each hit
mem_aln_t a;
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
a = mem_reg2aln(opt, idx->bns, idx->pac, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq);
for (k = 0; k < a.n_cigar; ++k)
printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
printf("\t%d\n", a.NM);
free(a.cigar); // don't forget to deallocate CIGAR
}
free(ar.a); // and deallocate the hit list
}
free(opt);
kseq_destroy(ks);
gzclose(fp);
bwa_idx_destroy(idx);
return 0;
}

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.6.2-r306-beta"
#define PACKAGE_VERSION "0.6.2-r308-beta"
#endif
static int usage()