From 6bdccf2a8acf8c8b4d6c397ab0c6e9b7f9906ae7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 13:09:29 -0500 Subject: [PATCH] added a bit documentation --- bwa.h | 1 + bwamem.c | 16 +++++++------- bwamem.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++----- fastmap.c | 2 +- 4 files changed, 68 insertions(+), 14 deletions(-) diff --git a/bwa.h b/bwa.h index 208db6a..d4ca807 100644 --- a/bwa.h +++ b/bwa.h @@ -34,6 +34,7 @@ extern "C" { char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); + bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); diff --git a/bwamem.c b/bwamem.c index b6741d2..4fffe38 100644 --- a/bwamem.c +++ b/bwamem.c @@ -6,6 +6,7 @@ #ifdef HAVE_PTHREAD #include #endif + #include "kstring.h" #include "bwamem.h" #include "bntseq.h" @@ -632,19 +633,19 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } -static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { int i, j; mem_chain_v chn; mem_alnreg_v regs, tmp; - for (i = 0; i < s->l_seq; ++i) - s->seq[i] = nst_nt4_table[(int)s->seq[i]]; - chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); + for (i = 0; i < l_seq; ++i) + seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; + chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); kv_init(regs); kv_init(tmp); for (i = 0; i < chn.n; ++i) { - mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], &tmp); + mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, &chn.a[i], &tmp); for (j = 0; j < tmp.n; ++j) kv_push(mem_alnreg_t, regs, tmp.a[j]); free(chn.a[i].seeds); @@ -670,7 +671,7 @@ static void *worker1(void *data) worker_t *w = (worker_t*)data; int i; for (i = w->start; i < w->n; i += w->step) - w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); return 0; } @@ -696,7 +697,7 @@ static void *worker2(void *data) return 0; } -int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) +void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) { int i; worker_t *w; @@ -737,5 +738,4 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); } free(regs); free(w); - return 0; } diff --git a/bwamem.h b/bwamem.h index 27a3dc1..fa55b44 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,10 +31,14 @@ typedef struct { } mem_opt_t; typedef struct { - int64_t rb, re; - int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain - int sub_n; // approximate number of suboptimal hits - int secondary; // non-negative if the hit is secondary + int64_t rb, re; // [rb,re): reference sequence in the alignment + int qb, qe; // [qb,qe): query sequence in the alignment + int score; // best SW score + int sub; // 2nd best SW score + int csub; // SW score of a tandem hit + int sub_n; // approximate number of suboptimal hits + int seedcov; // length of regions coverged by seeds + int secondary; // index of the parent hit shadowing the current hit; <0 if primary } mem_alnreg_t; typedef struct { @@ -63,8 +67,57 @@ extern "C" { mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); - int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); + /** + * Align a batch of sequences and generate the alignments in the SAM format + * + * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam. + * Note that $seqs[i].sam may consist of several SAM lines if the + * corresponding sequence has multiple primary hits. + * + * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query + * sequences must be interleaved: $n must be an even number and the 2i-th + * sequence and the (2i+1)-th sequence constitute a read pair. In this + * mode, there should be enough (typically >50) unique pairs for the + * routine to infer the orientation and insert size. + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param n number of query sequences + * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call + */ + void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); + /** + * Find the aligned regions for one query sequence + * + * Note that this routine does not generate CIGAR. CIGAR should be + * generated later by bwa_gen_cigar() defined in bwa.c. + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param l_seq length of query sequence + * @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied + * + * @return list of aligned regions. + */ + mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + + /** + * Infer the insert size distribution from interleaved alignment regions + * + * This function can be called after mem_align1(), as long as paired-end + * reads are properly interleaved. + * + * @param opt alignment parameters + * @param l_pac length of concatenated reference sequence + * @param n number of query sequences; must be an even number + * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair + * @param pes inferred insert size distribution (output) + */ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); #ifdef __cplusplus diff --git a/fastmap.c b/fastmap.c index 90307b3..819e301 100644 --- a/fastmap.c +++ b/fastmap.c @@ -67,7 +67,7 @@ int main_mem(int argc, char *argv[]) ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } - while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { + while ((seqs = bseq_read(opt->chunk_size * (ko2? 2 : 1), &n, ks, ks2)) != 0) { int64_t size = 0; if (!copy_comment) for (i = 0; i < n; ++i) {