added a bit documentation

This commit is contained in:
Heng Li 2013-02-24 13:09:29 -05:00
parent ee59a13109
commit 6bdccf2a8a
4 changed files with 68 additions and 14 deletions

1
bwa.h
View File

@ -34,6 +34,7 @@ extern "C" {
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);

View File

@ -6,6 +6,7 @@
#ifdef HAVE_PTHREAD
#include <pthread.h>
#endif
#include "kstring.h"
#include "bwamem.h"
#include "bntseq.h"
@ -632,19 +633,19 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
s->sam = str.s;
}
static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s)
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
{
int i, j;
mem_chain_v chn;
mem_alnreg_v regs, tmp;
for (i = 0; i < s->l_seq; ++i)
s->seq[i] = nst_nt4_table[(int)s->seq[i]];
chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq);
for (i = 0; i < l_seq; ++i)
seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq);
chn.n = mem_chain_flt(opt, chn.n, chn.a);
if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
kv_init(regs); kv_init(tmp);
for (i = 0; i < chn.n; ++i) {
mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], &tmp);
mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, &chn.a[i], &tmp);
for (j = 0; j < tmp.n; ++j)
kv_push(mem_alnreg_t, regs, tmp.a[j]);
free(chn.a[i].seeds);
@ -670,7 +671,7 @@ static void *worker1(void *data)
worker_t *w = (worker_t*)data;
int i;
for (i = w->start; i < w->n; i += w->step)
w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]);
w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
return 0;
}
@ -696,7 +697,7 @@ static void *worker2(void *data)
return 0;
}
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
{
int i;
worker_t *w;
@ -737,5 +738,4 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns
free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam);
}
free(regs); free(w);
return 0;
}

View File

@ -31,10 +31,14 @@ typedef struct {
} mem_opt_t;
typedef struct {
int64_t rb, re;
int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain
int sub_n; // approximate number of suboptimal hits
int secondary; // non-negative if the hit is secondary
int64_t rb, re; // [rb,re): reference sequence in the alignment
int qb, qe; // [qb,qe): query sequence in the alignment
int score; // best SW score
int sub; // 2nd best SW score
int csub; // SW score of a tandem hit
int sub_n; // approximate number of suboptimal hits
int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
} mem_alnreg_t;
typedef struct {
@ -63,8 +67,57 @@ extern "C" {
mem_opt_t *mem_opt_init(void);
void mem_fill_scmat(int a, int b, int8_t mat[25]);
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
/**
* Align a batch of sequences and generate the alignments in the SAM format
*
* This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam.
* Note that $seqs[i].sam may consist of several SAM lines if the
* corresponding sequence has multiple primary hits.
*
* In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
* sequences must be interleaved: $n must be an even number and the 2i-th
* sequence and the (2i+1)-th sequence constitute a read pair. In this
* mode, there should be enough (typically >50) unique pairs for the
* routine to infer the orientation and insert size.
*
* @param opt alignment parameters
* @param bwt FM-index of the reference sequence
* @param bns Information of the reference
* @param pac 2-bit encoded reference
* @param n number of query sequences
* @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
*/
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
/**
* Find the aligned regions for one query sequence
*
* Note that this routine does not generate CIGAR. CIGAR should be
* generated later by bwa_gen_cigar() defined in bwa.c.
*
* @param opt alignment parameters
* @param bwt FM-index of the reference sequence
* @param bns Information of the reference
* @param pac 2-bit encoded reference
* @param l_seq length of query sequence
* @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied
*
* @return list of aligned regions.
*/
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
/**
* Infer the insert size distribution from interleaved alignment regions
*
* This function can be called after mem_align1(), as long as paired-end
* reads are properly interleaved.
*
* @param opt alignment parameters
* @param l_pac length of concatenated reference sequence
* @param n number of query sequences; must be an even number
* @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
* @param pes inferred insert size distribution (output)
*/
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
#ifdef __cplusplus

View File

@ -67,7 +67,7 @@ int main_mem(int argc, char *argv[])
ks2 = kseq_init(fp2);
opt->flag |= MEM_F_PE;
}
while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) {
while ((seqs = bseq_read(opt->chunk_size * (ko2? 2 : 1), &n, ks, ks2)) != 0) {
int64_t size = 0;
if (!copy_comment)
for (i = 0; i < n; ++i) {