From 98787f0ae064241baeebf8cd394913a2b1cc2587 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 13:36:01 -0500 Subject: [PATCH] r295: generate NM --- bwa.c | 22 ++++++++++++++++++---- bwa.h | 2 +- bwamem.c | 5 +++-- fastmap.c | 3 ++- main.c | 2 +- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/bwa.c b/bwa.c index e8221ca..aef2ec8 100644 --- a/bwa.c +++ b/bwa.c @@ -70,13 +70,13 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) *****************/ // Generate CIGAR when the alignment end points are known -uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) { uint32_t *cigar = 0; uint8_t tmp, *rseq; int i, w; int64_t rlen; - *n_cigar = 0; + *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range @@ -95,6 +95,20 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa w += abs(rlen - l_query); // NW alignment *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + {// compute NM + int k, x, y, n_mm = 0, n_gap = 0; + for (k = 0, x = y = 0; k < *n_cigar; ++k) { + int op = cigar[k]&0xf; + int len = cigar[k]>>4; + if (op == 0) { // match + for (i = 0; i < len; ++i) + if (query[x + i] != rseq[y + i]) ++n_mm; + x += len; y += len; + } else if (op == 1) x += len, n_gap += len; + else if (op == 2) y += len, n_gap += len; + } + *NM = n_mm + n_gap; + } if (rb >= l_pac) // reverse back query for (i = 0; i < l_query>>1; ++i) tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; @@ -126,10 +140,10 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, } } if (mid >= 0) { - int i, score, n_cigar, y; + int i, score, n_cigar, y, NM; uint32_t *cigar; int64_t x; - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar); + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { int op = cigar[i]&0xf, len = cigar[i]>>4; if (op == 0) { diff --git a/bwa.h b/bwa.h index 2d6c7bf..81d40e0 100644 --- a/bwa.h +++ b/bwa.h @@ -30,7 +30,7 @@ extern "C" { bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); char *bwa_idx_infer_prefix(const char *hint); diff --git a/bwamem.c b/bwamem.c index 8d8402b..156e9b7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -496,7 +496,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) - int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0; + int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1; uint32_t *cigar = 0; int64_t pos; bwahit_t ptmp, *p = &ptmp; @@ -519,7 +519,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag if (p->flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); @@ -582,6 +582,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons str->s[str->l] = 0; } else kputc('*', str); } + if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } diff --git a/fastmap.c b/fastmap.c index b4f8ea8..81ce665 100644 --- a/fastmap.c +++ b/fastmap.c @@ -47,6 +47,7 @@ int main_mem(int argc, char *argv[]) if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 's') opt->split_width = atoi(optarg); } + if (opt->n_threads < 1) opt->n_threads = 1; if (optind + 1 >= argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); @@ -94,7 +95,7 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } } - while ((seqs = bseq_read(opt->chunk_size * (ko2? 2 : 1), &n, ks, ks2)) != 0) { + while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; if (!copy_comment) for (i = 0; i < n; ++i) { diff --git a/main.c b/main.c index 80c9fb1..a33830b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r294-beta" +#define PACKAGE_VERSION "0.6.2-r295-beta" #endif static int usage()