diff --git a/bwa.c b/bwa.c index 33edd7f..21b71b8 100644 --- a/bwa.c +++ b/bwa.c @@ -6,6 +6,7 @@ #include "bwa.h" #include "ksw.h" #include "utils.h" +#include "kstring.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -91,6 +92,8 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa uint8_t tmp, *rseq; int i; int64_t rlen; + kstring_t str; + *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); @@ -122,18 +125,30 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); } {// compute NM - int k, x, y, n_mm = 0, n_gap = 0; - for (k = 0, x = y = 0; k < *n_cigar; ++k) { + int k, x, y, u, n_mm = 0, n_gap = 0; + str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR + for (k = 0, x = y = u = 0; k < *n_cigar; ++k) { int op = cigar[k]&0xf; int len = cigar[k]>>4; if (op == 0) { // match - for (i = 0; i < len; ++i) - if (query[x + i] != rseq[y + i]) ++n_mm; + for (i = 0; i < len; ++i) { + if (query[x + i] != rseq[y + i]) { + kputw(u, &str); kputc("ACGTN"[rseq[y+i]], &str); + ++n_mm; u = 0; + } else ++u; + } x += len; y += len; - } else if (op == 1) x += len, n_gap += len; - else if (op == 2) y += len, n_gap += len; + } else if (op == 2) { // deletion + kputw(u, &str); kputc('^', &str); + for (i = 0; i < len; ++i) + kputc("ACGTN"[rseq[y+i]], &str); + u = 0; + y += len, n_gap += len; + } else if (op == 1) x += len, n_gap += len; // insertion } + kputw(u, &str); kputc(0, &str); *NM = n_mm + n_gap; + cigar = (uint32_t*)str.s; } if (rb >= l_pac) // reverse back query for (i = 0; i < l_query>>1; ++i) diff --git a/bwamem.c b/bwamem.c index 27b9c4c..6f77064 100644 --- a/bwamem.c +++ b/bwamem.c @@ -772,7 +772,10 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m } // print optional tags - if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); } + if (p->n_cigar) { + kputsn("\tNM:i:", 6, str); kputw(p->NM, str); + kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str); + } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } @@ -910,7 +913,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; - int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30); + int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD; int64_t pos, rb, re; uint8_t *query; @@ -943,27 +946,34 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * last_sc = score; w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); + l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); a.is_rev = is_rev; - if (a.n_cigar > 0) { + if (a.n_cigar > 0) { // squeeze out leading or trailing deletions if ((a.cigar[0]&0xf) == 2) { pos += a.cigar[0]>>4; --a.n_cigar; - memmove(a.cigar, a.cigar + 1, a.n_cigar * 4); - } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar; + memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD); + } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) { + --a.n_cigar; + memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly + } } if (qb != 0 || qe != l_query) { // add clipping to CIGAR int clip5, clip3; clip5 = is_rev? l_query - qe : qb; clip3 = is_rev? qb : l_query - qe; - a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); + a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD); if (clip5) { - memmove(a.cigar+1, a.cigar, a.n_cigar * 4); + memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping a.cigar[0] = clip5<<4 | 3; ++a.n_cigar; } - if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | 3; + if (clip3) { + memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping + a.cigar[a.n_cigar++] = clip3<<4 | 3; + } } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; diff --git a/main.c b/main.c index 20cbf83..46e794d 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r430" +#define PACKAGE_VERSION "0.7.5a-r431" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -14,17 +14,9 @@ int bwa_bwt2sa(int argc, char *argv[]); int bwa_index(int argc, char *argv[]); int bwt_bwtgen_main(int argc, char *argv[]); -int bwa_aln(int argc, char *argv[]); -int bwa_sai2sam_se(int argc, char *argv[]); -int bwa_sai2sam_pe(int argc, char *argv[]); - -int bwa_bwtsw2(int argc, char *argv[]); - int main_fastmap(int argc, char *argv[]); int main_mem(int argc, char *argv[]); -int main_pemerge(int argc, char *argv[]); - char *bwa_pg; static int usage() @@ -37,11 +29,6 @@ static int usage() fprintf(stderr, "Command: index index sequences in the FASTA format\n"); fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); - fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); - fprintf(stderr, " aln gapped/ungapped alignment\n"); - fprintf(stderr, " samse generate alignment (single ended)\n"); - fprintf(stderr, " sampe generate alignment (paired ended)\n"); - fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); @@ -73,15 +60,8 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); - else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); - else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); - else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); - else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); - else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1;