r431: added the MD tag to bwa-mem

This commit is contained in:
Heng Li 2014-01-29 12:05:11 -05:00
parent ea3dc2f003
commit f524c7d3d8
3 changed files with 40 additions and 35 deletions

27
bwa.c
View File

@ -6,6 +6,7 @@
#include "bwa.h"
#include "ksw.h"
#include "utils.h"
#include "kstring.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
@ -91,6 +92,8 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
uint8_t tmp, *rseq;
int i;
int64_t rlen;
kstring_t str;
*n_cigar = 0; *NM = -1;
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
@ -122,18 +125,30 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
}
{// compute NM
int k, x, y, n_mm = 0, n_gap = 0;
for (k = 0, x = y = 0; k < *n_cigar; ++k) {
int k, x, y, u, n_mm = 0, n_gap = 0;
str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR
for (k = 0, x = y = u = 0; k < *n_cigar; ++k) {
int op = cigar[k]&0xf;
int len = cigar[k]>>4;
if (op == 0) { // match
for (i = 0; i < len; ++i)
if (query[x + i] != rseq[y + i]) ++n_mm;
for (i = 0; i < len; ++i) {
if (query[x + i] != rseq[y + i]) {
kputw(u, &str); kputc("ACGTN"[rseq[y+i]], &str);
++n_mm; u = 0;
} else ++u;
}
x += len; y += len;
} else if (op == 1) x += len, n_gap += len;
else if (op == 2) y += len, n_gap += len;
} else if (op == 2) { // deletion
kputw(u, &str); kputc('^', &str);
for (i = 0; i < len; ++i)
kputc("ACGTN"[rseq[y+i]], &str);
u = 0;
y += len, n_gap += len;
} else if (op == 1) x += len, n_gap += len; // insertion
}
kputw(u, &str); kputc(0, &str);
*NM = n_mm + n_gap;
cigar = (uint32_t*)str.s;
}
if (rb >= l_pac) // reverse back query
for (i = 0; i < l_query>>1; ++i)

View File

@ -772,7 +772,10 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
}
// print optional tags
if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); }
if (p->n_cigar) {
kputsn("\tNM:i:", 6, str); kputw(p->NM, str);
kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str);
}
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
@ -910,7 +913,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
{
mem_aln_t a;
int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30);
int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD;
int64_t pos, rb, re;
uint8_t *query;
@ -943,27 +946,34 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
last_sc = score;
w2 <<= 1;
} while (++i < 3 && score < ar->truesc - opt->a);
l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1;
a.NM = NM;
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
a.is_rev = is_rev;
if (a.n_cigar > 0) {
if (a.n_cigar > 0) { // squeeze out leading or trailing deletions
if ((a.cigar[0]&0xf) == 2) {
pos += a.cigar[0]>>4;
--a.n_cigar;
memmove(a.cigar, a.cigar + 1, a.n_cigar * 4);
} else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar;
memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD);
} else if ((a.cigar[a.n_cigar-1]&0xf) == 2) {
--a.n_cigar;
memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly
}
}
if (qb != 0 || qe != l_query) { // add clipping to CIGAR
int clip5, clip3;
clip5 = is_rev? l_query - qe : qb;
clip3 = is_rev? qb : l_query - qe;
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD);
if (clip5) {
memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping
a.cigar[0] = clip5<<4 | 3;
++a.n_cigar;
}
if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | 3;
if (clip3) {
memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping
a.cigar[a.n_cigar++] = clip3<<4 | 3;
}
}
a.rid = bns_pos2rid(bns, pos);
a.pos = pos - bns->anns[a.rid].offset;

22
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.5a-r430"
#define PACKAGE_VERSION "0.7.5a-r431"
#endif
int bwa_fa2pac(int argc, char *argv[]);
@ -14,17 +14,9 @@ int bwa_bwt2sa(int argc, char *argv[]);
int bwa_index(int argc, char *argv[]);
int bwt_bwtgen_main(int argc, char *argv[]);
int bwa_aln(int argc, char *argv[]);
int bwa_sai2sam_se(int argc, char *argv[]);
int bwa_sai2sam_pe(int argc, char *argv[]);
int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);
int main_mem(int argc, char *argv[]);
int main_pemerge(int argc, char *argv[]);
char *bwa_pg;
static int usage()
@ -37,11 +29,6 @@ static int usage()
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
fprintf(stderr, " mem BWA-MEM algorithm\n");
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
fprintf(stderr, " aln gapped/ungapped alignment\n");
fprintf(stderr, " samse generate alignment (single ended)\n");
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries\n");
fprintf(stderr, "\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
@ -73,15 +60,8 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;