r431: added the MD tag to bwa-mem
This commit is contained in:
parent
ea3dc2f003
commit
f524c7d3d8
29
bwa.c
29
bwa.c
|
|
@ -6,6 +6,7 @@
|
||||||
#include "bwa.h"
|
#include "bwa.h"
|
||||||
#include "ksw.h"
|
#include "ksw.h"
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
#include "kstring.h"
|
||||||
|
|
||||||
#ifdef USE_MALLOC_WRAPPERS
|
#ifdef USE_MALLOC_WRAPPERS
|
||||||
# include "malloc_wrap.h"
|
# include "malloc_wrap.h"
|
||||||
|
|
@ -91,6 +92,8 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
|
||||||
uint8_t tmp, *rseq;
|
uint8_t tmp, *rseq;
|
||||||
int i;
|
int i;
|
||||||
int64_t rlen;
|
int64_t rlen;
|
||||||
|
kstring_t str;
|
||||||
|
|
||||||
*n_cigar = 0; *NM = -1;
|
*n_cigar = 0; *NM = -1;
|
||||||
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
||||||
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
||||||
|
|
@ -122,18 +125,30 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
|
||||||
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
|
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
|
||||||
}
|
}
|
||||||
{// compute NM
|
{// compute NM
|
||||||
int k, x, y, n_mm = 0, n_gap = 0;
|
int k, x, y, u, n_mm = 0, n_gap = 0;
|
||||||
for (k = 0, x = y = 0; k < *n_cigar; ++k) {
|
str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR
|
||||||
|
for (k = 0, x = y = u = 0; k < *n_cigar; ++k) {
|
||||||
int op = cigar[k]&0xf;
|
int op = cigar[k]&0xf;
|
||||||
int len = cigar[k]>>4;
|
int len = cigar[k]>>4;
|
||||||
if (op == 0) { // match
|
if (op == 0) { // match
|
||||||
for (i = 0; i < len; ++i)
|
for (i = 0; i < len; ++i) {
|
||||||
if (query[x + i] != rseq[y + i]) ++n_mm;
|
if (query[x + i] != rseq[y + i]) {
|
||||||
x += len; y += len;
|
kputw(u, &str); kputc("ACGTN"[rseq[y+i]], &str);
|
||||||
} else if (op == 1) x += len, n_gap += len;
|
++n_mm; u = 0;
|
||||||
else if (op == 2) y += len, n_gap += len;
|
} else ++u;
|
||||||
}
|
}
|
||||||
|
x += len; y += len;
|
||||||
|
} else if (op == 2) { // deletion
|
||||||
|
kputw(u, &str); kputc('^', &str);
|
||||||
|
for (i = 0; i < len; ++i)
|
||||||
|
kputc("ACGTN"[rseq[y+i]], &str);
|
||||||
|
u = 0;
|
||||||
|
y += len, n_gap += len;
|
||||||
|
} else if (op == 1) x += len, n_gap += len; // insertion
|
||||||
|
}
|
||||||
|
kputw(u, &str); kputc(0, &str);
|
||||||
*NM = n_mm + n_gap;
|
*NM = n_mm + n_gap;
|
||||||
|
cigar = (uint32_t*)str.s;
|
||||||
}
|
}
|
||||||
if (rb >= l_pac) // reverse back query
|
if (rb >= l_pac) // reverse back query
|
||||||
for (i = 0; i < l_query>>1; ++i)
|
for (i = 0; i < l_query>>1; ++i)
|
||||||
|
|
|
||||||
26
bwamem.c
26
bwamem.c
|
|
@ -772,7 +772,10 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
|
||||||
}
|
}
|
||||||
|
|
||||||
// print optional tags
|
// print optional tags
|
||||||
if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); }
|
if (p->n_cigar) {
|
||||||
|
kputsn("\tNM:i:", 6, str); kputw(p->NM, str);
|
||||||
|
kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str);
|
||||||
|
}
|
||||||
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
|
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
|
||||||
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
|
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
|
||||||
if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
|
if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
|
||||||
|
|
@ -910,7 +913,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
|
||||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
|
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
|
||||||
{
|
{
|
||||||
mem_aln_t a;
|
mem_aln_t a;
|
||||||
int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30);
|
int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD;
|
||||||
int64_t pos, rb, re;
|
int64_t pos, rb, re;
|
||||||
uint8_t *query;
|
uint8_t *query;
|
||||||
|
|
||||||
|
|
@ -943,27 +946,34 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
||||||
last_sc = score;
|
last_sc = score;
|
||||||
w2 <<= 1;
|
w2 <<= 1;
|
||||||
} while (++i < 3 && score < ar->truesc - opt->a);
|
} while (++i < 3 && score < ar->truesc - opt->a);
|
||||||
|
l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1;
|
||||||
a.NM = NM;
|
a.NM = NM;
|
||||||
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
||||||
a.is_rev = is_rev;
|
a.is_rev = is_rev;
|
||||||
if (a.n_cigar > 0) {
|
if (a.n_cigar > 0) { // squeeze out leading or trailing deletions
|
||||||
if ((a.cigar[0]&0xf) == 2) {
|
if ((a.cigar[0]&0xf) == 2) {
|
||||||
pos += a.cigar[0]>>4;
|
pos += a.cigar[0]>>4;
|
||||||
--a.n_cigar;
|
--a.n_cigar;
|
||||||
memmove(a.cigar, a.cigar + 1, a.n_cigar * 4);
|
memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD);
|
||||||
} else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar;
|
} else if ((a.cigar[a.n_cigar-1]&0xf) == 2) {
|
||||||
|
--a.n_cigar;
|
||||||
|
memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (qb != 0 || qe != l_query) { // add clipping to CIGAR
|
if (qb != 0 || qe != l_query) { // add clipping to CIGAR
|
||||||
int clip5, clip3;
|
int clip5, clip3;
|
||||||
clip5 = is_rev? l_query - qe : qb;
|
clip5 = is_rev? l_query - qe : qb;
|
||||||
clip3 = is_rev? qb : l_query - qe;
|
clip3 = is_rev? qb : l_query - qe;
|
||||||
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
|
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD);
|
||||||
if (clip5) {
|
if (clip5) {
|
||||||
memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
|
memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping
|
||||||
a.cigar[0] = clip5<<4 | 3;
|
a.cigar[0] = clip5<<4 | 3;
|
||||||
++a.n_cigar;
|
++a.n_cigar;
|
||||||
}
|
}
|
||||||
if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | 3;
|
if (clip3) {
|
||||||
|
memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping
|
||||||
|
a.cigar[a.n_cigar++] = clip3<<4 | 3;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
a.rid = bns_pos2rid(bns, pos);
|
a.rid = bns_pos2rid(bns, pos);
|
||||||
a.pos = pos - bns->anns[a.rid].offset;
|
a.pos = pos - bns->anns[a.rid].offset;
|
||||||
|
|
|
||||||
22
main.c
22
main.c
|
|
@ -4,7 +4,7 @@
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
#ifndef PACKAGE_VERSION
|
#ifndef PACKAGE_VERSION
|
||||||
#define PACKAGE_VERSION "0.7.5a-r430"
|
#define PACKAGE_VERSION "0.7.5a-r431"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int bwa_fa2pac(int argc, char *argv[]);
|
int bwa_fa2pac(int argc, char *argv[]);
|
||||||
|
|
@ -14,17 +14,9 @@ int bwa_bwt2sa(int argc, char *argv[]);
|
||||||
int bwa_index(int argc, char *argv[]);
|
int bwa_index(int argc, char *argv[]);
|
||||||
int bwt_bwtgen_main(int argc, char *argv[]);
|
int bwt_bwtgen_main(int argc, char *argv[]);
|
||||||
|
|
||||||
int bwa_aln(int argc, char *argv[]);
|
|
||||||
int bwa_sai2sam_se(int argc, char *argv[]);
|
|
||||||
int bwa_sai2sam_pe(int argc, char *argv[]);
|
|
||||||
|
|
||||||
int bwa_bwtsw2(int argc, char *argv[]);
|
|
||||||
|
|
||||||
int main_fastmap(int argc, char *argv[]);
|
int main_fastmap(int argc, char *argv[]);
|
||||||
int main_mem(int argc, char *argv[]);
|
int main_mem(int argc, char *argv[]);
|
||||||
|
|
||||||
int main_pemerge(int argc, char *argv[]);
|
|
||||||
|
|
||||||
char *bwa_pg;
|
char *bwa_pg;
|
||||||
|
|
||||||
static int usage()
|
static int usage()
|
||||||
|
|
@ -37,11 +29,6 @@ static int usage()
|
||||||
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
|
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
|
||||||
fprintf(stderr, " mem BWA-MEM algorithm\n");
|
fprintf(stderr, " mem BWA-MEM algorithm\n");
|
||||||
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
||||||
fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
|
|
||||||
fprintf(stderr, " aln gapped/ungapped alignment\n");
|
|
||||||
fprintf(stderr, " samse generate alignment (single ended)\n");
|
|
||||||
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
|
||||||
fprintf(stderr, " bwasw BWA-SW for long queries\n");
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||||
|
|
@ -73,15 +60,8 @@ int main(int argc, char *argv[])
|
||||||
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
|
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
|
||||||
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
|
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
|
||||||
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
|
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
|
||||||
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
|
|
||||||
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
|
|
||||||
else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
|
|
||||||
else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
|
||||||
else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
|
||||||
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
|
||||||
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
|
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
|
||||||
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
|
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
|
||||||
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
|
|
||||||
else {
|
else {
|
||||||
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
|
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue