From 8f0d43991356bd26bcb8fc9fcea4bdbba566373e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 21:25:17 -0400 Subject: [PATCH 01/12] prepare to replace the SAM printing code This move is dangerous as SAM printing is very complex, but it will benefit in the long run. The planned change will reduce the redundancy, improves clarity and most importantly makes it much easier to output multiple primary hits in an optional tag. --- bwamem.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++--- bwamem.h | 8 +++-- kstring.h | 17 +++++++++ 3 files changed, 124 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index 9350943..e1b6205 100644 --- a/bwamem.c +++ b/bwamem.c @@ -714,6 +714,95 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons #undef is_mapped } +void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m) +{ + int i, copy_mate = 0; + mem_aln_t ptmp = list[which], *p = &ptmp; // make a copy of the alignment to convert + + // set flag + p->flag |= m? 0x1 : 0; // is paired in sequencing + p->flag |= p->rid < 0? 4 : 0; // is mapped + p->flag |= m && m->rid < 0? 8 : 0; // is mate mapped + if (p->rid < 0 && m && m->rid >= 0) + p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0, copy_mate = 1; + p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand + p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand + if (p->rid >= 0 && m && m->rid <= 0 && p->is_rev) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read + + // print up to CIGAR + kputs(s->name, str); kputc('\t', str); // QNAME + kputw(p->flag, str); kputc('\t', str); // FLAG + if (p->rid >= 0) { // with coordinate + kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME + kputl(p->pos + 1, str); kputc('\t', str); // POS + kputw(p->mapq, str); kputc('\t', str); // MAPQ + if (p->n_cigar) { // aligned + for (i = 0; i < p->n_cigar; ++i) { + kputw(p->cigar[i]>>4, str); kputc("MIDSH"[p->cigar[i]&0xf], str); + } + } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true) + } else kputsn("*\t0\t0\t*", 7, str); // without coordinte + kputc('\t', str); + + // print the mate position if applicable + if (m && m->rid) { + if (p->rid == m->rid) kputc('=', str); + else kputs(bns->anns[m->rid].name, str); + kputc('\t', str); + kputl(m->pos + 1, str); + if (p->rid == m->rid) { + int64_t p0 = p->r5 < bns->l_pac? p->r5 : (bns->l_pac<<1) - 1 - p->r5; + int64_t p1 = m->r5 < bns->l_pac? m->r5 : (bns->l_pac<<1) - 1 - m->r5; + kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); + } else kputc('0', str); + } else if (m && p->rid) { + kputsn("\t=\t", 3, str); + kputl(p->pos + 1, str); + kputsn("\t0\t", 3, str); + } else kputsn("*\t0\t0\t", 6, str); + + // print SEQ and QUAL + if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL + kputsn("*\t*", 3, str); + } else if (!p->is_rev) { // the forward strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar) { + if ((p->cigar[0]&0xf) == 4) qb += p->cigar[0]>>4; + if ((p->cigar[p->n_cigar-1]&0xf) == 4) qe -= p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } else { // the reverse strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar) { + if ((p->cigar[0]&0xf) == 4) qe -= p->cigar[0]>>4; + if ((p->cigar[p->n_cigar-1]&0xf) == 4) qb += p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } + + // print optional tags + if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); } + if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } + if (s->comment) { kputc('\t', str); kputs(s->comment, str); } + kputc('\n', str); +} + /************************ * Integrated interface * ************************/ @@ -816,14 +905,20 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; - int i, w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; - int64_t pos, rb = ar->rb, re = ar->re; + int i, w2, qb, qe, NM, score, is_rev; + int64_t pos, rb, re; uint8_t *query; + memset(&a, 0, sizeof(mem_aln_t)); + if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record + a.rid = -1; a.pos = -1; a.flag |= 0x4; + return a; + } + qb = ar->qb, qe = ar->qe; + rb = ar->rb, re = ar->re; query = malloc(l_query); for (i = 0; i < l_query; ++i) // convert to the nt4 encoding query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; - memset(&a, 0, sizeof(mem_aln_t)); a.mapq = mem_approx_mapq_se(opt, ar); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); @@ -839,13 +934,14 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); if (clip5) { memmove(a.cigar+1, a.cigar, a.n_cigar * 4); - a.cigar[0] = clip5<<4|3; + a.cigar[0] = clip5<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3); ++a.n_cigar; } - if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3; + if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3); } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; + a.r5 = rb; a.score = ar->score; a.sub = ar->sub; free(query); return a; } diff --git a/bwamem.h b/bwamem.h index 14f04d5..dce10b3 100644 --- a/bwamem.h +++ b/bwamem.h @@ -67,11 +67,15 @@ typedef struct { // TODO: This is an intermediate struct only. Better get rid of } bwahit_t; typedef struct { // This struct is only used for the convenience of API. - int rid; // reference sequence index in bntseq_t - int pos; // forward strand 5'-end mapping position + int64_t pos; // forward strand 5'-end mapping position + int rid; // reference sequence index in bntseq_t; <0 for unmapped + int flag; // extra flag uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance int n_cigar; // number of CIGAR operations uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 + + int64_t r5; // position of the 5'-end of read (for computing TLEN) + int score, sub; } mem_aln_t; #ifdef __cplusplus diff --git a/kstring.h b/kstring.h index 81d7d60..04f1c42 100644 --- a/kstring.h +++ b/kstring.h @@ -89,6 +89,23 @@ static inline int kputuw(unsigned c, kstring_t *s) return 0; } +static inline int kputl(long c, kstring_t *s) +{ + char buf[32]; + long l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + int ksprintf(kstring_t *s, const char *fmt, ...); #endif From 47952b6f3ff3fd3ca48285ffc16f6535bb0b69aa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 21:35:32 -0400 Subject: [PATCH 02/12] drop an unnecessary member from mem_aln_t --- bwamem.c | 17 ++++++++++++++--- bwamem.h | 1 - 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index e1b6205..3305658 100644 --- a/bwamem.c +++ b/bwamem.c @@ -714,6 +714,17 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons #undef is_mapped } +static inline int get_rlen(int n_cigar, const uint32_t *cigar) +{ + int k, l; + for (k = l = 0; k < n_cigar; ++k) { + int op = cigar[k]&0xf; + if (op == 0 || op == 2) + l += cigar[k]>>4; + } + return l; +} + void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m) { int i, copy_mate = 0; @@ -751,8 +762,8 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m kputc('\t', str); kputl(m->pos + 1, str); if (p->rid == m->rid) { - int64_t p0 = p->r5 < bns->l_pac? p->r5 : (bns->l_pac<<1) - 1 - p->r5; - int64_t p1 = m->r5 < bns->l_pac? m->r5 : (bns->l_pac<<1) - 1 - m->r5; + int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) : 0); + int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) : 0); kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); } else kputc('0', str); } else if (m && p->rid) { @@ -941,7 +952,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; - a.r5 = rb; a.score = ar->score; a.sub = ar->sub; + a.score = ar->score; a.sub = ar->sub; free(query); return a; } diff --git a/bwamem.h b/bwamem.h index dce10b3..55cff76 100644 --- a/bwamem.h +++ b/bwamem.h @@ -74,7 +74,6 @@ typedef struct { // This struct is only used for the convenience of API. int n_cigar; // number of CIGAR operations uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 - int64_t r5; // position of the 5'-end of read (for computing TLEN) int score, sub; } mem_aln_t; From c7edaa8e84eadceae0fd9bfc475ad0d6faf7c936 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 21:55:52 -0400 Subject: [PATCH 03/12] to test the new sam writer... --- bwamem.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 3305658..7d45c44 100644 --- a/bwamem.c +++ b/bwamem.c @@ -742,7 +742,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m // print up to CIGAR kputs(s->name, str); kputc('\t', str); // QNAME - kputw(p->flag, str); kputc('\t', str); // FLAG + kputw((p->flag&0xffff) | (p->flag&0x10000? 0x100 : 0), str); kputc('\t', str); // FLAG if (p->rid >= 0) { // with coordinate kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME kputl(p->pos + 1, str); kputc('\t', str); // POS @@ -874,6 +874,38 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } +void mem_aln2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) +{ + kstring_t str; + str.l = str.m = 0; str.s = 0; + if (a->n > 0 && a->a[0].score >= opt->T) { + int k; + kvec_t(mem_aln_t) aa; + kv_init(aa); + for (k = 0; k < a->n; ++k) { + mem_alnreg_t *p = &a->a[k]; + mem_aln_t *q; + if (p->score < opt->T) continue; + if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; + if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; + q = kv_pushp(mem_aln_t, aa); + *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + q->flag |= extra_flag; + if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; + if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + } + for (k = 0; k < aa.n; ++k) + mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m); + for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); + free(aa.a); + } else { + mem_aln_t t; + t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); + mem_aln2sam(bns, &str, s, 1, &t, 0, m); + } + s->sam = str.s; +} + mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { int i; From ebb45dc42ecbdef77ffd5cd45186f0ef74feb5c1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 21:59:15 -0400 Subject: [PATCH 04/12] new code works for SE --- bwamem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7d45c44..38ab6e6 100644 --- a/bwamem.c +++ b/bwamem.c @@ -874,7 +874,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } -void mem_aln2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) +void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { kstring_t str; str.l = str.m = 0; str.s = 0; @@ -1024,7 +1024,8 @@ static void *worker2(void *data) if (!(w->opt->flag&MEM_F_PE)) { for (i = w->start; i < w->n; i += w->step) { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); - mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + //mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } } else { From 0b0455ca51a7291ea16caf4113c33c03c3691251 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 22:18:23 -0400 Subject: [PATCH 05/12] replace PE; BUGGY right now!! --- bwamem_pair.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index b9a68f1..7cbf5e0 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -230,15 +230,14 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); - extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); - extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); - extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m); + extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); + extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); - int n = 0, i, j, z[2], o, subo, n_sub; + int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; kstring_t str; mem_alnreg_v b[2]; - bwahit_t h[2]; + mem_aln_t h[2]; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment @@ -256,7 +255,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { - int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; + int is_multi[2], q_pe, score_un, q_se[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < a[i].n; ++j) @@ -292,27 +291,21 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); } - mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; - bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re); - mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; - bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re); - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s; + // write SAM + h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; + h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; + mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = strdup(str.s); str.l = 0; + mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0]); s[1].sam = str.s; } else goto no_pairing; return n; no_pairing: for (i = 0; i < 2; ++i) { - if (a[i].n && a[i].a[0].score >= opt->T) { - mem_alnreg2hit(&a[i].a[0], &h[i]); - bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); - } else { - memset(&h[i], 0, sizeof(bwahit_t)); - h[i].rb = h[i].re = -1; - h[i].flag = 1<<(6+i) | 1; - } + if (a[i].n && a[i].a[0].score >= opt->T) + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]); + else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); } - mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); - mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); + mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); + mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); return n; } From 26f4c704ed9dcaffb057163636165e52cf6f6b94 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 22:24:54 -0400 Subject: [PATCH 06/12] drop the old SAM writer --- bwamem.c | 148 -------------------------------------------------- bwamem.h | 7 --- bwamem_pair.c | 1 - 3 files changed, 156 deletions(-) diff --git a/bwamem.c b/bwamem.c index 38ab6e6..f780949 100644 --- a/bwamem.c +++ b/bwamem.c @@ -607,113 +607,6 @@ static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) return w; } -void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) -{ -#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) - int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1; - uint32_t *cigar = 0; - int64_t pos = -1; - bwahit_t ptmp, *p = &ptmp; - - if (!p_) { // in this case, generate an unmapped alignment - memset(&ptmp, 0, sizeof(bwahit_t)); - ptmp.rb = ptmp.re = -1; - } else ptmp = *p_; - p->flag |= m? 1 : 0; // is paired in sequencing - p->flag |= !is_mapped(p)? 4 : 0; // is mapped - p->flag |= m && !is_mapped(m)? 8 : 0; // is mate mapped - if (m && !is_mapped(p) && is_mapped(m)) { - p->rb = m->rb; p->re = m->re; p->qb = 0; p->qe = s->l_seq; - copy_mate = 1; - } - p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand - p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand - if (is_mapped(p) && m && !is_mapped(m) && (p->flag&0x10)) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read - kputs(s->name, str); kputc('\t', str); - if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate - int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag - if (p->flag&0x10000) sam_flag |= 0x100; - if (!copy_mate) { - int w2; - w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r); - w2 = w2 < w? w2 : w; - cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); - p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) - } else n_cigar = 0, cigar = 0; - pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); - bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - kputw(sam_flag, str); kputc('\t', str); - kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); - kputw(p->qual, str); kputc('\t', str); - if (n_cigar) { - int i, clip5, clip3; - clip5 = is_rev? s->l_seq - p->qe : p->qb; - clip3 = is_rev? p->qb : s->l_seq - p->qe; - if (clip5) { kputw(clip5, str); kputc("SH"[(is_hard!=0)], str); } - for (i = 0; i < n_cigar; ++i) { - kputw(cigar[i]>>4, str); kputc("MIDSH"[cigar[i]&0xf], str); - } - if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); } - } else kputc('*', str); - } else { // no coordinate - kputw(p->flag, str); - kputs("\t*\t0\t0\t*", str); - rid = -1; - } - if (m && is_mapped(m)) { // then print mate pos and isize - pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev); - bns_cnt_ambi(bns, pos, m->re - m->rb, &mid); - kputc('\t', str); - if (mid == rid) kputc('=', str); - else kputs(bns->anns[mid].name, str); - kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str); - kputc('\t', str); - if (mid == rid) { - int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; - int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; - kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); - } else kputw(0, str); - kputc('\t', str); - } else if (m && is_mapped(p)) { // then copy the position - kputsn("\t=\t", 3, str); - kputuw(pos - bns->anns[rid].offset + 1, str); - kputsn("\t0\t", 3, str); - } else kputsn("\t*\t0\t0\t", 7, str); - if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL - kputsn("*\t*", 3, str); - } else if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand - int i, qb = 0, qe = s->l_seq; - if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; - kputc('\t', str); - if (s->qual) { // printf qual - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; - str->s[str->l] = 0; - } else kputc('*', str); - } else { // the reverse strand - int i, qb = 0, qe = s->l_seq; - if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; - kputc('\t', str); - if (s->qual) { // printf qual - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; - str->s[str->l] = 0; - } else kputc('*', str); - } - if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); } - if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } - if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } - if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } - if (s->comment) { kputc('\t', str); kputs(s->comment, str); } - kputc('\n', str); - free(cigar); -#undef is_mapped -} - static inline int get_rlen(int n_cigar, const uint32_t *cigar) { int k, l; @@ -834,46 +727,6 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) return mapq; } -void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) -{ - h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe; - h->score = a->score; - h->sub = a->secondary >= 0? -1 : a->sub > a->csub? a->sub : a->csub; - h->qual = 0; // quality unset - h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set -} - -void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m) -{ - int k; - kstring_t str; - str.l = str.m = 0; str.s = 0; - if (a->n > 0 && a->a[0].score >= opt->T) { - int mapq0 = -1; - for (k = 0; k < a->n; ++k) { - bwahit_t h; - mem_alnreg_t *p = &a->a[k]; - if (p->score < opt->T) continue; - if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; - if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; - mem_alnreg2hit(p, &h); - bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re); - h.flag |= extra_flag; - if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard) - h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p); - if (k == 0) mapq0 = h.qual; - else if (h.qual > mapq0) h.qual = mapq0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); - } - } else { - bwahit_t h; - memset(&h, 0, sizeof(bwahit_t)); - h.rb = h.re = -1; h.flag = extra_flag; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); - } - s->sam = str.s; -} - void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { kstring_t str; @@ -1024,7 +877,6 @@ static void *worker2(void *data) if (!(w->opt->flag&MEM_F_PE)) { for (i = w->start; i < w->n; i += w->step) { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); - //mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } diff --git a/bwamem.h b/bwamem.h index 55cff76..ae201e6 100644 --- a/bwamem.h +++ b/bwamem.h @@ -59,13 +59,6 @@ typedef struct { double avg, std; } mem_pestat_t; -typedef struct { // TODO: This is an intermediate struct only. Better get rid of it. - int64_t rb, re; - int qb, qe, flag, qual; - // optional info - int score, sub; -} bwahit_t; - typedef struct { // This struct is only used for the convenience of API. int64_t pos; // forward strand 5'-end mapping position int rid; // reference sequence index in bntseq_t; <0 for unmapped diff --git a/bwamem_pair.c b/bwamem_pair.c index 7cbf5e0..5f2c7bb 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -169,7 +169,6 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) { - extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v, u; int r, i, k, y[4], ret; // y[] keeps the last hit kv_init(v); kv_init(u); From 0f88103d2a203a136fac73253eef30bc7311a220 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 23:01:51 -0400 Subject: [PATCH 07/12] SAM almost identical to 0.7.2 --- bwamem.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index f780949..a10448a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -618,20 +618,22 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar) return l; } -void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m) +void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_) { - int i, copy_mate = 0; - mem_aln_t ptmp = list[which], *p = &ptmp; // make a copy of the alignment to convert + int i; + mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert + if (m_) mtmp = *m_, m = &mtmp; // set flag p->flag |= m? 0x1 : 0; // is paired in sequencing - p->flag |= p->rid < 0? 4 : 0; // is mapped - p->flag |= m && m->rid < 0? 8 : 0; // is mate mapped - if (p->rid < 0 && m && m->rid >= 0) - p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0, copy_mate = 1; + p->flag |= p->rid < 0? 0x4 : 0; // is mapped + p->flag |= m && m->rid < 0? 0x8 : 0; // is mate mapped + if (p->rid < 0 && m && m->rid >= 0) // copy mate to alignment + p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0; + if (m && m->rid < 0 && p->rid >= 0) // copy alignment to mate + m->rid = p->rid, m->pos = p->pos, m->is_rev = p->is_rev, m->n_cigar = 0; p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand - if (p->rid >= 0 && m && m->rid <= 0 && p->is_rev) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read // print up to CIGAR kputs(s->name, str); kputc('\t', str); // QNAME @@ -649,21 +651,18 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m kputc('\t', str); // print the mate position if applicable - if (m && m->rid) { + if (m && m->rid >= 0) { if (p->rid == m->rid) kputc('=', str); else kputs(bns->anns[m->rid].name, str); kputc('\t', str); - kputl(m->pos + 1, str); + kputl(m->pos + 1, str); kputc('\t', str); if (p->rid == m->rid) { int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) : 0); int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) : 0); - kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); + kputw(m->n_cigar && p->n_cigar? p1 - p0 : 0, str); // compute TLEN if both ends mapped; otherwise, set to zero } else kputc('0', str); - } else if (m && p->rid) { - kputsn("\t=\t", 3, str); - kputl(p->pos + 1, str); - kputsn("\t0\t", 3, str); - } else kputsn("*\t0\t0\t", 6, str); + } else kputsn("*\t0\t0", 5, str); + kputc('\t', str); // print SEQ and QUAL if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL @@ -754,6 +753,7 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa } else { mem_aln_t t; t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); + t.flag |= extra_flag; mem_aln2sam(bns, &str, s, 1, &t, 0, m); } s->sam = str.s; From 6c665189ad628a08ee421ce408d8fc25e7684878 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 23:16:18 -0400 Subject: [PATCH 08/12] r359: identical output to 0.7.2 (without -a) --- bwamem.c | 2 +- bwamem_pair.c | 2 ++ example.c | 2 +- main.c | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index a10448a..71eb436 100644 --- a/bwamem.c +++ b/bwamem.c @@ -837,7 +837,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; - a.score = ar->score; a.sub = ar->sub; + a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub; free(query); return a; } diff --git a/bwamem_pair.c b/bwamem_pair.c index 5f2c7bb..6316f6a 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -295,6 +295,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = strdup(str.s); str.l = 0; mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0]); s[1].sam = str.s; + free(h[0].cigar); free(h[1].cigar); } else goto no_pairing; return n; @@ -306,5 +307,6 @@ no_pairing: } mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); + free(h[0].cigar); free(h[1].cigar); return n; } diff --git a/example.c b/example.c index b59eec2..7c25674 100644 --- a/example.c +++ b/example.c @@ -34,7 +34,7 @@ int main(int argc, char *argv[]) if (ar.a[i].secondary >= 0) continue; // skip secondary alignments a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR // print alignment - printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq); + printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); for (k = 0; k < a.n_cigar; ++k) // print CIGAR printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); printf("\t%d\n", a.NM); // print edit distance diff --git a/main.c b/main.c index b0874f0..e6cb68d 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r351" +#define PACKAGE_VERSION "0.7.2-r359-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From dab5b17c1a6c168e2ada49ccfa495b5e9eacc24a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 23:43:58 -0400 Subject: [PATCH 09/12] r360: output alternative primary alignments in XA --- bwamem.c | 19 ++++++++++++++++++- main.c | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 71eb436..7925dee 100644 --- a/bwamem.c +++ b/bwamem.c @@ -702,6 +702,22 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } + if (n > 1) { // output multiple primary hits + kputsn("\tXA:Z:", 6, str); + for (i = 0; i < n; ++i) { + const mem_aln_t *r = &list[i]; + int k; + if (i == which) continue; + kputs(bns->anns[r->rid].name, str); kputc(',', str); + kputc("+-"[r->is_rev], str); + kputl(r->pos+1, str); kputc(',', str); + for (k = 0; k < r->n_cigar; ++k) { + kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); + } + kputc(',', str); + kputw(r->NM, str); kputc(';', str); + } + } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); } @@ -742,7 +758,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); - q->flag |= extra_flag; + q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary + if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; } diff --git a/main.c b/main.c index e6cb68d..c6d6e29 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r359-beta" +#define PACKAGE_VERSION "0.7.2-r360-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From aa7cdf4bb3e2f6c323f0b45a9528dcea8f694c75 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Mar 2013 00:00:04 -0400 Subject: [PATCH 10/12] r361: flag proper pair even if multi-primary Up to here, all the features in my checklist have been implemented. --- bwamem_pair.c | 6 ++++++ main.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 6316f6a..51a844b 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -305,6 +305,12 @@ no_pairing: h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]); else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); } + if (h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. + int64_t dist; + int d; + d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); + if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; + } mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); free(h[0].cigar); free(h[1].cigar); diff --git a/main.c b/main.c index c6d6e29..26e6661 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r360-beta" +#define PACKAGE_VERSION "0.7.2-r361-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From c29b176cb6412fc9b2f6fafaa11e7bd4606334ff Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Mar 2013 00:14:36 -0400 Subject: [PATCH 11/12] r362: bugfix - occasionally wrong TLEN Use the 0.7.2 way to compute TLEN --- bwamem.c | 7 ++++--- main.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7925dee..924c97f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -657,9 +657,10 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m kputc('\t', str); kputl(m->pos + 1, str); kputc('\t', str); if (p->rid == m->rid) { - int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) : 0); - int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) : 0); - kputw(m->n_cigar && p->n_cigar? p1 - p0 : 0, str); // compute TLEN if both ends mapped; otherwise, set to zero + int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) - 1 : 0); + int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) - 1 : 0); + if (m->n_cigar == 0 || p->n_cigar == 0) kputc('0', str); + else kputl(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); } else kputc('0', str); } else kputsn("*\t0\t0", 5, str); kputc('\t', str); diff --git a/main.c b/main.c index 26e6661..b749dfc 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r361-beta" +#define PACKAGE_VERSION "0.7.2-r362-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From bdf34f6ce7438bcc2d5ca3abce4f29930f93aef4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Mar 2013 09:56:04 -0400 Subject: [PATCH 12/12] r363: XA=>XP; output mapQ in XP In BWA, XA gives hits "shadowed" by the primary hit. In BWA-MEM, we output primary hits only. Primary hits may have non-zero mapping quality. --- bwa.1 | 6 ++++-- bwamem.c | 16 ++++++++++------ main.c | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/bwa.1 b/bwa.1 index 8e90848..f01dce1 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "9 March 2013" "bwa-0.7.2" "Bioinformatics tools" +.TH bwa 1 "13 March 2013" "bwa-0.7.3" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -574,11 +574,13 @@ XM Number of mismatches in the alignment XO Number of gap opens XG Number of gap extentions XT Type: Unique/Repeat/N/Mate-sw -XA Alternative hits; format: (chr,pos,CIGAR,NM;)* +XA Alternative hits; format: /(chr,pos,CIGAR,NM;)*/ _ XS Suboptimal alignment score XF Support from forward/reverse alignment XE Number of supporting seeds +_ +XP Alt primary hits; format: /(chr,pos,CIGAR;mapQ,NM;)+/ .TE .PP diff --git a/bwamem.c b/bwamem.c index 924c97f..b6a32f7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -703,20 +703,23 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } - if (n > 1) { // output multiple primary hits - kputsn("\tXA:Z:", 6, str); + for (i = 0; i < n; ++i) + if (i != which && !(list[i].flag&0x20000)) break; // 0x20000: shadowed multi hit + if (i < n) { // there are other primary hits; output them + kputsn("\tXP:Z:", 6, str); for (i = 0; i < n; ++i) { const mem_aln_t *r = &list[i]; int k; - if (i == which) continue; + if (i == which || (list[i].flag&0x20000)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit kputs(bns->anns[r->rid].name, str); kputc(',', str); kputc("+-"[r->is_rev], str); kputl(r->pos+1, str); kputc(',', str); for (k = 0; k < r->n_cigar; ++k) { kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); } - kputc(',', str); - kputw(r->NM, str); kputc(';', str); + kputc(',', str); kputw(r->mapq, str); + kputc(',', str); kputw(r->NM, str); + kputc(';', str); } } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } @@ -833,7 +836,8 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * query = malloc(l_query); for (i = 0; i < l_query; ++i) // convert to the nt4 encoding query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; - a.mapq = mem_approx_mapq_se(opt, ar); + a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; + if (ar->secondary >= 0) a.flag |= 0x20000; bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); w2 = w2 < opt->w? w2 : opt->w; diff --git a/main.c b/main.c index b749dfc..c5b84be 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r362-beta" +#define PACKAGE_VERSION "0.7.2-r363-beta" #endif int bwa_fa2pac(int argc, char *argv[]);