diff --git a/bwamem.c b/bwamem.c index 9350943..e1b6205 100644 --- a/bwamem.c +++ b/bwamem.c @@ -714,6 +714,95 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons #undef is_mapped } +void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m) +{ + int i, copy_mate = 0; + mem_aln_t ptmp = list[which], *p = &ptmp; // make a copy of the alignment to convert + + // set flag + p->flag |= m? 0x1 : 0; // is paired in sequencing + p->flag |= p->rid < 0? 4 : 0; // is mapped + p->flag |= m && m->rid < 0? 8 : 0; // is mate mapped + if (p->rid < 0 && m && m->rid >= 0) + p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0, copy_mate = 1; + p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand + p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand + if (p->rid >= 0 && m && m->rid <= 0 && p->is_rev) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read + + // print up to CIGAR + kputs(s->name, str); kputc('\t', str); // QNAME + kputw(p->flag, str); kputc('\t', str); // FLAG + if (p->rid >= 0) { // with coordinate + kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME + kputl(p->pos + 1, str); kputc('\t', str); // POS + kputw(p->mapq, str); kputc('\t', str); // MAPQ + if (p->n_cigar) { // aligned + for (i = 0; i < p->n_cigar; ++i) { + kputw(p->cigar[i]>>4, str); kputc("MIDSH"[p->cigar[i]&0xf], str); + } + } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true) + } else kputsn("*\t0\t0\t*", 7, str); // without coordinte + kputc('\t', str); + + // print the mate position if applicable + if (m && m->rid) { + if (p->rid == m->rid) kputc('=', str); + else kputs(bns->anns[m->rid].name, str); + kputc('\t', str); + kputl(m->pos + 1, str); + if (p->rid == m->rid) { + int64_t p0 = p->r5 < bns->l_pac? p->r5 : (bns->l_pac<<1) - 1 - p->r5; + int64_t p1 = m->r5 < bns->l_pac? m->r5 : (bns->l_pac<<1) - 1 - m->r5; + kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); + } else kputc('0', str); + } else if (m && p->rid) { + kputsn("\t=\t", 3, str); + kputl(p->pos + 1, str); + kputsn("\t0\t", 3, str); + } else kputsn("*\t0\t0\t", 6, str); + + // print SEQ and QUAL + if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL + kputsn("*\t*", 3, str); + } else if (!p->is_rev) { // the forward strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar) { + if ((p->cigar[0]&0xf) == 4) qb += p->cigar[0]>>4; + if ((p->cigar[p->n_cigar-1]&0xf) == 4) qe -= p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } else { // the reverse strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar) { + if ((p->cigar[0]&0xf) == 4) qe -= p->cigar[0]>>4; + if ((p->cigar[p->n_cigar-1]&0xf) == 4) qb += p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } + + // print optional tags + if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); } + if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } + if (s->comment) { kputc('\t', str); kputs(s->comment, str); } + kputc('\n', str); +} + /************************ * Integrated interface * ************************/ @@ -816,14 +905,20 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; - int i, w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; - int64_t pos, rb = ar->rb, re = ar->re; + int i, w2, qb, qe, NM, score, is_rev; + int64_t pos, rb, re; uint8_t *query; + memset(&a, 0, sizeof(mem_aln_t)); + if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record + a.rid = -1; a.pos = -1; a.flag |= 0x4; + return a; + } + qb = ar->qb, qe = ar->qe; + rb = ar->rb, re = ar->re; query = malloc(l_query); for (i = 0; i < l_query; ++i) // convert to the nt4 encoding query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; - memset(&a, 0, sizeof(mem_aln_t)); a.mapq = mem_approx_mapq_se(opt, ar); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); @@ -839,13 +934,14 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); if (clip5) { memmove(a.cigar+1, a.cigar, a.n_cigar * 4); - a.cigar[0] = clip5<<4|3; + a.cigar[0] = clip5<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3); ++a.n_cigar; } - if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3; + if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3); } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; + a.r5 = rb; a.score = ar->score; a.sub = ar->sub; free(query); return a; } diff --git a/bwamem.h b/bwamem.h index 14f04d5..dce10b3 100644 --- a/bwamem.h +++ b/bwamem.h @@ -67,11 +67,15 @@ typedef struct { // TODO: This is an intermediate struct only. Better get rid of } bwahit_t; typedef struct { // This struct is only used for the convenience of API. - int rid; // reference sequence index in bntseq_t - int pos; // forward strand 5'-end mapping position + int64_t pos; // forward strand 5'-end mapping position + int rid; // reference sequence index in bntseq_t; <0 for unmapped + int flag; // extra flag uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance int n_cigar; // number of CIGAR operations uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 + + int64_t r5; // position of the 5'-end of read (for computing TLEN) + int score, sub; } mem_aln_t; #ifdef __cplusplus diff --git a/kstring.h b/kstring.h index 81d7d60..04f1c42 100644 --- a/kstring.h +++ b/kstring.h @@ -89,6 +89,23 @@ static inline int kputuw(unsigned c, kstring_t *s) return 0; } +static inline int kputl(long c, kstring_t *s) +{ + char buf[32]; + long l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + int ksprintf(kstring_t *s, const char *fmt, ...); #endif