prepare to replace the SAM printing code

This move is dangerous as SAM printing is very complex, but it will benefit in
the long run. The planned change will reduce the redundancy, improves clarity
and most importantly makes it much easier to output multiple primary hits in an
optional tag.
This commit is contained in:
Heng Li 2013-03-11 21:25:17 -04:00
parent 5581cb9152
commit 8f0d439913
3 changed files with 124 additions and 7 deletions

106
bwamem.c
View File

@ -714,6 +714,95 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
#undef is_mapped #undef is_mapped
} }
void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m)
{
int i, copy_mate = 0;
mem_aln_t ptmp = list[which], *p = &ptmp; // make a copy of the alignment to convert
// set flag
p->flag |= m? 0x1 : 0; // is paired in sequencing
p->flag |= p->rid < 0? 4 : 0; // is mapped
p->flag |= m && m->rid < 0? 8 : 0; // is mate mapped
if (p->rid < 0 && m && m->rid >= 0)
p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0, copy_mate = 1;
p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand
p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand
if (p->rid >= 0 && m && m->rid <= 0 && p->is_rev) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read
// print up to CIGAR
kputs(s->name, str); kputc('\t', str); // QNAME
kputw(p->flag, str); kputc('\t', str); // FLAG
if (p->rid >= 0) { // with coordinate
kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
kputl(p->pos + 1, str); kputc('\t', str); // POS
kputw(p->mapq, str); kputc('\t', str); // MAPQ
if (p->n_cigar) { // aligned
for (i = 0; i < p->n_cigar; ++i) {
kputw(p->cigar[i]>>4, str); kputc("MIDSH"[p->cigar[i]&0xf], str);
}
} else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true)
} else kputsn("*\t0\t0\t*", 7, str); // without coordinte
kputc('\t', str);
// print the mate position if applicable
if (m && m->rid) {
if (p->rid == m->rid) kputc('=', str);
else kputs(bns->anns[m->rid].name, str);
kputc('\t', str);
kputl(m->pos + 1, str);
if (p->rid == m->rid) {
int64_t p0 = p->r5 < bns->l_pac? p->r5 : (bns->l_pac<<1) - 1 - p->r5;
int64_t p1 = m->r5 < bns->l_pac? m->r5 : (bns->l_pac<<1) - 1 - m->r5;
kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str);
} else kputc('0', str);
} else if (m && p->rid) {
kputsn("\t=\t", 3, str);
kputl(p->pos + 1, str);
kputsn("\t0\t", 3, str);
} else kputsn("*\t0\t0\t", 6, str);
// print SEQ and QUAL
if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL
kputsn("*\t*", 3, str);
} else if (!p->is_rev) { // the forward strand
int i, qb = 0, qe = s->l_seq;
if (p->n_cigar) {
if ((p->cigar[0]&0xf) == 4) qb += p->cigar[0]>>4;
if ((p->cigar[p->n_cigar-1]&0xf) == 4) qe -= p->cigar[p->n_cigar-1]>>4;
}
ks_resize(str, str->l + (qe - qb) + 1);
for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
kputc('\t', str);
if (s->qual) { // printf qual
ks_resize(str, str->l + (qe - qb) + 1);
for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
str->s[str->l] = 0;
} else kputc('*', str);
} else { // the reverse strand
int i, qb = 0, qe = s->l_seq;
if (p->n_cigar) {
if ((p->cigar[0]&0xf) == 4) qe -= p->cigar[0]>>4;
if ((p->cigar[p->n_cigar-1]&0xf) == 4) qb += p->cigar[p->n_cigar-1]>>4;
}
ks_resize(str, str->l + (qe - qb) + 1);
for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]];
kputc('\t', str);
if (s->qual) { // printf qual
ks_resize(str, str->l + (qe - qb) + 1);
for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i];
str->s[str->l] = 0;
} else kputc('*', str);
}
// print optional tags
if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); }
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
kputc('\n', str);
}
/************************ /************************
* Integrated interface * * Integrated interface *
************************/ ************************/
@ -816,14 +905,20 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
{ {
mem_aln_t a; mem_aln_t a;
int i, w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; int i, w2, qb, qe, NM, score, is_rev;
int64_t pos, rb = ar->rb, re = ar->re; int64_t pos, rb, re;
uint8_t *query; uint8_t *query;
memset(&a, 0, sizeof(mem_aln_t));
if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record
a.rid = -1; a.pos = -1; a.flag |= 0x4;
return a;
}
qb = ar->qb, qe = ar->qe;
rb = ar->rb, re = ar->re;
query = malloc(l_query); query = malloc(l_query);
for (i = 0; i < l_query; ++i) // convert to the nt4 encoding for (i = 0; i < l_query; ++i) // convert to the nt4 encoding
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
memset(&a, 0, sizeof(mem_aln_t));
a.mapq = mem_approx_mapq_se(opt, ar); a.mapq = mem_approx_mapq_se(opt, ar);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
@ -839,13 +934,14 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
if (clip5) { if (clip5) {
memmove(a.cigar+1, a.cigar, a.n_cigar * 4); memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
a.cigar[0] = clip5<<4|3; a.cigar[0] = clip5<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3);
++a.n_cigar; ++a.n_cigar;
} }
if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3; if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3);
} }
a.rid = bns_pos2rid(bns, pos); a.rid = bns_pos2rid(bns, pos);
a.pos = pos - bns->anns[a.rid].offset; a.pos = pos - bns->anns[a.rid].offset;
a.r5 = rb; a.score = ar->score; a.sub = ar->sub;
free(query); free(query);
return a; return a;
} }

View File

@ -67,11 +67,15 @@ typedef struct { // TODO: This is an intermediate struct only. Better get rid of
} bwahit_t; } bwahit_t;
typedef struct { // This struct is only used for the convenience of API. typedef struct { // This struct is only used for the convenience of API.
int rid; // reference sequence index in bntseq_t int64_t pos; // forward strand 5'-end mapping position
int pos; // forward strand 5'-end mapping position int rid; // reference sequence index in bntseq_t; <0 for unmapped
int flag; // extra flag
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
int n_cigar; // number of CIGAR operations int n_cigar; // number of CIGAR operations
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
int64_t r5; // position of the 5'-end of read (for computing TLEN)
int score, sub;
} mem_aln_t; } mem_aln_t;
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -89,6 +89,23 @@ static inline int kputuw(unsigned c, kstring_t *s)
return 0; return 0;
} }
static inline int kputl(long c, kstring_t *s)
{
char buf[32];
long l, x;
if (c == 0) return kputc('0', s);
for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
if (c < 0) buf[l++] = '-';
if (s->l + l + 1 >= s->m) {
s->m = s->l + l + 2;
kroundup32(s->m);
s->s = (char*)realloc(s->s, s->m);
}
for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
s->s[s->l] = 0;
return 0;
}
int ksprintf(kstring_t *s, const char *fmt, ...); int ksprintf(kstring_t *s, const char *fmt, ...);
#endif #endif