prepare to replace the SAM printing code
This move is dangerous as SAM printing is very complex, but it will benefit in the long run. The planned change will reduce the redundancy, improves clarity and most importantly makes it much easier to output multiple primary hits in an optional tag.
This commit is contained in:
parent
5581cb9152
commit
8f0d439913
106
bwamem.c
106
bwamem.c
|
|
@ -714,6 +714,95 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
|
||||||
#undef is_mapped
|
#undef is_mapped
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m)
|
||||||
|
{
|
||||||
|
int i, copy_mate = 0;
|
||||||
|
mem_aln_t ptmp = list[which], *p = &ptmp; // make a copy of the alignment to convert
|
||||||
|
|
||||||
|
// set flag
|
||||||
|
p->flag |= m? 0x1 : 0; // is paired in sequencing
|
||||||
|
p->flag |= p->rid < 0? 4 : 0; // is mapped
|
||||||
|
p->flag |= m && m->rid < 0? 8 : 0; // is mate mapped
|
||||||
|
if (p->rid < 0 && m && m->rid >= 0)
|
||||||
|
p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0, copy_mate = 1;
|
||||||
|
p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand
|
||||||
|
p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand
|
||||||
|
if (p->rid >= 0 && m && m->rid <= 0 && p->is_rev) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read
|
||||||
|
|
||||||
|
// print up to CIGAR
|
||||||
|
kputs(s->name, str); kputc('\t', str); // QNAME
|
||||||
|
kputw(p->flag, str); kputc('\t', str); // FLAG
|
||||||
|
if (p->rid >= 0) { // with coordinate
|
||||||
|
kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
|
||||||
|
kputl(p->pos + 1, str); kputc('\t', str); // POS
|
||||||
|
kputw(p->mapq, str); kputc('\t', str); // MAPQ
|
||||||
|
if (p->n_cigar) { // aligned
|
||||||
|
for (i = 0; i < p->n_cigar; ++i) {
|
||||||
|
kputw(p->cigar[i]>>4, str); kputc("MIDSH"[p->cigar[i]&0xf], str);
|
||||||
|
}
|
||||||
|
} else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true)
|
||||||
|
} else kputsn("*\t0\t0\t*", 7, str); // without coordinte
|
||||||
|
kputc('\t', str);
|
||||||
|
|
||||||
|
// print the mate position if applicable
|
||||||
|
if (m && m->rid) {
|
||||||
|
if (p->rid == m->rid) kputc('=', str);
|
||||||
|
else kputs(bns->anns[m->rid].name, str);
|
||||||
|
kputc('\t', str);
|
||||||
|
kputl(m->pos + 1, str);
|
||||||
|
if (p->rid == m->rid) {
|
||||||
|
int64_t p0 = p->r5 < bns->l_pac? p->r5 : (bns->l_pac<<1) - 1 - p->r5;
|
||||||
|
int64_t p1 = m->r5 < bns->l_pac? m->r5 : (bns->l_pac<<1) - 1 - m->r5;
|
||||||
|
kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str);
|
||||||
|
} else kputc('0', str);
|
||||||
|
} else if (m && p->rid) {
|
||||||
|
kputsn("\t=\t", 3, str);
|
||||||
|
kputl(p->pos + 1, str);
|
||||||
|
kputsn("\t0\t", 3, str);
|
||||||
|
} else kputsn("*\t0\t0\t", 6, str);
|
||||||
|
|
||||||
|
// print SEQ and QUAL
|
||||||
|
if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL
|
||||||
|
kputsn("*\t*", 3, str);
|
||||||
|
} else if (!p->is_rev) { // the forward strand
|
||||||
|
int i, qb = 0, qe = s->l_seq;
|
||||||
|
if (p->n_cigar) {
|
||||||
|
if ((p->cigar[0]&0xf) == 4) qb += p->cigar[0]>>4;
|
||||||
|
if ((p->cigar[p->n_cigar-1]&0xf) == 4) qe -= p->cigar[p->n_cigar-1]>>4;
|
||||||
|
}
|
||||||
|
ks_resize(str, str->l + (qe - qb) + 1);
|
||||||
|
for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
|
||||||
|
kputc('\t', str);
|
||||||
|
if (s->qual) { // printf qual
|
||||||
|
ks_resize(str, str->l + (qe - qb) + 1);
|
||||||
|
for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
|
||||||
|
str->s[str->l] = 0;
|
||||||
|
} else kputc('*', str);
|
||||||
|
} else { // the reverse strand
|
||||||
|
int i, qb = 0, qe = s->l_seq;
|
||||||
|
if (p->n_cigar) {
|
||||||
|
if ((p->cigar[0]&0xf) == 4) qe -= p->cigar[0]>>4;
|
||||||
|
if ((p->cigar[p->n_cigar-1]&0xf) == 4) qb += p->cigar[p->n_cigar-1]>>4;
|
||||||
|
}
|
||||||
|
ks_resize(str, str->l + (qe - qb) + 1);
|
||||||
|
for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]];
|
||||||
|
kputc('\t', str);
|
||||||
|
if (s->qual) { // printf qual
|
||||||
|
ks_resize(str, str->l + (qe - qb) + 1);
|
||||||
|
for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i];
|
||||||
|
str->s[str->l] = 0;
|
||||||
|
} else kputc('*', str);
|
||||||
|
}
|
||||||
|
|
||||||
|
// print optional tags
|
||||||
|
if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); }
|
||||||
|
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
|
||||||
|
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
|
||||||
|
if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
|
||||||
|
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
|
||||||
|
kputc('\n', str);
|
||||||
|
}
|
||||||
|
|
||||||
/************************
|
/************************
|
||||||
* Integrated interface *
|
* Integrated interface *
|
||||||
************************/
|
************************/
|
||||||
|
|
@ -816,14 +905,20 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
|
||||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
|
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
|
||||||
{
|
{
|
||||||
mem_aln_t a;
|
mem_aln_t a;
|
||||||
int i, w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev;
|
int i, w2, qb, qe, NM, score, is_rev;
|
||||||
int64_t pos, rb = ar->rb, re = ar->re;
|
int64_t pos, rb, re;
|
||||||
uint8_t *query;
|
uint8_t *query;
|
||||||
|
|
||||||
|
memset(&a, 0, sizeof(mem_aln_t));
|
||||||
|
if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record
|
||||||
|
a.rid = -1; a.pos = -1; a.flag |= 0x4;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
qb = ar->qb, qe = ar->qe;
|
||||||
|
rb = ar->rb, re = ar->re;
|
||||||
query = malloc(l_query);
|
query = malloc(l_query);
|
||||||
for (i = 0; i < l_query; ++i) // convert to the nt4 encoding
|
for (i = 0; i < l_query; ++i) // convert to the nt4 encoding
|
||||||
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
|
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
|
||||||
memset(&a, 0, sizeof(mem_aln_t));
|
|
||||||
a.mapq = mem_approx_mapq_se(opt, ar);
|
a.mapq = mem_approx_mapq_se(opt, ar);
|
||||||
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
|
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
|
||||||
w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
|
w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
|
||||||
|
|
@ -839,13 +934,14 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
||||||
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
|
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
|
||||||
if (clip5) {
|
if (clip5) {
|
||||||
memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
|
memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
|
||||||
a.cigar[0] = clip5<<4|3;
|
a.cigar[0] = clip5<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3);
|
||||||
++a.n_cigar;
|
++a.n_cigar;
|
||||||
}
|
}
|
||||||
if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3;
|
if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3);
|
||||||
}
|
}
|
||||||
a.rid = bns_pos2rid(bns, pos);
|
a.rid = bns_pos2rid(bns, pos);
|
||||||
a.pos = pos - bns->anns[a.rid].offset;
|
a.pos = pos - bns->anns[a.rid].offset;
|
||||||
|
a.r5 = rb; a.score = ar->score; a.sub = ar->sub;
|
||||||
free(query);
|
free(query);
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
8
bwamem.h
8
bwamem.h
|
|
@ -67,11 +67,15 @@ typedef struct { // TODO: This is an intermediate struct only. Better get rid of
|
||||||
} bwahit_t;
|
} bwahit_t;
|
||||||
|
|
||||||
typedef struct { // This struct is only used for the convenience of API.
|
typedef struct { // This struct is only used for the convenience of API.
|
||||||
int rid; // reference sequence index in bntseq_t
|
int64_t pos; // forward strand 5'-end mapping position
|
||||||
int pos; // forward strand 5'-end mapping position
|
int rid; // reference sequence index in bntseq_t; <0 for unmapped
|
||||||
|
int flag; // extra flag
|
||||||
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
|
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
|
||||||
int n_cigar; // number of CIGAR operations
|
int n_cigar; // number of CIGAR operations
|
||||||
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
|
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
|
||||||
|
|
||||||
|
int64_t r5; // position of the 5'-end of read (for computing TLEN)
|
||||||
|
int score, sub;
|
||||||
} mem_aln_t;
|
} mem_aln_t;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
||||||
17
kstring.h
17
kstring.h
|
|
@ -89,6 +89,23 @@ static inline int kputuw(unsigned c, kstring_t *s)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int kputl(long c, kstring_t *s)
|
||||||
|
{
|
||||||
|
char buf[32];
|
||||||
|
long l, x;
|
||||||
|
if (c == 0) return kputc('0', s);
|
||||||
|
for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
|
||||||
|
if (c < 0) buf[l++] = '-';
|
||||||
|
if (s->l + l + 1 >= s->m) {
|
||||||
|
s->m = s->l + l + 2;
|
||||||
|
kroundup32(s->m);
|
||||||
|
s->s = (char*)realloc(s->s, s->m);
|
||||||
|
}
|
||||||
|
for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
|
||||||
|
s->s[s->l] = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
int ksprintf(kstring_t *s, const char *fmt, ...);
|
int ksprintf(kstring_t *s, const char *fmt, ...);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue