r784: support the =/X CIGAR operators (#156)

This commit is contained in:
Heng Li 2018-05-30 16:11:22 -04:00
parent a3afeec0b2
commit 154d2caf5b
5 changed files with 81 additions and 4 deletions

72
align.c
View File

@ -198,7 +198,7 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) //
mm_extra_t *p;
if (n_cigar == 0) return;
if (r->p == 0) {
uint32_t capacity = n_cigar + sizeof(mm_extra_t);
uint32_t capacity = n_cigar + sizeof(mm_extra_t); // TODO: should this be "n_cigar + sizeof(mm_extra_t)/4" instead?
kroundup32(capacity);
r->p = (mm_extra_t*)calloc(capacity, 4);
r->p->capacity = capacity;
@ -218,6 +218,74 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) //
}
}
static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq) // written by @armintoepfer
{
int n_diff = 0;
uint32_t k, l, m, cap, toff = 0, qoff = 0, n_M = 0;
mm_extra_t *p;
if (r->p == 0) return;
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (op == 0) {
for (l = 0; l < len; ++l)
if (qseq[qoff + l] != tseq[toff + l]) // TODO: N<=>N is converted to "="
++n_diff;
++n_M;
toff += len, qoff += len;
} else if (op == 1) { // insertion
qoff += len;
} else if (op == 2) { // deletion
toff += len;
} else if (op == 3) { // intron
toff += len;
}
}
// update in-place if we can
if (n_diff == 0) {
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (op == 0) r->p->cigar[k] = len << 4 | 7;
}
return;
}
// allocate new storage
cap = r->p->n_cigar + (2 * n_diff - n_M) + sizeof(mm_extra_t);
kroundup32(cap);
p = (mm_extra_t*)calloc(cap, 4);
memcpy(p, r->p, sizeof(mm_extra_t));
p->capacity = cap;
// update cigar while copying
toff = qoff = m = 0;
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (op == 0) { // match/mismatch
while (len > 0) {
// match
for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {}
if (l > 0) p->cigar[m++] = l << 4 | 7;
len -= l;
toff += l, qoff += l;
// mismatch
for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {}
if (l > 0) p->cigar[m++] = l << 4 | 8;
len -= l;
toff += l, qoff += l;
}
continue;
} else if (op == 1) { // insertion
qoff += len;
} else if (op == 2) { // deletion
toff += len;
} else if (op == 3) { // intron
toff += len;
}
p->cigar[m++] = r->p->cigar[k];
}
p->n_cigar = m;
free(r->p);
r->p = p;
}
static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int end_bonus, int zdrop, int flag, ksw_extz_t *ez)
{
if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
@ -675,6 +743,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (r->p) {
mm_idx_getseq(mi, rid, rs1, re1, tseq);
mm_update_extra(r, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
if (opt->flag & MM_F_EQX) mm_update_cigar_eqx(r, &qseq0[r->rev][qs1], tseq);
if (rev && r->p->trans_strand)
r->p->trans_strand ^= 3; // flip to the read strand
}
@ -733,6 +802,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
r_inv->rs = r1->re + t_off;
r_inv->re = r_inv->rs + ez->max_t + 1;
mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e);
if (opt->flag & MM_F_EQX) mm_update_cigar_eqx(r_inv, &qseq[q_off], &tseq[t_off]);
ret = 1;
end_align1_inv:
kfree(km, tseq);

View File

@ -270,7 +270,7 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
uint32_t k;
mm_sprintf_lite(s, "\tcg:Z:");
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
}
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD);
@ -321,7 +321,7 @@ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, co
int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 'H' : 'S';
if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char);
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char);
}
}

5
main.c
View File

@ -10,7 +10,7 @@
#include "getopt.h"
#endif
#define MM_VERSION "2.10-r783-dirty"
#define MM_VERSION "2.10-r784-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -59,6 +59,7 @@ static struct option long_options[] = {
{ "MD", no_argument, 0, 0 }, // 29
{ "lj-min-ratio", required_argument, 0, 0 }, // 30
{ "score-N", required_argument, 0, 0 }, // 31
{ "eqx", no_argument, 0, 0 }, // 32
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
@ -177,6 +178,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx ==29) opt.flag |= MM_F_OUT_MD; // --MD
else if (c == 0 && long_idx ==30) opt.min_join_flank_ratio = atof(optarg); // --lj-min-ratio
else if (c == 0 && long_idx ==31) opt.sc_ambi = atoi(optarg); // --score-N
else if (c == 0 && long_idx ==32) opt.flag |= MM_F_EQX; // --eqx
else if (c == 0 && long_idx == 14) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1);
} else if (c == 0 && long_idx == 15) { // --secondary
@ -278,6 +280,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -c output CIGAR in PAF\n");
fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
fprintf(fp_help, " --MD output the MD tag\n");
fprintf(fp_help, " --eqx write =/X CIGAR operators\n");
fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads);
fprintf(fp_help, " -K NUM minibatch size for mapping [500M]\n");

View File

@ -31,6 +31,7 @@
#define MM_F_ALL_CHAINS 0x800000
#define MM_F_OUT_MD 0x1000000
#define MM_F_COPY_COMMENT 0x2000000
#define MM_F_EQX 0x4000000 // use =/X instead of M
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2

View File

@ -397,6 +397,9 @@ is assumed. [none]
.B --MD
Output the MD tag (see the SAM spec).
.TP
.B --eqx
Output =/X CIGAR operators for sequence match/mismatch.
.TP
.B -Y
In SAM output, use soft clipping for supplementary alignments.
.TP