diff --git a/align.c b/align.c index 8ecd4aa..fd54a5f 100644 --- a/align.c +++ b/align.c @@ -198,7 +198,7 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // mm_extra_t *p; if (n_cigar == 0) return; if (r->p == 0) { - uint32_t capacity = n_cigar + sizeof(mm_extra_t); + uint32_t capacity = n_cigar + sizeof(mm_extra_t); // TODO: should this be "n_cigar + sizeof(mm_extra_t)/4" instead? kroundup32(capacity); r->p = (mm_extra_t*)calloc(capacity, 4); r->p->capacity = capacity; @@ -218,6 +218,74 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // } } +static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq) // written by @armintoepfer +{ + int n_diff = 0; + uint32_t k, l, m, cap, toff = 0, qoff = 0, n_M = 0; + mm_extra_t *p; + if (r->p == 0) return; + for (k = 0; k < r->p->n_cigar; ++k) { + uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; + if (op == 0) { + for (l = 0; l < len; ++l) + if (qseq[qoff + l] != tseq[toff + l]) // TODO: N<=>N is converted to "=" + ++n_diff; + ++n_M; + toff += len, qoff += len; + } else if (op == 1) { // insertion + qoff += len; + } else if (op == 2) { // deletion + toff += len; + } else if (op == 3) { // intron + toff += len; + } + } + // update in-place if we can + if (n_diff == 0) { + for (k = 0; k < r->p->n_cigar; ++k) { + uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; + if (op == 0) r->p->cigar[k] = len << 4 | 7; + } + return; + } + // allocate new storage + cap = r->p->n_cigar + (2 * n_diff - n_M) + sizeof(mm_extra_t); + kroundup32(cap); + p = (mm_extra_t*)calloc(cap, 4); + memcpy(p, r->p, sizeof(mm_extra_t)); + p->capacity = cap; + // update cigar while copying + toff = qoff = m = 0; + for (k = 0; k < r->p->n_cigar; ++k) { + uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; + if (op == 0) { // match/mismatch + while (len > 0) { + // match + for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} + if (l > 0) p->cigar[m++] = l << 4 | 7; + len -= l; + toff += l, qoff += l; + // mismatch + for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {} + if (l > 0) p->cigar[m++] = l << 4 | 8; + len -= l; + toff += l, qoff += l; + } + continue; + } else if (op == 1) { // insertion + qoff += len; + } else if (op == 2) { // deletion + toff += len; + } else if (op == 3) { // intron + toff += len; + } + p->cigar[m++] = r->p->cigar[k]; + } + p->n_cigar = m; + free(r->p); + r->p = p; +} + static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int end_bonus, int zdrop, int flag, ksw_extz_t *ez) { if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) { @@ -675,6 +743,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (r->p) { mm_idx_getseq(mi, rid, rs1, re1, tseq); mm_update_extra(r, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e); + if (opt->flag & MM_F_EQX) mm_update_cigar_eqx(r, &qseq0[r->rev][qs1], tseq); if (rev && r->p->trans_strand) r->p->trans_strand ^= 3; // flip to the read strand } @@ -733,6 +802,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i r_inv->rs = r1->re + t_off; r_inv->re = r_inv->rs + ez->max_t + 1; mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e); + if (opt->flag & MM_F_EQX) mm_update_cigar_eqx(r_inv, &qseq[q_off], &tseq[t_off]); ret = 1; end_align1_inv: kfree(km, tseq); diff --git a/format.c b/format.c index 515a6b0..f5109ea 100644 --- a/format.c +++ b/format.c @@ -270,7 +270,7 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m uint32_t k; mm_sprintf_lite(s, "\tcg:Z:"); for (k = 0; k < r->p->n_cigar; ++k) - mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]); + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]); } if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD); @@ -321,7 +321,7 @@ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, co int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 'H' : 'S'; if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char); for (k = 0; k < r->p->n_cigar; ++k) - mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]); + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]); if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char); } } diff --git a/main.c b/main.c index e11eebd..0fe7389 100644 --- a/main.c +++ b/main.c @@ -10,7 +10,7 @@ #include "getopt.h" #endif -#define MM_VERSION "2.10-r783-dirty" +#define MM_VERSION "2.10-r784-dirty" #ifdef __linux__ #include @@ -59,6 +59,7 @@ static struct option long_options[] = { { "MD", no_argument, 0, 0 }, // 29 { "lj-min-ratio", required_argument, 0, 0 }, // 30 { "score-N", required_argument, 0, 0 }, // 31 + { "eqx", no_argument, 0, 0 }, // 32 { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -177,6 +178,7 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx ==29) opt.flag |= MM_F_OUT_MD; // --MD else if (c == 0 && long_idx ==30) opt.min_join_flank_ratio = atof(optarg); // --lj-min-ratio else if (c == 0 && long_idx ==31) opt.sc_ambi = atoi(optarg); // --score-N + else if (c == 0 && long_idx ==32) opt.flag |= MM_F_EQX; // --eqx else if (c == 0 && long_idx == 14) { // --frag yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1); } else if (c == 0 && long_idx == 15) { // --secondary @@ -278,6 +280,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -c output CIGAR in PAF\n"); fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n"); fprintf(fp_help, " --MD output the MD tag\n"); + fprintf(fp_help, " --eqx write =/X CIGAR operators\n"); fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n"); fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads); fprintf(fp_help, " -K NUM minibatch size for mapping [500M]\n"); diff --git a/minimap.h b/minimap.h index 65f49c3..6f1b701 100644 --- a/minimap.h +++ b/minimap.h @@ -31,6 +31,7 @@ #define MM_F_ALL_CHAINS 0x800000 #define MM_F_OUT_MD 0x1000000 #define MM_F_COPY_COMMENT 0x2000000 +#define MM_F_EQX 0x4000000 // use =/X instead of M #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 diff --git a/minimap2.1 b/minimap2.1 index 5856d25..29777d7 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -397,6 +397,9 @@ is assumed. [none] .B --MD Output the MD tag (see the SAM spec). .TP +.B --eqx +Output =/X CIGAR operators for sequence match/mismatch. +.TP .B -Y In SAM output, use soft clipping for supplementary alignments. .TP