From 8766d286df31aace6d12d965d1c9d052b2b4f5aa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 22 Mar 2018 14:15:33 -0400 Subject: [PATCH] r751: optionally output MD (#118) --- format.c | 88 ++++++++++++++++++++++++++++++++++++++---------------- main.c | 4 ++- minimap.h | 1 + minimap2.1 | 4 +++ 4 files changed, 70 insertions(+), 27 deletions(-) diff --git a/format.c b/format.c index 712e560..382dfd8 100644 --- a/format.c +++ b/format.c @@ -133,31 +133,14 @@ void mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int free(str.s); } -static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden) +static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden) { - extern unsigned char seq_nt4_table[256]; int i, q_off, t_off; - uint8_t *qseq, *tseq; - char *tmp; - if (r->p == 0) return; mm_sprintf_lite(s, "\tcs:Z:"); - qseq = (uint8_t*)kmalloc(km, r->qe - r->qs); - tseq = (uint8_t*)kmalloc(km, r->re - r->rs); - tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1); - mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq); - if (!r->rev) { - for (i = r->qs; i < r->qe; ++i) - qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]]; - } else { - for (i = r->qs; i < r->qe; ++i) { - uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]]; - qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c; - } - } for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; assert(op >= 0 && op <= 3); - if (op == 0) { + if (op == 0) { // match int l_tmp = 0; for (j = 0; j < len; ++j) { if (qseq[q_off + j] != tseq[t_off + j]) { @@ -178,17 +161,17 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_ } else mm_sprintf_lite(s, ":%d", l_tmp); } q_off += len, t_off += len; - } else if (op == 1) { + } else if (op == 1) { // insertion to ref for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "acgtn"[qseq[q_off + j]]; mm_sprintf_lite(s, "+%s", tmp); q_off += len; - } else if (op == 2) { + } else if (op == 2) { // deletion from ref for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "acgtn"[tseq[t_off + j]]; mm_sprintf_lite(s, "-%s", tmp); t_off += len; - } else { + } else { // intron assert(len >= 2); mm_sprintf_lite(s, "~%c%c%d%c%c", "acgtn"[tseq[t_off]], "acgtn"[tseq[t_off+1]], len, "acgtn"[tseq[t_off+len-2]], "acgtn"[tseq[t_off+len-1]]); @@ -196,6 +179,59 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_ } } assert(t_off == r->re - r->rs && q_off == r->qe - r->qs); +} + +static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp) +{ + int i, q_off, t_off, l_MD = 0; + mm_sprintf_lite(s, "\tMD:Z:"); + for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) { + int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; + assert(op >= 0 && op <= 2); // introns (aka reference skips) are not supported + if (op == 0) { // match + for (j = 0; j < len; ++j) { + if (qseq[q_off + j] != tseq[t_off + j]) { + mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]); + l_MD = 0; + } else ++l_MD; + } + q_off += len, t_off += len; + } else if (op == 1) { // insertion to ref + q_off += len; + } else if (op == 2) { // deletion from ref + for (j = 0, tmp[len] = 0; j < len; ++j) + tmp[j] = "ACGTN"[tseq[t_off + j]]; + mm_sprintf_lite(s, "%d^%s", l_MD, tmp); + l_MD = 0; + t_off += len; + } + } + if (l_MD > 0) mm_sprintf_lite(s, "%d", l_MD); + assert(t_off == r->re - r->rs && q_off == r->qe - r->qs); +} + +static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD) +{ + extern unsigned char seq_nt4_table[256]; + int i; + uint8_t *qseq, *tseq; + char *tmp; + if (r->p == 0) return; + qseq = (uint8_t*)kmalloc(km, r->qe - r->qs); + tseq = (uint8_t*)kmalloc(km, r->re - r->rs); + tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1); + mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq); + if (!r->rev) { + for (i = r->qs; i < r->qe; ++i) + qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]]; + } else { + for (i = r->qs; i < r->qe; ++i) { + uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]]; + qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c; + } + } + if (is_MD) write_MD_core(s, tseq, qseq, r, tmp); + else write_cs_core(s, tseq, qseq, r, tmp, no_iden); kfree(km, qseq); kfree(km, tseq); kfree(km, tmp); } @@ -236,8 +272,8 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m for (k = 0; k < r->p->n_cigar; ++k) mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]); } - if (r->p && (opt_flag & MM_F_OUT_CS)) - write_cs(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG)); + if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) + write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD); } static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) @@ -433,8 +469,8 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se } } } - if (r->p && (opt_flag & MM_F_OUT_CS)) - write_cs(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG)); + if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) + write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD); if (cigar_in_tag) write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag); } diff --git a/main.c b/main.c index b6f75e8..44dae53 100644 --- a/main.c +++ b/main.c @@ -10,7 +10,7 @@ #include "getopt.h" #endif -#define MM_VERSION "2.9-r750-dirty" +#define MM_VERSION "2.9-r751-dirty" #ifdef __linux__ #include @@ -56,6 +56,7 @@ static struct option long_options[] = { { "dual", required_argument, 0, 0 }, // 26 { "max-clip-ratio", required_argument, 0, 0 }, // 27 { "min-occ-floor", required_argument, 0, 0 }, // 28 + { "MD", no_argument, 0, 0 }, // 29 { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -170,6 +171,7 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx ==23) opt.flag |= MM_F_REV_ONLY; // --rev-only else if (c == 0 && long_idx ==27) opt.max_clip_ratio = atof(optarg); // --max-clip-ratio else if (c == 0 && long_idx ==28) opt.min_mid_occ = atoi(optarg); // --min-occ-floor + else if (c == 0 && long_idx ==29) opt.flag |= MM_F_OUT_MD; // --MD else if (c == 0 && long_idx == 14) { // --frag yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1); } else if (c == 0 && long_idx == 15) { // --secondary diff --git a/minimap.h b/minimap.h index ae68bfe..399c5c4 100644 --- a/minimap.h +++ b/minimap.h @@ -29,6 +29,7 @@ #define MM_F_REV_ONLY 0x200000 #define MM_F_HEAP_SORT 0x400000 #define MM_F_ALL_CHAINS 0x800000 +#define MM_F_OUT_MD 0x1000000 #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 diff --git a/minimap2.1 b/minimap2.1 index 1a64410..c37301e 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -388,6 +388,9 @@ is given, .I short is assumed. [none] .TP +.B --MD +Output the MD tag (see the SAM spec). +.TP .B -Y In SAM output, use soft clipping for supplementary alignments. .TP @@ -559,6 +562,7 @@ cm i Number of minimizers on the chain s1 i Chaining score s2 i Chaining score of the best secondary chain NM i Total number of mismatches and gaps in the alignment +MD Z To generate the ref sequence in the alignment AS i DP alignment score ms i DP score of the max scoring segment in the alignment nn i Number of ambiguous bases in the alignment