r751: optionally output MD (#118)
This commit is contained in:
parent
623b5d9d48
commit
8766d286df
88
format.c
88
format.c
|
|
@ -133,31 +133,14 @@ void mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int
|
|||
free(str.s);
|
||||
}
|
||||
|
||||
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden)
|
||||
static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden)
|
||||
{
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
int i, q_off, t_off;
|
||||
uint8_t *qseq, *tseq;
|
||||
char *tmp;
|
||||
if (r->p == 0) return;
|
||||
mm_sprintf_lite(s, "\tcs:Z:");
|
||||
qseq = (uint8_t*)kmalloc(km, r->qe - r->qs);
|
||||
tseq = (uint8_t*)kmalloc(km, r->re - r->rs);
|
||||
tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1);
|
||||
mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq);
|
||||
if (!r->rev) {
|
||||
for (i = r->qs; i < r->qe; ++i)
|
||||
qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]];
|
||||
} else {
|
||||
for (i = r->qs; i < r->qe; ++i) {
|
||||
uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]];
|
||||
qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c;
|
||||
}
|
||||
}
|
||||
for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) {
|
||||
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
||||
assert(op >= 0 && op <= 3);
|
||||
if (op == 0) {
|
||||
if (op == 0) { // match
|
||||
int l_tmp = 0;
|
||||
for (j = 0; j < len; ++j) {
|
||||
if (qseq[q_off + j] != tseq[t_off + j]) {
|
||||
|
|
@ -178,17 +161,17 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_
|
|||
} else mm_sprintf_lite(s, ":%d", l_tmp);
|
||||
}
|
||||
q_off += len, t_off += len;
|
||||
} else if (op == 1) {
|
||||
} else if (op == 1) { // insertion to ref
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "acgtn"[qseq[q_off + j]];
|
||||
mm_sprintf_lite(s, "+%s", tmp);
|
||||
q_off += len;
|
||||
} else if (op == 2) {
|
||||
} else if (op == 2) { // deletion from ref
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "acgtn"[tseq[t_off + j]];
|
||||
mm_sprintf_lite(s, "-%s", tmp);
|
||||
t_off += len;
|
||||
} else {
|
||||
} else { // intron
|
||||
assert(len >= 2);
|
||||
mm_sprintf_lite(s, "~%c%c%d%c%c", "acgtn"[tseq[t_off]], "acgtn"[tseq[t_off+1]],
|
||||
len, "acgtn"[tseq[t_off+len-2]], "acgtn"[tseq[t_off+len-1]]);
|
||||
|
|
@ -196,6 +179,59 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_
|
|||
}
|
||||
}
|
||||
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
||||
}
|
||||
|
||||
static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp)
|
||||
{
|
||||
int i, q_off, t_off, l_MD = 0;
|
||||
mm_sprintf_lite(s, "\tMD:Z:");
|
||||
for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) {
|
||||
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
||||
assert(op >= 0 && op <= 2); // introns (aka reference skips) are not supported
|
||||
if (op == 0) { // match
|
||||
for (j = 0; j < len; ++j) {
|
||||
if (qseq[q_off + j] != tseq[t_off + j]) {
|
||||
mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]);
|
||||
l_MD = 0;
|
||||
} else ++l_MD;
|
||||
}
|
||||
q_off += len, t_off += len;
|
||||
} else if (op == 1) { // insertion to ref
|
||||
q_off += len;
|
||||
} else if (op == 2) { // deletion from ref
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "ACGTN"[tseq[t_off + j]];
|
||||
mm_sprintf_lite(s, "%d^%s", l_MD, tmp);
|
||||
l_MD = 0;
|
||||
t_off += len;
|
||||
}
|
||||
}
|
||||
if (l_MD > 0) mm_sprintf_lite(s, "%d", l_MD);
|
||||
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
||||
}
|
||||
|
||||
static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD)
|
||||
{
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
int i;
|
||||
uint8_t *qseq, *tseq;
|
||||
char *tmp;
|
||||
if (r->p == 0) return;
|
||||
qseq = (uint8_t*)kmalloc(km, r->qe - r->qs);
|
||||
tseq = (uint8_t*)kmalloc(km, r->re - r->rs);
|
||||
tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1);
|
||||
mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq);
|
||||
if (!r->rev) {
|
||||
for (i = r->qs; i < r->qe; ++i)
|
||||
qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]];
|
||||
} else {
|
||||
for (i = r->qs; i < r->qe; ++i) {
|
||||
uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]];
|
||||
qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c;
|
||||
}
|
||||
}
|
||||
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp);
|
||||
else write_cs_core(s, tseq, qseq, r, tmp, no_iden);
|
||||
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
||||
}
|
||||
|
||||
|
|
@ -236,8 +272,8 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
for (k = 0; k < r->p->n_cigar; ++k)
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
|
||||
}
|
||||
if (r->p && (opt_flag & MM_F_OUT_CS))
|
||||
write_cs(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG));
|
||||
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
||||
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD);
|
||||
}
|
||||
|
||||
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
|
||||
|
|
@ -433,8 +469,8 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
|||
}
|
||||
}
|
||||
}
|
||||
if (r->p && (opt_flag & MM_F_OUT_CS))
|
||||
write_cs(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG));
|
||||
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
||||
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD);
|
||||
if (cigar_in_tag)
|
||||
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
|
||||
}
|
||||
|
|
|
|||
4
main.c
4
main.c
|
|
@ -10,7 +10,7 @@
|
|||
#include "getopt.h"
|
||||
#endif
|
||||
|
||||
#define MM_VERSION "2.9-r750-dirty"
|
||||
#define MM_VERSION "2.9-r751-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -56,6 +56,7 @@ static struct option long_options[] = {
|
|||
{ "dual", required_argument, 0, 0 }, // 26
|
||||
{ "max-clip-ratio", required_argument, 0, 0 }, // 27
|
||||
{ "min-occ-floor", required_argument, 0, 0 }, // 28
|
||||
{ "MD", no_argument, 0, 0 }, // 29
|
||||
{ "help", no_argument, 0, 'h' },
|
||||
{ "max-intron-len", required_argument, 0, 'G' },
|
||||
{ "version", no_argument, 0, 'V' },
|
||||
|
|
@ -170,6 +171,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 0 && long_idx ==23) opt.flag |= MM_F_REV_ONLY; // --rev-only
|
||||
else if (c == 0 && long_idx ==27) opt.max_clip_ratio = atof(optarg); // --max-clip-ratio
|
||||
else if (c == 0 && long_idx ==28) opt.min_mid_occ = atoi(optarg); // --min-occ-floor
|
||||
else if (c == 0 && long_idx ==29) opt.flag |= MM_F_OUT_MD; // --MD
|
||||
else if (c == 0 && long_idx == 14) { // --frag
|
||||
yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1);
|
||||
} else if (c == 0 && long_idx == 15) { // --secondary
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@
|
|||
#define MM_F_REV_ONLY 0x200000
|
||||
#define MM_F_HEAP_SORT 0x400000
|
||||
#define MM_F_ALL_CHAINS 0x800000
|
||||
#define MM_F_OUT_MD 0x1000000
|
||||
|
||||
#define MM_I_HPC 0x1
|
||||
#define MM_I_NO_SEQ 0x2
|
||||
|
|
|
|||
|
|
@ -388,6 +388,9 @@ is given,
|
|||
.I short
|
||||
is assumed. [none]
|
||||
.TP
|
||||
.B --MD
|
||||
Output the MD tag (see the SAM spec).
|
||||
.TP
|
||||
.B -Y
|
||||
In SAM output, use soft clipping for supplementary alignments.
|
||||
.TP
|
||||
|
|
@ -559,6 +562,7 @@ cm i Number of minimizers on the chain
|
|||
s1 i Chaining score
|
||||
s2 i Chaining score of the best secondary chain
|
||||
NM i Total number of mismatches and gaps in the alignment
|
||||
MD Z To generate the ref sequence in the alignment
|
||||
AS i DP alignment score
|
||||
ms i DP score of the max scoring segment in the alignment
|
||||
nn i Number of ambiguous bases in the alignment
|
||||
|
|
|
|||
Loading…
Reference in New Issue