r235: optionally output tag cs in PAF
cs encodes the query, the reference sequence and CIGAR.
This commit is contained in:
parent
46de0fbdad
commit
4c0713ee14
62
format.c
62
format.c
|
|
@ -1,7 +1,9 @@
|
|||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include "kalloc.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
static inline void str_enlarge(kstring_t *s, int l)
|
||||
|
|
@ -54,6 +56,60 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
|
|||
s->s[s->l] = 0;
|
||||
}
|
||||
|
||||
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
|
||||
{
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
int i, q_off, t_off;
|
||||
uint8_t *qseq, *tseq;
|
||||
char *tmp;
|
||||
if (r->p == 0) return;
|
||||
mm_sprintf_lite(s, "\tcs:Z:");
|
||||
qseq = (uint8_t*)kmalloc(km, r->qe - r->qs);
|
||||
tseq = (uint8_t*)kmalloc(km, r->re - r->rs);
|
||||
tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1);
|
||||
mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq);
|
||||
if (!r->rev) {
|
||||
for (i = r->qs; i < r->qe; ++i)
|
||||
qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]];
|
||||
} else {
|
||||
for (i = r->qs; i < r->qe; ++i)
|
||||
qseq[r->qe - i - 1] = seq_nt4_table[(uint8_t)t->seq[i]];
|
||||
}
|
||||
for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) {
|
||||
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
||||
assert(op >= 0 && op <= 2);
|
||||
if (op == 0) {
|
||||
int l_tmp = 0;
|
||||
for (j = 0; j < len; ++j) {
|
||||
if (qseq[q_off + j] != tseq[t_off + j]) {
|
||||
if (l_tmp > 0) {
|
||||
tmp[l_tmp] = 0;
|
||||
mm_sprintf_lite(s, "=%s", tmp);
|
||||
}
|
||||
mm_sprintf_lite(s, "*%c%c", "ACGTN"[tseq[t_off + j]], "ACGTN"[qseq[q_off + j]]);
|
||||
} else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]];
|
||||
}
|
||||
if (l_tmp > 0) {
|
||||
tmp[l_tmp] = 0;
|
||||
mm_sprintf_lite(s, "=%s", tmp);
|
||||
}
|
||||
q_off += len, t_off += len;
|
||||
} else if (op == 1) {
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "ACGTN"[qseq[q_off + j]];
|
||||
mm_sprintf_lite(s, "+%s", tmp);
|
||||
q_off += len;
|
||||
} else if (op == 2) {
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "ACGTN"[tseq[t_off + j]];
|
||||
mm_sprintf_lite(s, "-%s", tmp);
|
||||
t_off += len;
|
||||
}
|
||||
}
|
||||
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
||||
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
||||
}
|
||||
|
||||
static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
||||
{
|
||||
int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S';
|
||||
|
|
@ -63,7 +119,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
|||
if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
|
||||
}
|
||||
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag)
|
||||
{
|
||||
s->l = 0;
|
||||
mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
|
||||
|
|
@ -74,12 +130,14 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
else mm_sprintf_lite(s, "\t%d\t%d", r->fuzzy_mlen, r->fuzzy_blen);
|
||||
mm_sprintf_lite(s, "\t%d", r->mapq);
|
||||
write_tags(s, r);
|
||||
if (r->p) {
|
||||
if (r->p && (opt_flag & MM_F_OUT_CG)) {
|
||||
uint32_t k;
|
||||
mm_sprintf_lite(s, "\tcg:Z:");
|
||||
for (k = 0; k < r->p->n_cigar; ++k)
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]);
|
||||
}
|
||||
if (r->p && (opt_flag & MM_F_OUT_CS))
|
||||
write_cs(km, s, mi, t, r);
|
||||
}
|
||||
|
||||
static char comp_tab[] = {
|
||||
|
|
|
|||
8
main.c
8
main.c
|
|
@ -8,7 +8,7 @@
|
|||
#include "minimap.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
#define MM_VERSION "2.0rc1-r232"
|
||||
#define MM_VERSION "2.0rc1-r235-dirty"
|
||||
|
||||
void liftrlimit()
|
||||
{
|
||||
|
|
@ -54,7 +54,7 @@ int main(int argc, char *argv[])
|
|||
mm_realtime0 = realtime();
|
||||
mm_mapopt_init(&opt);
|
||||
|
||||
while ((c = getopt_long(argc, argv, "aw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
|
||||
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
|
||||
if (c == 'w') w = atoi(optarg), idx_par_set = 1;
|
||||
else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
|
||||
else if (c == 'H') is_hpc = 1, idx_par_set = 1;
|
||||
|
|
@ -67,7 +67,8 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'N') opt.best_n = atoi(optarg);
|
||||
else if (c == 'p') opt.pri_ratio = atof(optarg);
|
||||
else if (c == 'M') opt.mask_level = atof(optarg);
|
||||
else if (c == 'c') opt.flag |= MM_F_CIGAR;
|
||||
else if (c == 'c') opt.flag |= MM_F_OUT_CG | MM_F_CIGAR;
|
||||
else if (c == 'S') opt.flag |= MM_F_OUT_CS | MM_F_CIGAR;
|
||||
else if (c == 'X') opt.flag |= MM_F_AVA | MM_F_NO_SELF;
|
||||
else if (c == 'a') opt.flag |= MM_F_OUT_SAM | MM_F_CIGAR;
|
||||
else if (c == 'Q') opt.flag |= MM_F_NO_QUAL;
|
||||
|
|
@ -166,6 +167,7 @@ int main(int argc, char *argv[])
|
|||
fprintf(stderr, " -Q ignore base quality in the input\n");
|
||||
fprintf(stderr, " -a output in the SAM format (PAF by default)\n");
|
||||
fprintf(stderr, " -c output CIGAR in PAF\n");
|
||||
fprintf(stderr, " -S output the cs tag in PAF\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
|
||||
fprintf(stderr, " -K NUM minibatch size [200M]\n");
|
||||
// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);
|
||||
|
|
|
|||
5
map.c
5
map.c
|
|
@ -338,16 +338,18 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq);
|
||||
return in;
|
||||
} else if (step == 2) { // step 2: output
|
||||
void *km = 0;
|
||||
step_t *s = (step_t*)in;
|
||||
const mm_idx_t *mi = p->mi;
|
||||
for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]);
|
||||
free(s->buf);
|
||||
if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init();
|
||||
for (i = 0; i < s->n_seq; ++i) {
|
||||
mm_bseq1_t *t = &s->seq[i];
|
||||
for (j = 0; j < s->n_reg[i]; ++j) {
|
||||
mm_reg1_t *r = &s->reg[i][j];
|
||||
if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
|
||||
else mm_write_paf(&p->str, mi, t, r);
|
||||
else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
|
||||
puts(p->str.s);
|
||||
}
|
||||
if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) {
|
||||
|
|
@ -360,6 +362,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
if (s->seq[i].qual) free(s->seq[i].qual);
|
||||
}
|
||||
free(s->reg); free(s->n_reg); free(s->seq);
|
||||
km_destroy(km);
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq);
|
||||
free(s);
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@
|
|||
#define MM_F_CIGAR 0x04
|
||||
#define MM_F_OUT_SAM 0x08
|
||||
#define MM_F_NO_QUAL 0x10
|
||||
#define MM_F_OUT_CG 0x20
|
||||
#define MM_F_OUT_CS 0x40
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
|
|
|
|||
2
mmpriv.h
2
mmpriv.h
|
|
@ -40,7 +40,7 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
|
|||
void radix_sort_64(uint64_t *beg, uint64_t *end);
|
||||
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
||||
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
|
||||
int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km);
|
||||
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
||||
|
|
|
|||
Loading…
Reference in New Issue