Merge branch 'cs-tag' into dev
This commit is contained in:
commit
68d99f0edb
65
format.c
65
format.c
|
|
@ -1,7 +1,9 @@
|
|||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include "kalloc.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
static inline void str_enlarge(kstring_t *s, int l)
|
||||
|
|
@ -54,6 +56,63 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
|
|||
s->s[s->l] = 0;
|
||||
}
|
||||
|
||||
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
|
||||
{
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
int i, q_off, t_off;
|
||||
uint8_t *qseq, *tseq;
|
||||
char *tmp;
|
||||
if (r->p == 0) return;
|
||||
mm_sprintf_lite(s, "\tcs:Z:");
|
||||
qseq = (uint8_t*)kmalloc(km, r->qe - r->qs);
|
||||
tseq = (uint8_t*)kmalloc(km, r->re - r->rs);
|
||||
tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1);
|
||||
mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq);
|
||||
if (!r->rev) {
|
||||
for (i = r->qs; i < r->qe; ++i)
|
||||
qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]];
|
||||
} else {
|
||||
for (i = r->qs; i < r->qe; ++i) {
|
||||
uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]];
|
||||
qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c;
|
||||
}
|
||||
}
|
||||
for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) {
|
||||
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
||||
assert(op >= 0 && op <= 2);
|
||||
if (op == 0) {
|
||||
int l_tmp = 0;
|
||||
for (j = 0; j < len; ++j) {
|
||||
if (qseq[q_off + j] != tseq[t_off + j]) {
|
||||
if (l_tmp > 0) {
|
||||
tmp[l_tmp] = 0;
|
||||
mm_sprintf_lite(s, "=%s", tmp);
|
||||
l_tmp = 0;
|
||||
}
|
||||
mm_sprintf_lite(s, "*%c%c", "acgtn"[tseq[t_off + j]], "acgtn"[qseq[q_off + j]]);
|
||||
} else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]];
|
||||
}
|
||||
if (l_tmp > 0) {
|
||||
tmp[l_tmp] = 0;
|
||||
mm_sprintf_lite(s, "=%s", tmp);
|
||||
}
|
||||
q_off += len, t_off += len;
|
||||
} else if (op == 1) {
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "acgtn"[qseq[q_off + j]];
|
||||
mm_sprintf_lite(s, "+%s", tmp);
|
||||
q_off += len;
|
||||
} else if (op == 2) {
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "acgtn"[tseq[t_off + j]];
|
||||
mm_sprintf_lite(s, "-%s", tmp);
|
||||
t_off += len;
|
||||
}
|
||||
}
|
||||
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
||||
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
||||
}
|
||||
|
||||
static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
||||
{
|
||||
int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S';
|
||||
|
|
@ -63,7 +122,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
|||
if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
|
||||
}
|
||||
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag)
|
||||
{
|
||||
s->l = 0;
|
||||
mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
|
||||
|
|
@ -74,12 +133,14 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
else mm_sprintf_lite(s, "\t%d\t%d", r->fuzzy_mlen, r->fuzzy_blen);
|
||||
mm_sprintf_lite(s, "\t%d", r->mapq);
|
||||
write_tags(s, r);
|
||||
if (r->p) {
|
||||
if (r->p && (opt_flag & MM_F_OUT_CG)) {
|
||||
uint32_t k;
|
||||
mm_sprintf_lite(s, "\tcg:Z:");
|
||||
for (k = 0; k < r->p->n_cigar; ++k)
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]);
|
||||
}
|
||||
if (r->p && (opt_flag & MM_F_OUT_CS))
|
||||
write_cs(km, s, mi, t, r);
|
||||
}
|
||||
|
||||
static char comp_tab[] = {
|
||||
|
|
|
|||
8
main.c
8
main.c
|
|
@ -8,7 +8,7 @@
|
|||
#include "minimap.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
#define MM_VERSION "2.0rc1-r232"
|
||||
#define MM_VERSION "2.0rc1-r238-dirty"
|
||||
|
||||
void liftrlimit()
|
||||
{
|
||||
|
|
@ -54,7 +54,7 @@ int main(int argc, char *argv[])
|
|||
mm_realtime0 = realtime();
|
||||
mm_mapopt_init(&opt);
|
||||
|
||||
while ((c = getopt_long(argc, argv, "aw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
|
||||
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
|
||||
if (c == 'w') w = atoi(optarg), idx_par_set = 1;
|
||||
else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
|
||||
else if (c == 'H') is_hpc = 1, idx_par_set = 1;
|
||||
|
|
@ -67,7 +67,8 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'N') opt.best_n = atoi(optarg);
|
||||
else if (c == 'p') opt.pri_ratio = atof(optarg);
|
||||
else if (c == 'M') opt.mask_level = atof(optarg);
|
||||
else if (c == 'c') opt.flag |= MM_F_CIGAR;
|
||||
else if (c == 'c') opt.flag |= MM_F_OUT_CG | MM_F_CIGAR;
|
||||
else if (c == 'S') opt.flag |= MM_F_OUT_CS | MM_F_CIGAR;
|
||||
else if (c == 'X') opt.flag |= MM_F_AVA | MM_F_NO_SELF;
|
||||
else if (c == 'a') opt.flag |= MM_F_OUT_SAM | MM_F_CIGAR;
|
||||
else if (c == 'Q') opt.flag |= MM_F_NO_QUAL;
|
||||
|
|
@ -166,6 +167,7 @@ int main(int argc, char *argv[])
|
|||
fprintf(stderr, " -Q ignore base quality in the input\n");
|
||||
fprintf(stderr, " -a output in the SAM format (PAF by default)\n");
|
||||
fprintf(stderr, " -c output CIGAR in PAF\n");
|
||||
fprintf(stderr, " -S output the cs tag in PAF\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
|
||||
fprintf(stderr, " -K NUM minibatch size [200M]\n");
|
||||
// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);
|
||||
|
|
|
|||
5
map.c
5
map.c
|
|
@ -338,16 +338,18 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq);
|
||||
return in;
|
||||
} else if (step == 2) { // step 2: output
|
||||
void *km = 0;
|
||||
step_t *s = (step_t*)in;
|
||||
const mm_idx_t *mi = p->mi;
|
||||
for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]);
|
||||
free(s->buf);
|
||||
if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init();
|
||||
for (i = 0; i < s->n_seq; ++i) {
|
||||
mm_bseq1_t *t = &s->seq[i];
|
||||
for (j = 0; j < s->n_reg[i]; ++j) {
|
||||
mm_reg1_t *r = &s->reg[i][j];
|
||||
if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
|
||||
else mm_write_paf(&p->str, mi, t, r);
|
||||
else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
|
||||
puts(p->str.s);
|
||||
}
|
||||
if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) {
|
||||
|
|
@ -360,6 +362,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
if (s->seq[i].qual) free(s->seq[i].qual);
|
||||
}
|
||||
free(s->reg); free(s->n_reg); free(s->seq);
|
||||
km_destroy(km);
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq);
|
||||
free(s);
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@
|
|||
#define MM_F_CIGAR 0x04
|
||||
#define MM_F_OUT_SAM 0x08
|
||||
#define MM_F_NO_QUAL 0x10
|
||||
#define MM_F_OUT_CG 0x20
|
||||
#define MM_F_OUT_CS 0x40
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,171 @@
|
|||
var getopt = function(args, ostr) {
|
||||
var oli; // option letter list index
|
||||
if (typeof(getopt.place) == 'undefined')
|
||||
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
|
||||
if (getopt.place == -1) { // update scanning pointer
|
||||
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
|
||||
++getopt.ind;
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
|
||||
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
|
||||
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
|
||||
if (getopt.place < 0) ++getopt.ind;
|
||||
return '?';
|
||||
}
|
||||
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
|
||||
getopt.arg = null;
|
||||
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
|
||||
} else { // need an argument
|
||||
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
|
||||
getopt.arg = args[getopt.ind].substr(getopt.place);
|
||||
else if (args.length <= ++getopt.ind) { // no arg
|
||||
getopt.place = -1;
|
||||
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
|
||||
return '?';
|
||||
} else getopt.arg = args[getopt.ind]; // white space
|
||||
getopt.place = -1;
|
||||
++getopt.ind;
|
||||
}
|
||||
return optopt;
|
||||
}
|
||||
|
||||
var c, maf_out = false, line_len = 80;
|
||||
while ((c = getopt(arguments, "ml:")) != null) {
|
||||
if (c == 'm') maf_out = true;
|
||||
else if (c == 'l') line_len = parseInt(getopt.arg); // TODO: not implemented yet
|
||||
}
|
||||
if (line_len == 0) line_len = 0x7fffffff;
|
||||
|
||||
if (getopt.ind == arguments.length) {
|
||||
print("Usage: k8 paf2aln.js [options] <with-cs.paf>");
|
||||
print("Options:");
|
||||
print(" -m MAF output (BLAST-like output by default)");
|
||||
print(" -l INT line length in BLAST-like output [80]");
|
||||
print("");
|
||||
print("Note: this script only works when minimap2 is run with option '-S'");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
function padding_str(x, len, right)
|
||||
{
|
||||
var s = x.toString();
|
||||
if (s.length < len) {
|
||||
if (right) s += Array(len - s.length + 1).join(" ");
|
||||
else s = Array(len - s.length + 1).join(" ") + s;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
function update_aln(s_ref, s_qry, s_mid, type, seq, slen)
|
||||
{
|
||||
var l = type == '*'? 1 : seq.length;
|
||||
if (type == '=') {
|
||||
s_ref.set(seq);
|
||||
s_qry.set(seq);
|
||||
s_mid.set(Array(l+1).join("|"));
|
||||
slen[0] += l, slen[1] += l;
|
||||
} else if (type == '*') {
|
||||
s_ref.set(seq.charAt(0));
|
||||
s_qry.set(seq.charAt(1));
|
||||
s_mid.set(' ');
|
||||
slen[0] += 1, slen[1] += 1;
|
||||
} else if (type == '+') {
|
||||
s_ref.set(Array(l+1).join("-"));
|
||||
s_qry.set(seq);
|
||||
s_mid.set(Array(l+1).join(" "));
|
||||
slen[1] += l;
|
||||
} else if (type == '-') {
|
||||
s_ref.set(seq);
|
||||
s_qry.set(Array(l+1).join("-"));
|
||||
s_mid.set(Array(l+1).join(" "));
|
||||
slen[0] += l;
|
||||
}
|
||||
}
|
||||
|
||||
function print_aln(rs, qs, strand, slen, elen, s_ref, s_qry, s_mid)
|
||||
{
|
||||
print(["Ref+:", padding_str(rs + slen[0] + 1, 10, false), s_ref.toString(), padding_str(rs + elen[0], 10, true)].join(" "));
|
||||
print(" " + s_mid.toString());
|
||||
var st, en;
|
||||
if (strand == '+') st = qs + slen[1] + 1, en = qs + elen[1];
|
||||
else st = qs - slen[1], en = qs - elen[1] + 1;
|
||||
print(["Qry" + strand + ":", padding_str(st, 10, false), s_qry.toString(), padding_str(en , 10, true)].join(" "));
|
||||
}
|
||||
|
||||
var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes();
|
||||
var re = /([=\-\+\*])([A-Za-z]+)/g;
|
||||
|
||||
var buf = new Bytes();
|
||||
var file = new File(arguments[getopt.ind]);
|
||||
if (maf_out) print("##maf version=1\n");
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, line = buf.toString();
|
||||
var t = line.split("\t", 12);
|
||||
if ((m = /\tcs:Z:(\S+)/.exec(line)) == null) continue;
|
||||
var cs = m[1];
|
||||
s_ref.length = s_qry.length = s_mid.length = 0;
|
||||
var slen = [0, 0], elen = [0, 0];
|
||||
if (maf_out) {
|
||||
while ((m = re.exec(cs)) != null)
|
||||
update_aln(s_ref, s_qry, s_mid, m[1], m[2], elen);
|
||||
if (maf_out) {
|
||||
var score = (m = /\tAS:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
|
||||
var len = t[0].length > t[5].length? t[0].length : t[5].length;
|
||||
print("a " + score);
|
||||
print(["s", padding_str(t[5], len, true), padding_str(t[7], 10, false), padding_str(parseInt(t[8]) - parseInt(t[7]), 10, false),
|
||||
"+", padding_str(t[6], 10, false), s_ref.toString()].join(" "));
|
||||
var qs, qe, ql = parseInt(t[1]);
|
||||
if (t[4] == '+') {
|
||||
qs = parseInt(t[2]);
|
||||
qe = parseInt(t[3]);
|
||||
} else {
|
||||
qs = ql - parseInt(t[3]);
|
||||
qe = ql - parseInt(t[2]);
|
||||
}
|
||||
print(["s", padding_str(t[0], len, true), padding_str(qs, 10, false), padding_str(qe - qs, 10, false),
|
||||
t[4], padding_str(ql, 10, false), s_qry.toString()].join(" "));
|
||||
print("");
|
||||
}
|
||||
} else {
|
||||
line = line.replace(/\tc[sg]:Z:\S+/g, "");
|
||||
print('>' + line);
|
||||
var rs = parseInt(t[7]), qs = t[4] == '+'? parseInt(t[2]) : parseInt(t[3]);
|
||||
var n_blocks = 0;
|
||||
while ((m = re.exec(cs)) != null) {
|
||||
var start = 0, rest = m[1] == '*'? 1 : m[2].length;
|
||||
while (rest > 0) {
|
||||
var l_proc;
|
||||
if (s_ref.length + rest >= line_len) {
|
||||
l_proc = line_len - s_ref.length;
|
||||
update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen);
|
||||
if (n_blocks > 0) print("");
|
||||
print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid);
|
||||
++n_blocks;
|
||||
s_ref.length = s_qry.length = s_mid.length = 0;
|
||||
slen[0] = elen[0], slen[1] = elen[1];
|
||||
} else {
|
||||
l_proc = rest;
|
||||
update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen);
|
||||
}
|
||||
rest -= l_proc, start += l_proc;
|
||||
}
|
||||
}
|
||||
if (s_ref.length > 0) {
|
||||
if (n_blocks > 0) print("");
|
||||
print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid);
|
||||
++n_blocks;
|
||||
}
|
||||
print("//");
|
||||
}
|
||||
}
|
||||
file.close();
|
||||
buf.destroy();
|
||||
|
||||
s_ref.destroy(); s_qry.destroy(); s_mid.destroy();
|
||||
2
mmpriv.h
2
mmpriv.h
|
|
@ -40,7 +40,7 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
|
|||
void radix_sort_64(uint64_t *beg, uint64_t *end);
|
||||
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
||||
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
|
||||
int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km);
|
||||
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
||||
|
|
|
|||
Loading…
Reference in New Issue