Merge branch 'cs-tag' into dev

This commit is contained in:
Heng Li 2017-08-04 14:31:46 -04:00
commit 68d99f0edb
6 changed files with 246 additions and 7 deletions

View File

@ -1,7 +1,9 @@
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdio.h>
#include "kalloc.h"
#include "mmpriv.h"
static inline void str_enlarge(kstring_t *s, int l)
@ -54,6 +56,63 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
s->s[s->l] = 0;
}
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
{
extern unsigned char seq_nt4_table[256];
int i, q_off, t_off;
uint8_t *qseq, *tseq;
char *tmp;
if (r->p == 0) return;
mm_sprintf_lite(s, "\tcs:Z:");
qseq = (uint8_t*)kmalloc(km, r->qe - r->qs);
tseq = (uint8_t*)kmalloc(km, r->re - r->rs);
tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1);
mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq);
if (!r->rev) {
for (i = r->qs; i < r->qe; ++i)
qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]];
} else {
for (i = r->qs; i < r->qe; ++i) {
uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]];
qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c;
}
}
for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) {
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
assert(op >= 0 && op <= 2);
if (op == 0) {
int l_tmp = 0;
for (j = 0; j < len; ++j) {
if (qseq[q_off + j] != tseq[t_off + j]) {
if (l_tmp > 0) {
tmp[l_tmp] = 0;
mm_sprintf_lite(s, "=%s", tmp);
l_tmp = 0;
}
mm_sprintf_lite(s, "*%c%c", "acgtn"[tseq[t_off + j]], "acgtn"[qseq[q_off + j]]);
} else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]];
}
if (l_tmp > 0) {
tmp[l_tmp] = 0;
mm_sprintf_lite(s, "=%s", tmp);
}
q_off += len, t_off += len;
} else if (op == 1) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "acgtn"[qseq[q_off + j]];
mm_sprintf_lite(s, "+%s", tmp);
q_off += len;
} else if (op == 2) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "acgtn"[tseq[t_off + j]];
mm_sprintf_lite(s, "-%s", tmp);
t_off += len;
}
}
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
}
static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
{
int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S';
@ -63,7 +122,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
}
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag)
{
s->l = 0;
mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
@ -74,12 +133,14 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
else mm_sprintf_lite(s, "\t%d\t%d", r->fuzzy_mlen, r->fuzzy_blen);
mm_sprintf_lite(s, "\t%d", r->mapq);
write_tags(s, r);
if (r->p) {
if (r->p && (opt_flag & MM_F_OUT_CG)) {
uint32_t k;
mm_sprintf_lite(s, "\tcg:Z:");
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]);
}
if (r->p && (opt_flag & MM_F_OUT_CS))
write_cs(km, s, mi, t, r);
}
static char comp_tab[] = {

8
main.c
View File

@ -8,7 +8,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0rc1-r232"
#define MM_VERSION "2.0rc1-r238-dirty"
void liftrlimit()
{
@ -54,7 +54,7 @@ int main(int argc, char *argv[])
mm_realtime0 = realtime();
mm_mapopt_init(&opt);
while ((c = getopt_long(argc, argv, "aw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
if (c == 'w') w = atoi(optarg), idx_par_set = 1;
else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
else if (c == 'H') is_hpc = 1, idx_par_set = 1;
@ -67,7 +67,8 @@ int main(int argc, char *argv[])
else if (c == 'N') opt.best_n = atoi(optarg);
else if (c == 'p') opt.pri_ratio = atof(optarg);
else if (c == 'M') opt.mask_level = atof(optarg);
else if (c == 'c') opt.flag |= MM_F_CIGAR;
else if (c == 'c') opt.flag |= MM_F_OUT_CG | MM_F_CIGAR;
else if (c == 'S') opt.flag |= MM_F_OUT_CS | MM_F_CIGAR;
else if (c == 'X') opt.flag |= MM_F_AVA | MM_F_NO_SELF;
else if (c == 'a') opt.flag |= MM_F_OUT_SAM | MM_F_CIGAR;
else if (c == 'Q') opt.flag |= MM_F_NO_QUAL;
@ -166,6 +167,7 @@ int main(int argc, char *argv[])
fprintf(stderr, " -Q ignore base quality in the input\n");
fprintf(stderr, " -a output in the SAM format (PAF by default)\n");
fprintf(stderr, " -c output CIGAR in PAF\n");
fprintf(stderr, " -S output the cs tag in PAF\n");
fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
fprintf(stderr, " -K NUM minibatch size [200M]\n");
// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);

5
map.c
View File

@ -338,16 +338,18 @@ static void *worker_pipeline(void *shared, int step, void *in)
kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq);
return in;
} else if (step == 2) { // step 2: output
void *km = 0;
step_t *s = (step_t*)in;
const mm_idx_t *mi = p->mi;
for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]);
free(s->buf);
if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init();
for (i = 0; i < s->n_seq; ++i) {
mm_bseq1_t *t = &s->seq[i];
for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j];
if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
else mm_write_paf(&p->str, mi, t, r);
else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
puts(p->str.s);
}
if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) {
@ -360,6 +362,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
if (s->seq[i].qual) free(s->seq[i].qual);
}
free(s->reg); free(s->n_reg); free(s->seq);
km_destroy(km);
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq);
free(s);

View File

@ -12,6 +12,8 @@
#define MM_F_CIGAR 0x04
#define MM_F_OUT_SAM 0x08
#define MM_F_NO_QUAL 0x10
#define MM_F_OUT_CG 0x20
#define MM_F_OUT_CS 0x40
#define MM_IDX_MAGIC "MMI\2"

171
misc/paf2aln.js 100644
View File

@ -0,0 +1,171 @@
var getopt = function(args, ostr) {
var oli; // option letter list index
if (typeof(getopt.place) == 'undefined')
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
if (getopt.place == -1) { // update scanning pointer
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
getopt.place = -1;
return null;
}
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
++getopt.ind;
getopt.place = -1;
return null;
}
}
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
if (getopt.place < 0) ++getopt.ind;
return '?';
}
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
getopt.arg = null;
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
} else { // need an argument
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
getopt.arg = args[getopt.ind].substr(getopt.place);
else if (args.length <= ++getopt.ind) { // no arg
getopt.place = -1;
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
return '?';
} else getopt.arg = args[getopt.ind]; // white space
getopt.place = -1;
++getopt.ind;
}
return optopt;
}
var c, maf_out = false, line_len = 80;
while ((c = getopt(arguments, "ml:")) != null) {
if (c == 'm') maf_out = true;
else if (c == 'l') line_len = parseInt(getopt.arg); // TODO: not implemented yet
}
if (line_len == 0) line_len = 0x7fffffff;
if (getopt.ind == arguments.length) {
print("Usage: k8 paf2aln.js [options] <with-cs.paf>");
print("Options:");
print(" -m MAF output (BLAST-like output by default)");
print(" -l INT line length in BLAST-like output [80]");
print("");
print("Note: this script only works when minimap2 is run with option '-S'");
exit(1);
}
function padding_str(x, len, right)
{
var s = x.toString();
if (s.length < len) {
if (right) s += Array(len - s.length + 1).join(" ");
else s = Array(len - s.length + 1).join(" ") + s;
}
return s;
}
function update_aln(s_ref, s_qry, s_mid, type, seq, slen)
{
var l = type == '*'? 1 : seq.length;
if (type == '=') {
s_ref.set(seq);
s_qry.set(seq);
s_mid.set(Array(l+1).join("|"));
slen[0] += l, slen[1] += l;
} else if (type == '*') {
s_ref.set(seq.charAt(0));
s_qry.set(seq.charAt(1));
s_mid.set(' ');
slen[0] += 1, slen[1] += 1;
} else if (type == '+') {
s_ref.set(Array(l+1).join("-"));
s_qry.set(seq);
s_mid.set(Array(l+1).join(" "));
slen[1] += l;
} else if (type == '-') {
s_ref.set(seq);
s_qry.set(Array(l+1).join("-"));
s_mid.set(Array(l+1).join(" "));
slen[0] += l;
}
}
function print_aln(rs, qs, strand, slen, elen, s_ref, s_qry, s_mid)
{
print(["Ref+:", padding_str(rs + slen[0] + 1, 10, false), s_ref.toString(), padding_str(rs + elen[0], 10, true)].join(" "));
print(" " + s_mid.toString());
var st, en;
if (strand == '+') st = qs + slen[1] + 1, en = qs + elen[1];
else st = qs - slen[1], en = qs - elen[1] + 1;
print(["Qry" + strand + ":", padding_str(st, 10, false), s_qry.toString(), padding_str(en , 10, true)].join(" "));
}
var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes();
var re = /([=\-\+\*])([A-Za-z]+)/g;
var buf = new Bytes();
var file = new File(arguments[getopt.ind]);
if (maf_out) print("##maf version=1\n");
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
var t = line.split("\t", 12);
if ((m = /\tcs:Z:(\S+)/.exec(line)) == null) continue;
var cs = m[1];
s_ref.length = s_qry.length = s_mid.length = 0;
var slen = [0, 0], elen = [0, 0];
if (maf_out) {
while ((m = re.exec(cs)) != null)
update_aln(s_ref, s_qry, s_mid, m[1], m[2], elen);
if (maf_out) {
var score = (m = /\tAS:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
var len = t[0].length > t[5].length? t[0].length : t[5].length;
print("a " + score);
print(["s", padding_str(t[5], len, true), padding_str(t[7], 10, false), padding_str(parseInt(t[8]) - parseInt(t[7]), 10, false),
"+", padding_str(t[6], 10, false), s_ref.toString()].join(" "));
var qs, qe, ql = parseInt(t[1]);
if (t[4] == '+') {
qs = parseInt(t[2]);
qe = parseInt(t[3]);
} else {
qs = ql - parseInt(t[3]);
qe = ql - parseInt(t[2]);
}
print(["s", padding_str(t[0], len, true), padding_str(qs, 10, false), padding_str(qe - qs, 10, false),
t[4], padding_str(ql, 10, false), s_qry.toString()].join(" "));
print("");
}
} else {
line = line.replace(/\tc[sg]:Z:\S+/g, "");
print('>' + line);
var rs = parseInt(t[7]), qs = t[4] == '+'? parseInt(t[2]) : parseInt(t[3]);
var n_blocks = 0;
while ((m = re.exec(cs)) != null) {
var start = 0, rest = m[1] == '*'? 1 : m[2].length;
while (rest > 0) {
var l_proc;
if (s_ref.length + rest >= line_len) {
l_proc = line_len - s_ref.length;
update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen);
if (n_blocks > 0) print("");
print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid);
++n_blocks;
s_ref.length = s_qry.length = s_mid.length = 0;
slen[0] = elen[0], slen[1] = elen[1];
} else {
l_proc = rest;
update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen);
}
rest -= l_proc, start += l_proc;
}
}
if (s_ref.length > 0) {
if (n_blocks > 0) print("");
print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid);
++n_blocks;
}
print("//");
}
}
file.close();
buf.destroy();
s_ref.destroy(); s_qry.destroy(); s_mid.destroy();

View File

@ -40,7 +40,7 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);