diff --git a/Makefile b/Makefile index 4118616..90b1cc4 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,10 @@ PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread +ifneq ($(aarch64),) + arm_neon=1 +endif + ifeq ($(arm_neon),) # if arm_neon is not defined ifeq ($(sse2only),) # if sse2only is not defined OBJS+=ksw2_extz2_sse41.o ksw2_extd2_sse41.o ksw2_exts2_sse41.o ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o diff --git a/format.c b/format.c index ac7cd8e..e588814 100644 --- a/format.c +++ b/format.c @@ -119,6 +119,7 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a { kstring_t str = {0,0,0}; int ret = 0; + mm_sprintf_lite(&str, "@HD\tVN:1.6\tSO:unsorted\tGO:query\n"); if (idx) { uint32_t i; for (i = 0; i < idx->n_seq; ++i) diff --git a/index.c b/index.c index 0d8a2ae..c9bd01f 100644 --- a/index.c +++ b/index.c @@ -358,7 +358,7 @@ static void *worker_pipeline(void *shared, int step, void *in) for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; if (t->l_seq > 0) - mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a); + mm_sketch2(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, p->mi->flag&MM_I_SYNCMER, &s->a); else if (mm_verbose >= 2) fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name); free(t->seq); free(t->name); @@ -446,7 +446,7 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha sum_len += p->len; if (p->len > 0) { a.n = 0; - mm_sketch(0, s, p->len, w, k, i, is_hpc, &a); + mm_sketch2(0, s, p->len, w, k, i, is_hpc, 0, &a); // TODO: mm_idx_str() doesn't support syncmer mm_idx_add(mi, a.n, a.a); } } diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index 162e9e2..8f96eb3 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -358,7 +358,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin } else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0 // update ez if (en0 == tlen - 1 && H[en0] > ez->mte) - ez->mte = H[en0], ez->mte_q = r - en; + ez->mte = H[en0], ez->mte_q = r - en0; if (r - st0 == qlen - 1 && H[st0] > ez->mqe) ez->mqe = H[st0], ez->mqe_t = st0; if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e2)) break; diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index c5f9e76..86016cb 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -465,7 +465,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin } else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0 // update ez if (en0 == tlen - 1 && H[en0] > ez->mte) - ez->mte = H[en0], ez->mte_q = r - en; + ez->mte = H[en0], ez->mte_q = r - en0; if (r - st0 == qlen - 1 && H[st0] > ez->mqe) ez->mqe = H[st0], ez->mqe_t = st0; if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, 0)) break; diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index ad19131..a2154fe 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -269,7 +269,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin } else H[0] = v8[0] - qe - qe, max_H = H[0], max_t = 0; // special casing r==0 // update ez if (en0 == tlen - 1 && H[en0] > ez->mte) - ez->mte = H[en0], ez->mte_q = r - en; + ez->mte = H[en0], ez->mte_q = r - en0; if (r - st0 == qlen - 1 && H[st0] > ez->mqe) ez->mqe = H[st0], ez->mqe_t = st0; if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e)) break; diff --git a/lchain.c b/lchain.c index d004157..244d301 100644 --- a/lchain.c +++ b/lchain.c @@ -138,7 +138,7 @@ static inline int32_t comput_sc(const mm128_t *ai, const mm128_t *aj, int32_t ma } /* Input: - * a[].x: tid<<33 | rev<<32 | tpos + * a[].x: rev<<63 | tid<<32 | tpos * a[].y: flags<<40 | q_span<<32 | q_pos * Output: * n_u: #chains @@ -251,7 +251,7 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) { int32_t *f,*t, *v, n_u, n_v, mmax_f = 0, max_rmq_size = 0, max_drop = bw; - int64_t *p, i, i0, st = 0, st_inner = 0, n_iter = 0; + int64_t *p, i, i0, st = 0, st_inner = 0; uint64_t *u; lc_elem_t *root = 0, *root_inner = 0; void *mem_mp = 0; @@ -345,7 +345,6 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski } if (!krmq_itr_prev(lc_elem, &itr)) break; } - n_iter += n_rmq_iter; } } } diff --git a/main.c b/main.c index 87be537..1e37d33 100644 --- a/main.c +++ b/main.c @@ -7,8 +7,6 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.24-r1122" - #ifdef __linux__ #include #include @@ -122,7 +120,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:j:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -154,7 +152,8 @@ int main(int argc, char *argv[]) o = KETOPT_INIT; while ((c = ketopt(&o, argc, argv, 1, opt_str, long_options)) >= 0) { - if (c == 'w') ipt.w = atoi(o.arg); + if (c == 'w') ipt.w = atoi(o.arg), ipt.flag &= ~MM_I_SYNCMER; + else if (c == 'j') ipt.w = atoi(o.arg), ipt.flag |= MM_I_SYNCMER; else if (c == 'k') ipt.k = atoi(o.arg); else if (c == 'H') ipt.flag |= MM_I_HPC; else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I @@ -263,7 +262,8 @@ int main(int argc, char *argv[]) } else if (c == 326) { // --dual yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0); } else if (c == 347) { // --rmq - yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1); + if (o.arg) yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1); + else opt.flag |= MM_F_RMQ; } else if (c == 'S') { opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; if (mm_verbose >= 2) @@ -324,6 +324,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -H use homopolymer-compressed k-mer (preferrable for PacBio)\n"); fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", ipt.k); fprintf(fp_help, " -w INT minimizer window size [%d]\n", ipt.w); + fprintf(fp_help, " -j INT syncmer submer size (overriding -w) []\n"); fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n"); fprintf(fp_help, " -d FILE dump index to FILE []\n"); fprintf(fp_help, " Mapping:\n"); diff --git a/map.c b/map.c index 5311468..2342c9e 100644 --- a/map.c +++ b/map.c @@ -67,7 +67,7 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t mv->n = 0; for (i = n = 0; i < n_segs; ++i) { size_t j; - mm_sketch(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mv); + mm_sketch2(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mi->flag&MM_I_SYNCMER, mv); for (j = n; j < mv->n; ++j) mv->a[j].y += sum << 1; if (opt->sdust_thres > 0) // mask low-complexity minimizers diff --git a/minimap.h b/minimap.h index 9b24ed4..79a76ce 100644 --- a/minimap.h +++ b/minimap.h @@ -5,6 +5,8 @@ #include #include +#define MM_VERSION "2.24-r1155-dirty" + #define MM_F_NO_DIAG 0x001 // no exact diagonal hit #define MM_F_NO_DUAL 0x002 // skip pairs where query name is lexicographically larger than target name #define MM_F_CIGAR 0x004 @@ -45,6 +47,7 @@ #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 #define MM_I_NO_NAME 0x4 +#define MM_I_SYNCMER 0x8 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index 8a759db..08d9c94 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -79,6 +79,19 @@ Minimizer k-mer length [15] .BI -w \ INT Minimizer window size [10]. A minimizer is the smallest k-mer in a window of w consecutive k-mers. +.TP +.BI -j \ INT +Syncmer submer size [10]. Option +.B -j +and +.B -w +will override each: if +.B -w +is applied after +.BR -j , +.B -j +will have no effect, and vice versa. + .TP .B -H Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by diff --git a/misc/paftools.js b/misc/paftools.js index 6a186b4..bc2f29d 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.24-r1141-dirty'; +var paftools_version = '2.24-r1152-dirty'; /***************************** ***** Library functions ***** @@ -2554,6 +2554,276 @@ function paf_junceval(args) } } +function paf_exoneval(args) // adapted from paf_junceval() +{ + var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false, use_cds = false, eval_base = false; + while ((c = getopt(args, "l:epcab1ds")) != null) { + if (c == 'l') l_fuzzy = parseInt(getopt.arg); + else if (c == 'e') print_err_only = print_ovlp = true; + else if (c == 'p') print_ovlp = true; + else if (c == 'c') chr_only = true; + else if (c == 'a') aa = true, use_cds = true; + else if (c == 'b') is_bed = true; + else if (c == '1') first_only = true; + else if (c == 'd') use_cds = true; + else if (c == 's') eval_base = true; + } + + if (args.length - getopt.ind < 1) { + print("Usage: paftools.js exoneval [options] "); + print("Options:"); + print(" -l INT tolerance of junction positions (0 for exact) [0]"); + print(" -d evaluate coding regions only (exon regions by default)"); + print(" -a miniprot PAF as input (force -d)"); + print(" -p print overlapping exons"); + print(" -e print erroreous overlapping exons"); + print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/"); + print(" -1 only process the first alignment of each query"); + print(" -b BED as input"); + print(" -s compute base Sn and Sp (more memory)"); + exit(1); + } + + var file, buf = new Bytes(); + + warn("Reading reference GTF..."); + var tr = {}; + file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]); + while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + if (t[0].charAt(0) == '#') continue; + if (use_cds) { + if (t[2] != "cds" && t[2] != "CDS") continue; + } else { + if (t[2] != 'exon') continue; + } + var st = parseInt(t[3]) - 1; + var en = parseInt(t[4]); + if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue; + var tid = m[1]; + if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []]; + tr[tid][4].push([st, en]); // this keeps transcript + } + file.close(); + + var anno = {}; + for (var tid in tr) { // traverse each transcript + var t = tr[tid]; + Interval.sort(t[4]); + t[2] = t[4][0][0]; + t[3] = t[4][t[4].length - 1][1]; + if (anno[t[0]] == null) anno[t[0]] = []; + var s = t[4]; + for (var i = 0; i < s.length; ++i) // traverse each exon + anno[t[0]].push([s[i][0], s[i][1]]); + } + tr = null; + + for (var chr in anno) { // index exons + var e = anno[chr]; + if (e.length == 0) continue; + Interval.sort(e); + var k = 0; + for (var i = 1; i < e.length; ++i) // dedup + if (e[i][0] != e[k][0] || e[i][1] != e[k][1]) + e[++k] = e[i].slice(0); + e.length = k + 1; + Interval.index_end(e); + } + + var n_pri = 0, n_unmapped = 0, n_mapped = 0; + var n_exon = 0, n_exon_hit = 0, n_exon_novel = 0; + + file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]); + var last_qname = null, qexon = {}; + var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g; + + warn("Evaluating alignments..."); + while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + var ctg_name = null, cigar = null, pos = null, qname; + + if (t[0].charAt(0) == '@') continue; + if (t[0] == "##PAF") t.shift(); + qname = t[0]; + if (is_bed) { + ctg_name = t[0], pos = parseInt(t[1]), cigar == null; + } else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF + ctg_name = t[5], pos = parseInt(t[7]); + var type = 'P'; + for (i = 12; i < t.length; ++i) { + if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) { + if (m[1] == 'tp:A') type = m[2]; + else cigar = m[2]; + } + } + if (type == 'S') continue; // secondary + } else { // SAM + ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5]; + var flag = parseInt(t[1]); + if (flag&0x100) continue; // secondary + } + + if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue; + if (first_only && last_qname == qname) continue; + if (ctg_name == '*') { // unmapped + ++n_unmapped; + continue; + } else { + ++n_pri; + if (last_qname != qname) { + ++n_mapped; + last_qname = qname; + } + } + + var exon = []; + if (is_bed) { // BED + exon.push([pos, parseInt(t[2])]); + } else if (aa) { + var tmp_exon = [], tmp = 0, tmp_st = 0; + while ((m = re_cigar.exec(cigar)) != null) { + var len = parseInt(m[1]), op = m[2]; + if (op == 'N') { + tmp_exon.push([tmp_st, tmp]); + tmp_st = tmp + len, tmp += len; + } else if (op == 'U') { + tmp_exon.push([tmp_st, tmp + 1]); + tmp_st = tmp + len - 2, tmp += len; + } else if (op == 'V') { + tmp_exon.push([tmp_st, tmp + 2]); + tmp_st = tmp + len - 1, tmp += len; + } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') { + tmp += len * 3; + } else if (op == 'F' || op == 'G') { + tmp += len; + } + } + tmp_exon.push([tmp_st, tmp]); + if (t[4] == '+') { + for (var i = 0; i < tmp_exon.length; ++i) + exon.push([pos + tmp_exon[i][0], pos + tmp_exon[i][1]]); + } else if (t[4] == '-') { // For protein-to-genome alignment, the coordinates are on the query strand. Need to flip them. + var glen = parseInt(t[8]) - parseInt(t[7]); + for (var i = tmp_exon.length - 1; i >= 0; --i) + exon.push([pos + (glen - tmp_exon[i][1]), pos + (glen - tmp_exon[i][0])]); + } + } else { + var tmp_st = pos; + while ((m = re_cigar.exec(cigar)) != null) { + var len = parseInt(m[1]), op = m[2]; + if (op == 'N') { + exon.push([tmp_st, pos]); + tmp_st = pos + len, pos += len; + } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len; + } + exon.push([tmp_st, pos]); + } + n_exon += exon.length; + + var chr = anno[ctg_name]; + if (chr != null) { + for (var i = 0; i < exon.length; ++i) { + if (eval_base) { + if (qexon[ctg_name] == null) qexon[ctg_name] = []; + qexon[ctg_name].push([exon[i][0], exon[i][1]]); + } + var o = Interval.find_ovlp(chr, exon[i][0], exon[i][1]); + if (o.length > 0) { + var hit = false; + for (var j = 0; j < o.length; ++j) { + var st_diff = exon[i][0] - o[j][0]; + var en_diff = exon[i][1] - o[j][1]; + if (st_diff < 0) st_diff = -st_diff; + if (en_diff < 0) en_diff = -en_diff; + if (st_diff <= l_fuzzy && en_diff <= l_fuzzy) + ++n_exon_hit, hit = true; + if (hit) break; + } + if (print_ovlp) { + var type = hit? 'C' : 'P'; + if (hit && print_err_only) continue; + var x = '['; + for (var j = 0; j < o.length; ++j) { + if (j) x += ', '; + x += '(' + o[j][0] + "," + o[j][1] + ')'; + } + x += ']'; + print(type, qname, i+1, ctg_name, exon[i][0], exon[i][1], x); + } + } else { + ++n_exon_novel; + if (print_ovlp) + print('N', qname, i+1, ctg_name, exon[i][0], exon[i][1]); + } + } + } else { + n_exon_novel += exon.length; + } + } + file.close(); + + buf.destroy(); + + if (!print_ovlp) { + print("# unmapped reads: " + n_unmapped); + print("# mapped reads: " + n_mapped); + print("# primary alignments: " + n_pri); + print("# predicted exons: " + n_exon); + print("# non-overlapping exons: " + n_exon_novel); + print("# correct exons: " + n_exon_hit + " (" + (n_exon_hit / n_exon * 100).toFixed(2) + "%)"); + } + + function merge_and_index(ex) { + for (var chr in ex) { + var a = []; + e = ex[chr]; + Interval.sort(e); + var st = e[0][0], en = e[0][1]; + for (var i = 1; i < e.length; ++i) { // merge + if (e[i][0] > en) { + a.push([st, en]); + st = e[i][0], en = e[i][1]; + } else { + en = en > e[i][1]? en : e[i][1]; + } + } + a.push([st, en]); + Interval.index_end(a); + ex[chr] = a; + } + } + + function cal_sn(a0, a1) { + var tot = 0, cov = 0; + for (var chr in a1) { + var e0 = a0[chr], e1 = a1[chr]; + for (var i = 0; i < e1.length; ++i) + tot += e1[i][1] - e1[i][0]; + if (e0 == null) continue; + for (var i = 0; i < e1.length; ++i) { + var o = Interval.find_ovlp(e0, e1[i][0], e1[i][1]); + for (var j = 0; j < o.length; ++j) { // this only works when there are no overlaps between intervals + var st = e1[i][0] > o[j][0]? e1[i][0] : o[j][0]; + var en = e1[i][1] < o[j][1]? e1[i][1] : o[j][1]; + cov += en - st; + } + } + } + return [tot, cov]; + } + + if (eval_base) { + warn("Computing base Sn and Sp..."); + merge_and_index(qexon); + merge_and_index(anno); + var sn = cal_sn(qexon, anno); + var sp = cal_sn(anno, qexon); + print("Base Sn: " + sn[1] + " / " + sn[0] + " = " + (sn[1] / sn[0] * 100).toFixed(2) + "%"); + print("Base Sp: " + sp[1] + " / " + sp[0] + " = " + (sp[1] / sp[0] * 100).toFixed(2) + "%"); + } +} + // evaluate overlap sensitivity function paf_ov_eval(args) { @@ -2752,7 +3022,6 @@ function paf_misjoin(args) function test_cen_point(cen, chr, x) { var b = cen[chr]; if (b == null) return false; - print(x, b[0][0], b[0][1]); for (var j = 0; j < b.length; ++j) if (x >= b[j][0] && x < b[j][1]) return true; @@ -3362,6 +3631,7 @@ function main(args) print(" mason2fq convert mason2-simulated SAM to FASTQ"); print(" pbsim2fq convert PBSIM-simulated MAF to FASTQ"); print(" junceval evaluate splice junction consistency with known annotations"); + print(" exoneval evaluate exon-level consistency with known annotations"); print(" ov-eval evaluate read overlap sensitivity using read-to-ref mapping"); exit(1); } @@ -3386,6 +3656,7 @@ function main(args) else if (cmd == 'mason2fq') paf_mason2fq(args); else if (cmd == 'pbsim2fq') paf_pbsim2fq(args); else if (cmd == 'junceval') paf_junceval(args); + else if (cmd == 'exoneval') paf_exoneval(args); else if (cmd == 'ov-eval') paf_ov_eval(args); else if (cmd == 'vcfstat') paf_vcfstat(args); else if (cmd == 'sveval') paf_sveval(args); diff --git a/mmpriv.h b/mmpriv.h index 2f5034b..7b51b98 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -60,6 +60,8 @@ void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); +void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p); +void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p); mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac); diff --git a/sketch.c b/sketch.c index f830693..1f6b7da 100644 --- a/sketch.c +++ b/sketch.c @@ -141,3 +141,58 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i if (min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min); } + +void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p) +{ + uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, smask = (1ULL<<2*smer) - 1, kmer[2] = {0,0}; + int i, j, l, buf_pos, min_pos, kmer_span = 0; + tiny_queue_t tq; + + assert(len > 0 && (smer > 0 && smer <= k) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice + memset(&tq, 0, sizeof(tiny_queue_t)); + kv_resize(mm128_t, km, *p, p->n + len/(k - smer)); + + for (i = l = buf_pos = min_pos = 0; i < len; ++i) { + int c = seq_nt4_table[(uint8_t)str[i]]; + if (c < 4) { // not an ambiguous base + int z; + if (is_hpc) { + int skip_len = 1; + if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) { + for (skip_len = 2; i + skip_len < len; ++skip_len) + if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c) + break; + i += skip_len - 1; // put $i at the end of the current homopolymer run + } + tq_push(&tq, skip_len); + kmer_span += skip_len; + if (tq.count > k) kmer_span -= tq_shift(&tq); + } else kmer_span = l + 1 < k? l + 1 : k; + kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer + kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer + if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand + z = kmer[0] < kmer[1]? 0 : 1; // strand + ++l; + if (l >= k && kmer_span < 256) { + uint64_t x, min = UINT64_MAX; + x = hash64(kmer[z], mask); + for (j = 0; j <= k - smer; ++j) { + uint64_t y = x >> (j + j) & smask; + min = min < y? min : y; + } + if ((x & smask) == min) { + mm128_t t; + t.x = x << 8 | kmer_span; + t.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z; + kv_push(mm128_t, km, *p, t); + } + } + } else l = 0, tq.count = tq.front = 0, kmer_span = 0; + } +} + +void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p) +{ + if (is_syncmer) mm_sketch_syncmer(km, str, len, w, k, rid, is_hpc, p); + else mm_sketch(km, str, len, w, k, rid, is_hpc, p); +}