Merge branch 'master' into splice-model

This commit is contained in:
Heng Li 2023-04-06 21:09:18 -04:00
commit 35732f3025
14 changed files with 365 additions and 16 deletions

View File

@ -8,6 +8,10 @@ PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread
ifneq ($(aarch64),)
arm_neon=1
endif
ifeq ($(arm_neon),) # if arm_neon is not defined
ifeq ($(sse2only),) # if sse2only is not defined
OBJS+=ksw2_extz2_sse41.o ksw2_extd2_sse41.o ksw2_exts2_sse41.o ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o

View File

@ -119,6 +119,7 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a
{
kstring_t str = {0,0,0};
int ret = 0;
mm_sprintf_lite(&str, "@HD\tVN:1.6\tSO:unsorted\tGO:query\n");
if (idx) {
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)

View File

@ -358,7 +358,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
for (i = 0; i < s->n_seq; ++i) {
mm_bseq1_t *t = &s->seq[i];
if (t->l_seq > 0)
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a);
mm_sketch2(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, p->mi->flag&MM_I_SYNCMER, &s->a);
else if (mm_verbose >= 2)
fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name);
free(t->seq); free(t->name);
@ -446,7 +446,7 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha
sum_len += p->len;
if (p->len > 0) {
a.n = 0;
mm_sketch(0, s, p->len, w, k, i, is_hpc, &a);
mm_sketch2(0, s, p->len, w, k, i, is_hpc, 0, &a); // TODO: mm_idx_str() doesn't support syncmer
mm_idx_add(mi, a.n, a.a);
}
}

View File

@ -358,7 +358,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
} else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0
// update ez
if (en0 == tlen - 1 && H[en0] > ez->mte)
ez->mte = H[en0], ez->mte_q = r - en;
ez->mte = H[en0], ez->mte_q = r - en0;
if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
ez->mqe = H[st0], ez->mqe_t = st0;
if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e2)) break;

View File

@ -465,7 +465,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
} else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0
// update ez
if (en0 == tlen - 1 && H[en0] > ez->mte)
ez->mte = H[en0], ez->mte_q = r - en;
ez->mte = H[en0], ez->mte_q = r - en0;
if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
ez->mqe = H[st0], ez->mqe_t = st0;
if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, 0)) break;

View File

@ -269,7 +269,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
} else H[0] = v8[0] - qe - qe, max_H = H[0], max_t = 0; // special casing r==0
// update ez
if (en0 == tlen - 1 && H[en0] > ez->mte)
ez->mte = H[en0], ez->mte_q = r - en;
ez->mte = H[en0], ez->mte_q = r - en0;
if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
ez->mqe = H[st0], ez->mqe_t = st0;
if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e)) break;

View File

@ -138,7 +138,7 @@ static inline int32_t comput_sc(const mm128_t *ai, const mm128_t *aj, int32_t ma
}
/* Input:
* a[].x: tid<<33 | rev<<32 | tpos
* a[].x: rev<<63 | tid<<32 | tpos
* a[].y: flags<<40 | q_span<<32 | q_pos
* Output:
* n_u: #chains
@ -251,7 +251,7 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
{
int32_t *f,*t, *v, n_u, n_v, mmax_f = 0, max_rmq_size = 0, max_drop = bw;
int64_t *p, i, i0, st = 0, st_inner = 0, n_iter = 0;
int64_t *p, i, i0, st = 0, st_inner = 0;
uint64_t *u;
lc_elem_t *root = 0, *root_inner = 0;
void *mem_mp = 0;
@ -345,7 +345,6 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
}
if (!krmq_itr_prev(lc_elem, &itr)) break;
}
n_iter += n_rmq_iter;
}
}
}

11
main.c
View File

@ -7,8 +7,6 @@
#include "mmpriv.h"
#include "ketopt.h"
#define MM_VERSION "2.24-r1122"
#ifdef __linux__
#include <sys/resource.h>
#include <sys/time.h>
@ -122,7 +120,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
int main(int argc, char *argv[])
{
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:";
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:j:";
ketopt_t o = KETOPT_INIT;
mm_mapopt_t opt;
mm_idxopt_t ipt;
@ -154,7 +152,8 @@ int main(int argc, char *argv[])
o = KETOPT_INIT;
while ((c = ketopt(&o, argc, argv, 1, opt_str, long_options)) >= 0) {
if (c == 'w') ipt.w = atoi(o.arg);
if (c == 'w') ipt.w = atoi(o.arg), ipt.flag &= ~MM_I_SYNCMER;
else if (c == 'j') ipt.w = atoi(o.arg), ipt.flag |= MM_I_SYNCMER;
else if (c == 'k') ipt.k = atoi(o.arg);
else if (c == 'H') ipt.flag |= MM_I_HPC;
else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I
@ -263,7 +262,8 @@ int main(int argc, char *argv[])
} else if (c == 326) { // --dual
yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0);
} else if (c == 347) { // --rmq
yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1);
if (o.arg) yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1);
else opt.flag |= MM_F_RMQ;
} else if (c == 'S') {
opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG;
if (mm_verbose >= 2)
@ -324,6 +324,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -H use homopolymer-compressed k-mer (preferrable for PacBio)\n");
fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", ipt.k);
fprintf(fp_help, " -w INT minimizer window size [%d]\n", ipt.w);
fprintf(fp_help, " -j INT syncmer submer size (overriding -w) []\n");
fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n");
fprintf(fp_help, " -d FILE dump index to FILE []\n");
fprintf(fp_help, " Mapping:\n");

2
map.c
View File

@ -67,7 +67,7 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t
mv->n = 0;
for (i = n = 0; i < n_segs; ++i) {
size_t j;
mm_sketch(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mv);
mm_sketch2(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mi->flag&MM_I_SYNCMER, mv);
for (j = n; j < mv->n; ++j)
mv->a[j].y += sum << 1;
if (opt->sdust_thres > 0) // mask low-complexity minimizers

View File

@ -5,6 +5,8 @@
#include <stdio.h>
#include <sys/types.h>
#define MM_VERSION "2.24-r1155-dirty"
#define MM_F_NO_DIAG 0x001 // no exact diagonal hit
#define MM_F_NO_DUAL 0x002 // skip pairs where query name is lexicographically larger than target name
#define MM_F_CIGAR 0x004
@ -45,6 +47,7 @@
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2
#define MM_I_NO_NAME 0x4
#define MM_I_SYNCMER 0x8
#define MM_IDX_MAGIC "MMI\2"

View File

@ -79,6 +79,19 @@ Minimizer k-mer length [15]
.BI -w \ INT
Minimizer window size [10]. A minimizer is the smallest k-mer
in a window of w consecutive k-mers.
.TP
.BI -j \ INT
Syncmer submer size [10]. Option
.B -j
and
.B -w
will override each: if
.B -w
is applied after
.BR -j ,
.B -j
will have no effect, and vice versa.
.TP
.B -H
Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by

View File

@ -1,6 +1,6 @@
#!/usr/bin/env k8
var paftools_version = '2.24-r1141-dirty';
var paftools_version = '2.24-r1152-dirty';
/*****************************
***** Library functions *****
@ -2554,6 +2554,276 @@ function paf_junceval(args)
}
}
function paf_exoneval(args) // adapted from paf_junceval()
{
var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false, use_cds = false, eval_base = false;
while ((c = getopt(args, "l:epcab1ds")) != null) {
if (c == 'l') l_fuzzy = parseInt(getopt.arg);
else if (c == 'e') print_err_only = print_ovlp = true;
else if (c == 'p') print_ovlp = true;
else if (c == 'c') chr_only = true;
else if (c == 'a') aa = true, use_cds = true;
else if (c == 'b') is_bed = true;
else if (c == '1') first_only = true;
else if (c == 'd') use_cds = true;
else if (c == 's') eval_base = true;
}
if (args.length - getopt.ind < 1) {
print("Usage: paftools.js exoneval [options] <gene.gtf> <aln.sam>");
print("Options:");
print(" -l INT tolerance of junction positions (0 for exact) [0]");
print(" -d evaluate coding regions only (exon regions by default)");
print(" -a miniprot PAF as input (force -d)");
print(" -p print overlapping exons");
print(" -e print erroreous overlapping exons");
print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
print(" -1 only process the first alignment of each query");
print(" -b BED as input");
print(" -s compute base Sn and Sp (more memory)");
exit(1);
}
var file, buf = new Bytes();
warn("Reading reference GTF...");
var tr = {};
file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
if (t[0].charAt(0) == '#') continue;
if (use_cds) {
if (t[2] != "cds" && t[2] != "CDS") continue;
} else {
if (t[2] != 'exon') continue;
}
var st = parseInt(t[3]) - 1;
var en = parseInt(t[4]);
if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue;
var tid = m[1];
if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []];
tr[tid][4].push([st, en]); // this keeps transcript
}
file.close();
var anno = {};
for (var tid in tr) { // traverse each transcript
var t = tr[tid];
Interval.sort(t[4]);
t[2] = t[4][0][0];
t[3] = t[4][t[4].length - 1][1];
if (anno[t[0]] == null) anno[t[0]] = [];
var s = t[4];
for (var i = 0; i < s.length; ++i) // traverse each exon
anno[t[0]].push([s[i][0], s[i][1]]);
}
tr = null;
for (var chr in anno) { // index exons
var e = anno[chr];
if (e.length == 0) continue;
Interval.sort(e);
var k = 0;
for (var i = 1; i < e.length; ++i) // dedup
if (e[i][0] != e[k][0] || e[i][1] != e[k][1])
e[++k] = e[i].slice(0);
e.length = k + 1;
Interval.index_end(e);
}
var n_pri = 0, n_unmapped = 0, n_mapped = 0;
var n_exon = 0, n_exon_hit = 0, n_exon_novel = 0;
file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
var last_qname = null, qexon = {};
var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;
warn("Evaluating alignments...");
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
var ctg_name = null, cigar = null, pos = null, qname;
if (t[0].charAt(0) == '@') continue;
if (t[0] == "##PAF") t.shift();
qname = t[0];
if (is_bed) {
ctg_name = t[0], pos = parseInt(t[1]), cigar == null;
} else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
ctg_name = t[5], pos = parseInt(t[7]);
var type = 'P';
for (i = 12; i < t.length; ++i) {
if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) {
if (m[1] == 'tp:A') type = m[2];
else cigar = m[2];
}
}
if (type == 'S') continue; // secondary
} else { // SAM
ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
var flag = parseInt(t[1]);
if (flag&0x100) continue; // secondary
}
if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
if (first_only && last_qname == qname) continue;
if (ctg_name == '*') { // unmapped
++n_unmapped;
continue;
} else {
++n_pri;
if (last_qname != qname) {
++n_mapped;
last_qname = qname;
}
}
var exon = [];
if (is_bed) { // BED
exon.push([pos, parseInt(t[2])]);
} else if (aa) {
var tmp_exon = [], tmp = 0, tmp_st = 0;
while ((m = re_cigar.exec(cigar)) != null) {
var len = parseInt(m[1]), op = m[2];
if (op == 'N') {
tmp_exon.push([tmp_st, tmp]);
tmp_st = tmp + len, tmp += len;
} else if (op == 'U') {
tmp_exon.push([tmp_st, tmp + 1]);
tmp_st = tmp + len - 2, tmp += len;
} else if (op == 'V') {
tmp_exon.push([tmp_st, tmp + 2]);
tmp_st = tmp + len - 1, tmp += len;
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
tmp += len * 3;
} else if (op == 'F' || op == 'G') {
tmp += len;
}
}
tmp_exon.push([tmp_st, tmp]);
if (t[4] == '+') {
for (var i = 0; i < tmp_exon.length; ++i)
exon.push([pos + tmp_exon[i][0], pos + tmp_exon[i][1]]);
} else if (t[4] == '-') { // For protein-to-genome alignment, the coordinates are on the query strand. Need to flip them.
var glen = parseInt(t[8]) - parseInt(t[7]);
for (var i = tmp_exon.length - 1; i >= 0; --i)
exon.push([pos + (glen - tmp_exon[i][1]), pos + (glen - tmp_exon[i][0])]);
}
} else {
var tmp_st = pos;
while ((m = re_cigar.exec(cigar)) != null) {
var len = parseInt(m[1]), op = m[2];
if (op == 'N') {
exon.push([tmp_st, pos]);
tmp_st = pos + len, pos += len;
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
}
exon.push([tmp_st, pos]);
}
n_exon += exon.length;
var chr = anno[ctg_name];
if (chr != null) {
for (var i = 0; i < exon.length; ++i) {
if (eval_base) {
if (qexon[ctg_name] == null) qexon[ctg_name] = [];
qexon[ctg_name].push([exon[i][0], exon[i][1]]);
}
var o = Interval.find_ovlp(chr, exon[i][0], exon[i][1]);
if (o.length > 0) {
var hit = false;
for (var j = 0; j < o.length; ++j) {
var st_diff = exon[i][0] - o[j][0];
var en_diff = exon[i][1] - o[j][1];
if (st_diff < 0) st_diff = -st_diff;
if (en_diff < 0) en_diff = -en_diff;
if (st_diff <= l_fuzzy && en_diff <= l_fuzzy)
++n_exon_hit, hit = true;
if (hit) break;
}
if (print_ovlp) {
var type = hit? 'C' : 'P';
if (hit && print_err_only) continue;
var x = '[';
for (var j = 0; j < o.length; ++j) {
if (j) x += ', ';
x += '(' + o[j][0] + "," + o[j][1] + ')';
}
x += ']';
print(type, qname, i+1, ctg_name, exon[i][0], exon[i][1], x);
}
} else {
++n_exon_novel;
if (print_ovlp)
print('N', qname, i+1, ctg_name, exon[i][0], exon[i][1]);
}
}
} else {
n_exon_novel += exon.length;
}
}
file.close();
buf.destroy();
if (!print_ovlp) {
print("# unmapped reads: " + n_unmapped);
print("# mapped reads: " + n_mapped);
print("# primary alignments: " + n_pri);
print("# predicted exons: " + n_exon);
print("# non-overlapping exons: " + n_exon_novel);
print("# correct exons: " + n_exon_hit + " (" + (n_exon_hit / n_exon * 100).toFixed(2) + "%)");
}
function merge_and_index(ex) {
for (var chr in ex) {
var a = [];
e = ex[chr];
Interval.sort(e);
var st = e[0][0], en = e[0][1];
for (var i = 1; i < e.length; ++i) { // merge
if (e[i][0] > en) {
a.push([st, en]);
st = e[i][0], en = e[i][1];
} else {
en = en > e[i][1]? en : e[i][1];
}
}
a.push([st, en]);
Interval.index_end(a);
ex[chr] = a;
}
}
function cal_sn(a0, a1) {
var tot = 0, cov = 0;
for (var chr in a1) {
var e0 = a0[chr], e1 = a1[chr];
for (var i = 0; i < e1.length; ++i)
tot += e1[i][1] - e1[i][0];
if (e0 == null) continue;
for (var i = 0; i < e1.length; ++i) {
var o = Interval.find_ovlp(e0, e1[i][0], e1[i][1]);
for (var j = 0; j < o.length; ++j) { // this only works when there are no overlaps between intervals
var st = e1[i][0] > o[j][0]? e1[i][0] : o[j][0];
var en = e1[i][1] < o[j][1]? e1[i][1] : o[j][1];
cov += en - st;
}
}
}
return [tot, cov];
}
if (eval_base) {
warn("Computing base Sn and Sp...");
merge_and_index(qexon);
merge_and_index(anno);
var sn = cal_sn(qexon, anno);
var sp = cal_sn(anno, qexon);
print("Base Sn: " + sn[1] + " / " + sn[0] + " = " + (sn[1] / sn[0] * 100).toFixed(2) + "%");
print("Base Sp: " + sp[1] + " / " + sp[0] + " = " + (sp[1] / sp[0] * 100).toFixed(2) + "%");
}
}
// evaluate overlap sensitivity
function paf_ov_eval(args)
{
@ -2752,7 +3022,6 @@ function paf_misjoin(args)
function test_cen_point(cen, chr, x) {
var b = cen[chr];
if (b == null) return false;
print(x, b[0][0], b[0][1]);
for (var j = 0; j < b.length; ++j)
if (x >= b[j][0] && x < b[j][1])
return true;
@ -3362,6 +3631,7 @@ function main(args)
print(" mason2fq convert mason2-simulated SAM to FASTQ");
print(" pbsim2fq convert PBSIM-simulated MAF to FASTQ");
print(" junceval evaluate splice junction consistency with known annotations");
print(" exoneval evaluate exon-level consistency with known annotations");
print(" ov-eval evaluate read overlap sensitivity using read-to-ref mapping");
exit(1);
}
@ -3386,6 +3656,7 @@ function main(args)
else if (cmd == 'mason2fq') paf_mason2fq(args);
else if (cmd == 'pbsim2fq') paf_pbsim2fq(args);
else if (cmd == 'junceval') paf_junceval(args);
else if (cmd == 'exoneval') paf_exoneval(args);
else if (cmd == 'ov-eval') paf_ov_eval(args);
else if (cmd == 'vcfstat') paf_vcfstat(args);
else if (cmd == 'sveval') paf_sveval(args);

View File

@ -60,6 +60,8 @@ void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p);
void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p);
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac);

View File

@ -141,3 +141,58 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i
if (min.x != UINT64_MAX)
kv_push(mm128_t, km, *p, min);
}
void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p)
{
uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, smask = (1ULL<<2*smer) - 1, kmer[2] = {0,0};
int i, j, l, buf_pos, min_pos, kmer_span = 0;
tiny_queue_t tq;
assert(len > 0 && (smer > 0 && smer <= k) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
memset(&tq, 0, sizeof(tiny_queue_t));
kv_resize(mm128_t, km, *p, p->n + len/(k - smer));
for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
int c = seq_nt4_table[(uint8_t)str[i]];
if (c < 4) { // not an ambiguous base
int z;
if (is_hpc) {
int skip_len = 1;
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
for (skip_len = 2; i + skip_len < len; ++skip_len)
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
break;
i += skip_len - 1; // put $i at the end of the current homopolymer run
}
tq_push(&tq, skip_len);
kmer_span += skip_len;
if (tq.count > k) kmer_span -= tq_shift(&tq);
} else kmer_span = l + 1 < k? l + 1 : k;
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
z = kmer[0] < kmer[1]? 0 : 1; // strand
++l;
if (l >= k && kmer_span < 256) {
uint64_t x, min = UINT64_MAX;
x = hash64(kmer[z], mask);
for (j = 0; j <= k - smer; ++j) {
uint64_t y = x >> (j + j) & smask;
min = min < y? min : y;
}
if ((x & smask) == min) {
mm128_t t;
t.x = x << 8 | kmer_span;
t.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
kv_push(mm128_t, km, *p, t);
}
}
} else l = 0, tq.count = tq.front = 0, kmer_span = 0;
}
}
void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p)
{
if (is_syncmer) mm_sketch_syncmer(km, str, len, w, k, rid, is_hpc, p);
else mm_sketch(km, str, len, w, k, rid, is_hpc, p);
}