一些自动格式更改,添加一些注释等
This commit is contained in:
parent
f588745484
commit
ad177f2165
|
|
@ -1,3 +1,5 @@
|
|||
*.paf
|
||||
*.sam
|
||||
.cproject
|
||||
.project
|
||||
.*.swp
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Launch",
|
||||
"name": "read overlap",
|
||||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
|
|
@ -14,13 +14,31 @@
|
|||
"-x",
|
||||
"ava-ont",
|
||||
"-t",
|
||||
"1",
|
||||
"4",
|
||||
"/public/home/zzh/work/3gseq/TGM-2021YFF/Acinetobacter_pittii.fastq",
|
||||
"/public/home/zzh/work/3gseq/TGM-2021YFF/Acinetobacter_pittii.fastq",
|
||||
"-o",
|
||||
"reads.paf"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
},
|
||||
{
|
||||
"name": "mapping",
|
||||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/minimap2",
|
||||
"args": [
|
||||
"-ax",
|
||||
"map-ont",
|
||||
"-t",
|
||||
"1",
|
||||
"/public/home/zzh/work/3gseq/TGM-2021YFF/reads.fasta",
|
||||
"/public/home/zzh/work/3gseq/TGM-2021YFF/Acinetobacter_pittii.fastq",
|
||||
"-o",
|
||||
"aln.sam"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
{
|
||||
"files.associations": {
|
||||
"minimap.h": "c",
|
||||
"time.h": "c"
|
||||
"time.h": "c",
|
||||
"kalloc.h": "c"
|
||||
}
|
||||
}
|
||||
10
lchain.c
10
lchain.c
|
|
@ -5,7 +5,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "kalloc.h"
|
||||
#include "krmq.h"
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
extern int64_t get_mseconds();
|
||||
extern int64_t time_mg_lchain_dp,
|
||||
time_mg_chain_backtrack;
|
||||
|
|
@ -155,7 +155,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
|||
int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw;
|
||||
int64_t *p, i, j, max_ii, st = 0, n_iter = 0;
|
||||
uint64_t *u;
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||
#endif
|
||||
if (_u) *_u = 0, *n_u_ = 0;
|
||||
|
|
@ -211,17 +211,17 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
|||
max_ii = i;
|
||||
if (mmax_f < max_f) mmax_f = max_f;
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
int64_t tmp_inner_time = get_mseconds();
|
||||
#endif
|
||||
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, &n_u, &n_v);
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_inner_time;
|
||||
__sync_fetch_and_add(&time_mg_chain_backtrack, tmp_diff);
|
||||
#endif
|
||||
*n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
|
||||
kfree(km, p); kfree(km, f); kfree(km, t);
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_mg_lchain_dp, tmp_diff);
|
||||
#endif
|
||||
|
|
|
|||
12
main.c
12
main.c
|
|
@ -18,7 +18,7 @@ int64_t get_mseconds()
|
|||
}
|
||||
|
||||
// Variables that record elapsed run time
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
|
||||
int64_t time_mm_idx_reader_read,
|
||||
time_mm_map_file_frag,
|
||||
|
|
@ -34,7 +34,8 @@ int64_t time_mm_idx_reader_read,
|
|||
time_mg_lchain_dp = 0,
|
||||
time_collect_seed_hits_heap = 0,
|
||||
time_collect_seed_hits = 0,
|
||||
time_mg_chain_backtrack = 0;
|
||||
time_mg_chain_backtrack = 0,
|
||||
time_ksw_extd2_sse = 0;
|
||||
|
||||
#endif
|
||||
//////////////////////////////////
|
||||
|
|
@ -181,7 +182,7 @@ int main(int argc, char *argv[])
|
|||
mm_realtime0 = realtime();
|
||||
mm_set_opt(0, &ipt, &opt);
|
||||
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
|
||||
time_mm_idx_reader_read = 0;
|
||||
time_mm_map_file_frag = 0;
|
||||
|
|
@ -699,7 +700,7 @@ int main(int argc, char *argv[])
|
|||
fprintf(stderr, " %s", argv[i]);
|
||||
fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec; Peak RSS: %.3f GB\n", __func__, realtime() - mm_realtime0, cputime(), peakrss() / 1024.0 / 1024.0 / 1024.0);
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
fprintf(stderr, "time_mm_idx_reader_read: %f s\n", time_mm_idx_reader_read / 1000.0);
|
||||
|
|
@ -717,7 +718,8 @@ int main(int argc, char *argv[])
|
|||
fprintf(stderr, "time_collect_seed_hits: %f s\n", time_collect_seed_hits / 1000.0 / n_threads);
|
||||
fprintf(stderr, "time_mg_lchain_dp: %f s\n", time_mg_lchain_dp / 1000.0 / n_threads);
|
||||
fprintf(stderr, "time_mg_chain_backtrack: %f s\n", time_mg_chain_backtrack / 1000.0 / n_threads);
|
||||
|
||||
fprintf(stderr, "time_ksw_extd2_sse: %f s\n", time_ksw_extd2_sse / 1000.0 / n_threads);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
return 0;
|
||||
|
|
|
|||
44
map.c
44
map.c
|
|
@ -10,7 +10,7 @@
|
|||
#include "bseq.h"
|
||||
#include "khash.h"
|
||||
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
extern int64_t get_mseconds();
|
||||
extern int64_t time_mm_map_file_frag,
|
||||
time_map_work_for_block_1,
|
||||
|
|
@ -145,7 +145,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
|
|||
int64_t j, n_for = 0, n_rev = 0;
|
||||
mm_seed_t *m;
|
||||
mm128_t *a, *heap;
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||
#endif
|
||||
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
|
|
@ -217,7 +217,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
|
|||
memmove(a + n_for, a + (*n_a) - n_rev, n_rev * sizeof(mm128_t));
|
||||
*n_a = n_for + n_rev;
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_collect_seed_hits_heap, tmp_diff);
|
||||
#endif
|
||||
|
|
@ -230,14 +230,10 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
|
|||
int i, n_m;
|
||||
mm_seed_t *m;
|
||||
mm128_t *a;
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||
#endif
|
||||
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
#ifdef ANALYSIS_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_collect_seed_hits, tmp_diff);
|
||||
#endif
|
||||
a = (mm128_t *)kmalloc(km, *n_a * sizeof(mm128_t));
|
||||
for (i = 0, *n_a = 0; i < n_m; ++i)
|
||||
{
|
||||
|
|
@ -276,6 +272,10 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
|
|||
}
|
||||
kfree(km, m);
|
||||
radix_sort_128x(a, a + (*n_a));
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_collect_seed_hits, tmp_diff);
|
||||
#endif
|
||||
return a;
|
||||
}
|
||||
|
||||
|
|
@ -317,7 +317,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
mm_reg1_t *regs0;
|
||||
km_stat_t kmst;
|
||||
float chn_pen_gap, chn_pen_skip;
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||
#endif
|
||||
for (i = 0, qlen_sum = 0; i < n_segs; ++i)
|
||||
|
|
@ -333,13 +333,13 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
hash = __ac_Wang_hash(hash);
|
||||
|
||||
collect_minimizers(b->km, opt, mi, n_segs, qlens, seqs, &mv);
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_mm_map_frag_b1, tmp_diff);
|
||||
tmp_cur_time = get_mseconds();
|
||||
#endif
|
||||
if (opt->q_occ_frac > 0.0f)
|
||||
mm_seed_mz_flt(b->km, &mv, opt->mid_occ, opt->q_occ_frac);
|
||||
mm_seed_mz_flt(b->km, &mv, opt->mid_occ, opt->q_occ_frac); // 过滤掉出现次数太多的minimizer
|
||||
if (opt->flag & MM_F_HEAP_SORT)
|
||||
a = collect_seed_hits_heap(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
|
||||
else
|
||||
|
|
@ -352,7 +352,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x << 1 >> 33].name, (int32_t)a[i].x, "+-"[a[i].x >> 63], (int32_t)a[i].y, (int32_t)(a[i].y >> 32 & 0xff),
|
||||
i == 0 ? 0 : ((int32_t)a[i].y - (int32_t)a[i - 1].y) - ((int32_t)a[i].x - (int32_t)a[i - 1].x));
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_mm_map_frag_b2, tmp_diff);
|
||||
tmp_cur_time = get_mseconds();
|
||||
|
|
@ -387,7 +387,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
|
||||
chn_pen_gap, chn_pen_skip, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_mm_map_frag_b3, tmp_diff);
|
||||
tmp_cur_time = get_mseconds();
|
||||
|
|
@ -441,7 +441,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
}
|
||||
b->frag_gap = max_chain_gap_ref;
|
||||
b->rep_len = rep_len;
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_mm_map_frag_b4, tmp_diff);
|
||||
tmp_cur_time = get_mseconds();
|
||||
|
|
@ -465,7 +465,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
mm_est_err(mi, qlen_sum, n_regs0, regs0, a, n_mini_pos, mini_pos);
|
||||
n_regs0 = mm_filter_strand_retained(n_regs0, regs0);
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_mm_map_frag_b5, tmp_diff);
|
||||
tmp_cur_time = get_mseconds();
|
||||
|
|
@ -492,7 +492,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag & MM_F_CIGAR))
|
||||
mm_pair(b->km, max_chain_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs); // pairing
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_mm_map_frag_b6, tmp_diff);
|
||||
tmp_cur_time = get_mseconds();
|
||||
|
|
@ -566,7 +566,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
|
|||
fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq);
|
||||
t = realtime();
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||
#endif
|
||||
for (j = 0; j < s->n_seg[i]; ++j)
|
||||
|
|
@ -576,7 +576,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
|
|||
qlens[j] = s->seq[off + j].l_seq;
|
||||
qseqs[j] = s->seq[off + j].seq;
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_map_work_for_block_1, tmp_diff);
|
||||
tmp_cur_time = get_mseconds();
|
||||
|
|
@ -599,7 +599,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
|
|||
s->frag_gap[off + j] = b->frag_gap;
|
||||
}
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_map_work_for_block_2, tmp_diff);
|
||||
tmp_cur_time = get_mseconds();
|
||||
|
|
@ -618,7 +618,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
|
|||
r->rev = !r->rev;
|
||||
}
|
||||
}
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||
__sync_fetch_and_add(&time_map_work_for_block_3, tmp_diff);
|
||||
#endif
|
||||
|
|
@ -851,7 +851,7 @@ static mm_bseq_file_t **open_bseqs(int n, const char **fn)
|
|||
|
||||
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads)
|
||||
{
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
int64_t tmp_cur_time = get_mseconds();
|
||||
#endif
|
||||
int i, pl_threads;
|
||||
|
|
@ -878,7 +878,7 @@ int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_
|
|||
for (i = 0; i < pl.n_fp; ++i)
|
||||
mm_bseq_close(pl.fp[i]);
|
||||
free(pl.fp);
|
||||
#ifdef ANALYSIS_PERF
|
||||
#ifdef SHOW_PERF
|
||||
time_mm_map_file_frag += get_mseconds() - tmp_cur_time;
|
||||
#endif
|
||||
return 0;
|
||||
|
|
|
|||
723
minimap.h
723
minimap.h
|
|
@ -8,413 +8,430 @@
|
|||
#define MM_VERSION "2.26-r1175"
|
||||
|
||||
// Switch for enabling debug performance analysis output (run time and similar information)
|
||||
#define ANALYSIS_PERF 1
|
||||
#define SHOW_PERF 1
|
||||
|
||||
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
||||
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
||||
#define MM_F_CIGAR (0x004LL)
|
||||
#define MM_F_OUT_SAM (0x008LL)
|
||||
#define MM_F_NO_QUAL (0x010LL)
|
||||
#define MM_F_OUT_CG (0x020LL)
|
||||
#define MM_F_OUT_CS (0x040LL)
|
||||
#define MM_F_SPLICE (0x080LL) // splice mode
|
||||
#define MM_F_SPLICE_FOR (0x100LL) // match GT-AG
|
||||
#define MM_F_SPLICE_REV (0x200LL) // match CT-AC, the reverse complement of GT-AG
|
||||
#define MM_F_NO_LJOIN (0x400LL)
|
||||
#define MM_F_OUT_CS_LONG (0x800LL)
|
||||
#define MM_F_SR (0x1000LL)
|
||||
#define MM_F_FRAG_MODE (0x2000LL)
|
||||
#define MM_F_NO_PRINT_2ND (0x4000LL)
|
||||
#define MM_F_2_IO_THREADS (0x8000LL)
|
||||
#define MM_F_LONG_CIGAR (0x10000LL)
|
||||
#define MM_F_INDEPEND_SEG (0x20000LL)
|
||||
#define MM_F_SPLICE_FLANK (0x40000LL)
|
||||
#define MM_F_SOFTCLIP (0x80000LL)
|
||||
#define MM_F_FOR_ONLY (0x100000LL)
|
||||
#define MM_F_REV_ONLY (0x200000LL)
|
||||
#define MM_F_HEAP_SORT (0x400000LL)
|
||||
#define MM_F_ALL_CHAINS (0x800000LL)
|
||||
#define MM_F_OUT_MD (0x1000000LL)
|
||||
#define MM_F_COPY_COMMENT (0x2000000LL)
|
||||
#define MM_F_EQX (0x4000000LL) // use =/X instead of M
|
||||
#define MM_F_PAF_NO_HIT (0x8000000LL) // output unmapped reads to PAF
|
||||
#define MM_F_NO_END_FLT (0x10000000LL)
|
||||
#define MM_F_HARD_MLEVEL (0x20000000LL)
|
||||
#define MM_F_SAM_HIT_ONLY (0x40000000LL)
|
||||
#define MM_F_RMQ (0x80000000LL)
|
||||
#define MM_F_QSTRAND (0x100000000LL)
|
||||
#define MM_F_NO_INV (0x200000000LL)
|
||||
#define MM_F_NO_HASH_NAME (0x400000000LL)
|
||||
#define MM_F_SPLICE_OLD (0x800000000LL)
|
||||
#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for seqondary alignments using hard clipping
|
||||
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
||||
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
||||
#define MM_F_CIGAR (0x004LL)
|
||||
#define MM_F_OUT_SAM (0x008LL)
|
||||
#define MM_F_NO_QUAL (0x010LL)
|
||||
#define MM_F_OUT_CG (0x020LL)
|
||||
#define MM_F_OUT_CS (0x040LL)
|
||||
#define MM_F_SPLICE (0x080LL) // splice mode
|
||||
#define MM_F_SPLICE_FOR (0x100LL) // match GT-AG
|
||||
#define MM_F_SPLICE_REV (0x200LL) // match CT-AC, the reverse complement of GT-AG
|
||||
#define MM_F_NO_LJOIN (0x400LL)
|
||||
#define MM_F_OUT_CS_LONG (0x800LL)
|
||||
#define MM_F_SR (0x1000LL)
|
||||
#define MM_F_FRAG_MODE (0x2000LL)
|
||||
#define MM_F_NO_PRINT_2ND (0x4000LL)
|
||||
#define MM_F_2_IO_THREADS (0x8000LL)
|
||||
#define MM_F_LONG_CIGAR (0x10000LL)
|
||||
#define MM_F_INDEPEND_SEG (0x20000LL)
|
||||
#define MM_F_SPLICE_FLANK (0x40000LL)
|
||||
#define MM_F_SOFTCLIP (0x80000LL)
|
||||
#define MM_F_FOR_ONLY (0x100000LL)
|
||||
#define MM_F_REV_ONLY (0x200000LL)
|
||||
#define MM_F_HEAP_SORT (0x400000LL)
|
||||
#define MM_F_ALL_CHAINS (0x800000LL)
|
||||
#define MM_F_OUT_MD (0x1000000LL)
|
||||
#define MM_F_COPY_COMMENT (0x2000000LL)
|
||||
#define MM_F_EQX (0x4000000LL) // use =/X instead of M
|
||||
#define MM_F_PAF_NO_HIT (0x8000000LL) // output unmapped reads to PAF
|
||||
#define MM_F_NO_END_FLT (0x10000000LL)
|
||||
#define MM_F_HARD_MLEVEL (0x20000000LL)
|
||||
#define MM_F_SAM_HIT_ONLY (0x40000000LL)
|
||||
#define MM_F_RMQ (0x80000000LL)
|
||||
#define MM_F_QSTRAND (0x100000000LL)
|
||||
#define MM_F_NO_INV (0x200000000LL)
|
||||
#define MM_F_NO_HASH_NAME (0x400000000LL)
|
||||
#define MM_F_SPLICE_OLD (0x800000000LL)
|
||||
#define MM_F_SECONDARY_SEQ (0x1000000000LL) // output SEQ field for secondary alignments using hard clipping
|
||||
|
||||
#define MM_I_HPC 0x1
|
||||
#define MM_I_NO_SEQ 0x2
|
||||
#define MM_I_NO_NAME 0x4
|
||||
#define MM_I_HPC 0x1
|
||||
#define MM_I_NO_SEQ 0x2
|
||||
#define MM_I_NO_NAME 0x4
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
#define MM_MAX_SEG 255
|
||||
#define MM_MAX_SEG 255
|
||||
|
||||
#define MM_CIGAR_MATCH 0
|
||||
#define MM_CIGAR_INS 1
|
||||
#define MM_CIGAR_DEL 2
|
||||
#define MM_CIGAR_N_SKIP 3
|
||||
#define MM_CIGAR_SOFTCLIP 4
|
||||
#define MM_CIGAR_HARDCLIP 5
|
||||
#define MM_CIGAR_PADDING 6
|
||||
#define MM_CIGAR_EQ_MATCH 7
|
||||
#define MM_CIGAR_MATCH 0
|
||||
#define MM_CIGAR_INS 1
|
||||
#define MM_CIGAR_DEL 2
|
||||
#define MM_CIGAR_N_SKIP 3
|
||||
#define MM_CIGAR_SOFTCLIP 4
|
||||
#define MM_CIGAR_HARDCLIP 5
|
||||
#define MM_CIGAR_PADDING 6
|
||||
#define MM_CIGAR_EQ_MATCH 7
|
||||
#define MM_CIGAR_X_MISMATCH 8
|
||||
|
||||
#define MM_CIGAR_STR "MIDNSHP=XB"
|
||||
#define MM_CIGAR_STR "MIDNSHP=XB"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
// emulate 128-bit integers and arrays
|
||||
typedef struct { uint64_t x, y; } mm128_t;
|
||||
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
|
||||
// emulate 128-bit integers and arrays
|
||||
typedef struct
|
||||
{
|
||||
uint64_t x, y;
|
||||
} mm128_t;
|
||||
typedef struct
|
||||
{
|
||||
size_t n, m;
|
||||
mm128_t *a;
|
||||
} mm128_v;
|
||||
|
||||
// minimap2 index
|
||||
typedef struct {
|
||||
char *name; // name of the db sequence
|
||||
uint64_t offset; // offset in mm_idx_t::S
|
||||
uint32_t len; // length
|
||||
uint32_t is_alt;
|
||||
} mm_idx_seq_t;
|
||||
// minimap2 index
|
||||
typedef struct
|
||||
{
|
||||
char *name; // name of the db sequence
|
||||
uint64_t offset; // offset in mm_idx_t::S
|
||||
uint32_t len; // length
|
||||
uint32_t is_alt;
|
||||
} mm_idx_seq_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t b, w, k, flag;
|
||||
uint32_t n_seq; // number of reference sequences
|
||||
int32_t index;
|
||||
int32_t n_alt;
|
||||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
uint32_t *S; // 4-bit packed sequence
|
||||
struct mm_idx_bucket_s *B; // index (hidden)
|
||||
struct mm_idx_intv_s *I; // intervals (hidden)
|
||||
void *km, *h;
|
||||
} mm_idx_t;
|
||||
typedef struct
|
||||
{
|
||||
int32_t b, w, k, flag;
|
||||
uint32_t n_seq; // number of reference sequences
|
||||
int32_t index;
|
||||
int32_t n_alt;
|
||||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
uint32_t *S; // 4-bit packed sequence
|
||||
struct mm_idx_bucket_s *B; // index (hidden)
|
||||
struct mm_idx_intv_s *I; // intervals (hidden)
|
||||
void *km, *h;
|
||||
} mm_idx_t;
|
||||
|
||||
// minimap2 alignment
|
||||
typedef struct {
|
||||
uint32_t capacity; // the capacity of cigar[]
|
||||
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
|
||||
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
|
||||
uint32_t n_cigar; // number of cigar operations in cigar[]
|
||||
uint32_t cigar[];
|
||||
} mm_extra_t;
|
||||
// minimap2 alignment
|
||||
typedef struct
|
||||
{
|
||||
uint32_t capacity; // the capacity of cigar[]
|
||||
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
|
||||
uint32_t n_ambi : 30, trans_strand : 2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
|
||||
uint32_t n_cigar; // number of cigar operations in cigar[]
|
||||
uint32_t cigar[];
|
||||
} mm_extra_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t id; // ID for internal uses (see also parent below)
|
||||
int32_t cnt; // number of minimizers; if on the reverse strand
|
||||
int32_t rid; // reference index; if this is an alignment from inversion rescue
|
||||
int32_t score; // DP alignment score
|
||||
int32_t qs, qe, rs, re; // query start and end; reference start and end
|
||||
int32_t parent, subsc; // parent==id if primary; best alternate mapping score
|
||||
int32_t as; // offset in the a[] array (for internal uses only)
|
||||
int32_t mlen, blen; // seeded exact match length; seeded alignment block length
|
||||
int32_t n_sub; // number of suboptimal mappings
|
||||
int32_t score0; // initial chaining score (before chain merging/spliting)
|
||||
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:5;
|
||||
uint32_t hash;
|
||||
float div;
|
||||
mm_extra_t *p;
|
||||
} mm_reg1_t;
|
||||
// Per-region mapping record: query/reference interval, scores, flags and optional extra alignment data
typedef struct
|
||||
{
|
||||
int32_t id; // ID for internal uses (see also parent below)
|
||||
int32_t cnt; // number of minimizers; if on the reverse strand
|
||||
int32_t rid; // reference index; if this is an alignment from inversion rescue
|
||||
int32_t score; // DP alignment score
|
||||
int32_t qs, qe, rs, re; // query start and end; reference start and end
|
||||
int32_t parent, subsc; // parent==id if primary; best alternate mapping score
|
||||
int32_t as; // offset in the a[] array (for internal uses only)
|
||||
int32_t mlen, blen; // seeded exact match length; seeded alignment block length
|
||||
int32_t n_sub; // number of suboptimal mappings
|
||||
int32_t score0; // initial chaining score (before chain merging/splitting)
|
||||
uint32_t mapq : 8, split : 2, rev : 1, inv : 1, sam_pri : 1, proper_frag : 1, pe_thru : 1, seg_split : 1, seg_id : 8, split_inv : 1, is_alt : 1, strand_retained : 1, dummy : 5;
|
||||
uint32_t hash;
|
||||
float div;
|
||||
mm_extra_t *p;
|
||||
} mm_reg1_t;
|
||||
|
||||
// indexing and mapping options
|
||||
typedef struct {
|
||||
short k, w, flag, bucket_bits;
|
||||
int64_t mini_batch_size;
|
||||
uint64_t batch_size;
|
||||
} mm_idxopt_t;
|
||||
// indexing and mapping options
|
||||
typedef struct
|
||||
{
|
||||
short k, w, flag, bucket_bits;
|
||||
int64_t mini_batch_size;
|
||||
uint64_t batch_size;
|
||||
} mm_idxopt_t;
|
||||
|
||||
typedef struct {
|
||||
int64_t flag; // see MM_F_* macros
|
||||
int seed;
|
||||
int sdust_thres; // score threshold for SDUST; 0 to disable
|
||||
typedef struct
|
||||
{
|
||||
int64_t flag; // see MM_F_* macros
|
||||
int seed;
|
||||
int sdust_thres; // score threshold for SDUST; 0 to disable
|
||||
|
||||
int max_qlen; // max query length
|
||||
int max_qlen; // max query length
|
||||
|
||||
int bw, bw_long; // bandwidth
|
||||
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
|
||||
int max_frag_len;
|
||||
int max_chain_skip, max_chain_iter;
|
||||
int min_cnt; // min number of minimizers on each chain
|
||||
int min_chain_score; // min chaining score
|
||||
float chain_gap_scale;
|
||||
float chain_skip_scale;
|
||||
int rmq_size_cap, rmq_inner_dist;
|
||||
int rmq_rescue_size;
|
||||
float rmq_rescue_ratio;
|
||||
int bw, bw_long; // bandwidth
|
||||
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
|
||||
int max_frag_len;
|
||||
int max_chain_skip, max_chain_iter;
|
||||
int min_cnt; // min number of minimizers on each chain
|
||||
int min_chain_score; // min chaining score
|
||||
float chain_gap_scale;
|
||||
float chain_skip_scale;
|
||||
int rmq_size_cap, rmq_inner_dist;
|
||||
int rmq_rescue_size;
|
||||
float rmq_rescue_ratio;
|
||||
|
||||
float mask_level;
|
||||
int mask_len;
|
||||
float pri_ratio;
|
||||
int best_n; // top best_n chains are subjected to DP alignment
|
||||
float mask_level;
|
||||
int mask_len;
|
||||
float pri_ratio;
|
||||
int best_n; // top best_n chains are subjected to DP alignment
|
||||
|
||||
float alt_drop;
|
||||
float alt_drop;
|
||||
|
||||
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
||||
int transition; // transition mismatch score (A:G, C:T)
|
||||
int sc_ambi; // score when one or both bases are "N"
|
||||
int noncan; // cost of non-canonical splicing sites
|
||||
int junc_bonus;
|
||||
int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal
|
||||
int end_bonus;
|
||||
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
|
||||
int min_ksw_len;
|
||||
int anchor_ext_len, anchor_ext_shift;
|
||||
float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio
|
||||
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
||||
int transition; // transition mismatch score (A:G, C:T)
|
||||
int sc_ambi; // score when one or both bases are "N"
|
||||
int noncan; // cost of non-canonical splicing sites
|
||||
int junc_bonus;
|
||||
int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal
|
||||
int end_bonus;
|
||||
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
|
||||
int min_ksw_len;
|
||||
int anchor_ext_len, anchor_ext_shift;
|
||||
float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio
|
||||
|
||||
int rank_min_len;
|
||||
float rank_frac;
|
||||
int rank_min_len;
|
||||
float rank_frac;
|
||||
|
||||
int pe_ori, pe_bonus;
|
||||
int pe_ori, pe_bonus;
|
||||
|
||||
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
||||
float q_occ_frac;
|
||||
int32_t min_mid_occ, max_mid_occ;
|
||||
int32_t mid_occ; // ignore seeds with occurrences above this threshold
|
||||
int32_t max_occ, max_max_occ, occ_dist;
|
||||
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
|
||||
int64_t max_sw_mat;
|
||||
int64_t cap_kalloc;
|
||||
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
||||
float q_occ_frac;
|
||||
int32_t min_mid_occ, max_mid_occ;
|
||||
int32_t mid_occ; // ignore seeds with occurrences above this threshold
|
||||
int32_t max_occ, max_max_occ, occ_dist;
|
||||
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
|
||||
int64_t max_sw_mat;
|
||||
int64_t cap_kalloc;
|
||||
|
||||
const char *split_prefix;
|
||||
} mm_mapopt_t;
|
||||
const char *split_prefix;
|
||||
} mm_mapopt_t;
|
||||
|
||||
// index reader
|
||||
typedef struct {
|
||||
int is_idx, n_parts;
|
||||
int64_t idx_size;
|
||||
mm_idxopt_t opt;
|
||||
FILE *fp_out;
|
||||
union {
|
||||
struct mm_bseq_file_s *seq;
|
||||
FILE *idx;
|
||||
} fp;
|
||||
} mm_idx_reader_t;
|
||||
// index reader
|
||||
typedef struct
|
||||
{
|
||||
int is_idx, n_parts;
|
||||
int64_t idx_size;
|
||||
mm_idxopt_t opt;
|
||||
FILE *fp_out;
|
||||
union
|
||||
{
|
||||
struct mm_bseq_file_s *seq;
|
||||
FILE *idx;
|
||||
} fp;
|
||||
} mm_idx_reader_t;
|
||||
|
||||
// memory buffer for thread-local storage during mapping
|
||||
struct mm_tbuf_s {
|
||||
void *km;
|
||||
int rep_len, frag_gap;
|
||||
};
|
||||
// memory buffer for thread-local storage during mapping
|
||||
struct mm_tbuf_s
|
||||
{
|
||||
void *km;
|
||||
int rep_len, frag_gap;
|
||||
};
|
||||
|
||||
typedef struct mm_tbuf_s mm_tbuf_t;
|
||||
typedef struct mm_tbuf_s mm_tbuf_t;
|
||||
|
||||
// global variables
|
||||
extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag
|
||||
extern double mm_realtime0; // wall-clock timer
|
||||
// global variables
|
||||
extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag
|
||||
extern double mm_realtime0; // wall-clock timer
|
||||
|
||||
/**
|
||||
* Set default or preset parameters
|
||||
*
|
||||
* @param preset NULL to set all parameters as default; otherwise apply preset to affected parameters
|
||||
* @param io pointer to indexing parameters
|
||||
* @param mo pointer to mapping parameters
|
||||
*
|
||||
* @return 0 if success; -1 if _present_ unknown
|
||||
*/
|
||||
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
|
||||
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo);
|
||||
/**
|
||||
* Set default or preset parameters
|
||||
*
|
||||
* @param preset NULL to set all parameters as default; otherwise apply preset to affected parameters
|
||||
* @param io pointer to indexing parameters
|
||||
* @param mo pointer to mapping parameters
|
||||
*
|
||||
* @return 0 if success; -1 if _preset_ unknown
|
||||
*/
|
||||
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
|
||||
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo);
|
||||
|
||||
/**
|
||||
* Update mm_mapopt_t::mid_occ via mm_mapopt_t::mid_occ_frac
|
||||
*
|
||||
* If mm_mapopt_t::mid_occ is 0, this function sets it to a number such that no
|
||||
* more than mm_mapopt_t::mid_occ_frac of minimizers in the index have a higher
|
||||
* occurrence.
|
||||
*
|
||||
* @param opt mapping parameters
|
||||
* @param mi minimap2 index
|
||||
*/
|
||||
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
|
||||
/**
|
||||
* Update mm_mapopt_t::mid_occ via mm_mapopt_t::mid_occ_frac
|
||||
*
|
||||
* If mm_mapopt_t::mid_occ is 0, this function sets it to a number such that no
|
||||
* more than mm_mapopt_t::mid_occ_frac of minimizers in the index have a higher
|
||||
* occurrence.
|
||||
*
|
||||
* @param opt mapping parameters
|
||||
* @param mi minimap2 index
|
||||
*/
|
||||
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
|
||||
|
||||
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len);
|
||||
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len);
|
||||
|
||||
/**
|
||||
* Initialize an index reader
|
||||
*
|
||||
* @param fn index or fasta/fastq file name (this function tests the file type)
|
||||
* @param opt indexing parameters
|
||||
* @param fn_out if not NULL, write built index to this file
|
||||
*
|
||||
* @return an index reader on success; NULL if fail to open _fn_
|
||||
*/
|
||||
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out);
|
||||
/**
|
||||
* Initialize an index reader
|
||||
*
|
||||
* @param fn index or fasta/fastq file name (this function tests the file type)
|
||||
* @param opt indexing parameters
|
||||
* @param fn_out if not NULL, write built index to this file
|
||||
*
|
||||
* @return an index reader on success; NULL if fail to open _fn_
|
||||
*/
|
||||
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out);
|
||||
|
||||
/**
|
||||
* Read/build an index
|
||||
*
|
||||
* If the input file is an index file, this function reads one part of the
|
||||
* index and returns. If the input file is a sequence file (fasta or fastq),
|
||||
* this function constructs the index for about mm_idxopt_t::batch_size bases.
|
||||
* Importantly, for a huge collection of sequences, this function may only
|
||||
* return an index for part of sequences. It needs to be repeatedly called
|
||||
* to traverse the entire index/sequence file.
|
||||
*
|
||||
* @param r index reader
|
||||
* @param n_threads number of threads for constructing index
|
||||
*
|
||||
* @return an index on success; NULL if reaching the end of the input file
|
||||
*/
|
||||
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
|
||||
/**
|
||||
* Read/build an index
|
||||
*
|
||||
* If the input file is an index file, this function reads one part of the
|
||||
* index and returns. If the input file is a sequence file (fasta or fastq),
|
||||
* this function constructs the index for about mm_idxopt_t::batch_size bases.
|
||||
* Importantly, for a huge collection of sequences, this function may only
|
||||
* return an index for part of sequences. It needs to be repeatedly called
|
||||
* to traverse the entire index/sequence file.
|
||||
*
|
||||
* @param r index reader
|
||||
* @param n_threads number of threads for constructing index
|
||||
*
|
||||
* @return an index on success; NULL if reaching the end of the input file
|
||||
*/
|
||||
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
|
||||
|
||||
/**
|
||||
* Destroy/deallocate an index reader
|
||||
*
|
||||
* @param r index reader
|
||||
*/
|
||||
void mm_idx_reader_close(mm_idx_reader_t *r);
|
||||
/**
|
||||
* Destroy/deallocate an index reader
|
||||
*
|
||||
* @param r index reader
|
||||
*/
|
||||
void mm_idx_reader_close(mm_idx_reader_t *r);
|
||||
|
||||
int mm_idx_reader_eof(const mm_idx_reader_t *r);
|
||||
int mm_idx_reader_eof(const mm_idx_reader_t *r);
|
||||
|
||||
/**
|
||||
* Check whether the file contains a minimap2 index
|
||||
*
|
||||
* @param fn file name
|
||||
*
|
||||
* @return the file size if fn is an index file; 0 if fn is not.
|
||||
*/
|
||||
int64_t mm_idx_is_idx(const char *fn);
|
||||
/**
|
||||
* Check whether the file contains a minimap2 index
|
||||
*
|
||||
* @param fn file name
|
||||
*
|
||||
* @return the file size if fn is an index file; 0 if fn is not.
|
||||
*/
|
||||
int64_t mm_idx_is_idx(const char *fn);
|
||||
|
||||
/**
|
||||
* Load a part of an index
|
||||
*
|
||||
* Given a uni-part index, this function loads the entire index into memory.
|
||||
* Given a multi-part index, it loads one part only and places the file pointer
|
||||
* at the end of that part.
|
||||
*
|
||||
* @param fp pointer to FILE object
|
||||
*
|
||||
* @return minimap2 index read from fp
|
||||
*/
|
||||
mm_idx_t *mm_idx_load(FILE *fp);
|
||||
/**
|
||||
* Load a part of an index
|
||||
*
|
||||
* Given a uni-part index, this function loads the entire index into memory.
|
||||
* Given a multi-part index, it loads one part only and places the file pointer
|
||||
* at the end of that part.
|
||||
*
|
||||
* @param fp pointer to FILE object
|
||||
*
|
||||
* @return minimap2 index read from fp
|
||||
*/
|
||||
mm_idx_t *mm_idx_load(FILE *fp);
|
||||
|
||||
/**
|
||||
* Append an index (or one part of a full index) to file
|
||||
*
|
||||
* @param fp pointer to FILE object
|
||||
* @param mi minimap2 index
|
||||
*/
|
||||
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
|
||||
/**
|
||||
* Append an index (or one part of a full index) to file
|
||||
*
|
||||
* @param fp pointer to FILE object
|
||||
* @param mi minimap2 index
|
||||
*/
|
||||
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
|
||||
|
||||
/**
|
||||
* Create an index from strings in memory
|
||||
*
|
||||
* @param w minimizer window size
|
||||
* @param k minimizer k-mer size
|
||||
* @param is_hpc use HPC k-mer if true
|
||||
* @param bucket_bits number of bits for the first level of the hash table
|
||||
* @param n number of sequences
|
||||
* @param seq sequences in A/C/G/T
|
||||
* @param name sequence names; could be NULL
|
||||
*
|
||||
* @return minimap2 index
|
||||
*/
|
||||
mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name);
|
||||
/**
|
||||
* Create an index from strings in memory
|
||||
*
|
||||
* @param w minimizer window size
|
||||
* @param k minimizer k-mer size
|
||||
* @param is_hpc use HPC k-mer if true
|
||||
* @param bucket_bits number of bits for the first level of the hash table
|
||||
* @param n number of sequences
|
||||
* @param seq sequences in A/C/G/T
|
||||
* @param name sequence names; could be NULL
|
||||
*
|
||||
* @return minimap2 index
|
||||
*/
|
||||
mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name);
|
||||
|
||||
/**
|
||||
* Print index statistics to stderr
|
||||
*
|
||||
* @param idx minimap2 index
|
||||
*/
|
||||
void mm_idx_stat(const mm_idx_t *idx);
|
||||
/**
|
||||
* Print index statistics to stderr
|
||||
*
|
||||
* @param idx minimap2 index
|
||||
*/
|
||||
void mm_idx_stat(const mm_idx_t *idx);
|
||||
|
||||
/**
|
||||
* Destroy/deallocate an index
|
||||
*
|
||||
* @param mi minimap2 index
|
||||
*/
|
||||
void mm_idx_destroy(mm_idx_t *mi);
|
||||
/**
|
||||
* Destroy/deallocate an index
|
||||
*
|
||||
* @param mi minimap2 index
|
||||
*/
|
||||
void mm_idx_destroy(mm_idx_t *mi);
|
||||
|
||||
/**
|
||||
* Initialize a thread-local buffer for mapping
|
||||
*
|
||||
* Each mapping thread requires a buffer specific to the thread (see mm_map()
|
||||
* below). The primary purpose of this buffer is to reduce frequent heap
|
||||
* allocations across threads. A buffer shall not be used by two or more
|
||||
* threads.
|
||||
*
|
||||
* @return pointer to a thread-local buffer
|
||||
*/
|
||||
mm_tbuf_t *mm_tbuf_init(void);
|
||||
/**
|
||||
* Initialize a thread-local buffer for mapping
|
||||
*
|
||||
* Each mapping thread requires a buffer specific to the thread (see mm_map()
|
||||
* below). The primary purpose of this buffer is to reduce frequent heap
|
||||
* allocations across threads. A buffer shall not be used by two or more
|
||||
* threads.
|
||||
*
|
||||
* @return pointer to a thread-local buffer
|
||||
*/
|
||||
mm_tbuf_t *mm_tbuf_init(void);
|
||||
|
||||
/**
|
||||
* Destroy/deallocate a thread-local buffer for mapping
|
||||
*
|
||||
* @param b the buffer
|
||||
*/
|
||||
void mm_tbuf_destroy(mm_tbuf_t *b);
|
||||
/**
|
||||
* Destroy/deallocate a thread-local buffer for mapping
|
||||
*
|
||||
* @param b the buffer
|
||||
*/
|
||||
void mm_tbuf_destroy(mm_tbuf_t *b);
|
||||
|
||||
void *mm_tbuf_get_km(mm_tbuf_t *b);
|
||||
void *mm_tbuf_get_km(mm_tbuf_t *b);
|
||||
|
||||
/**
|
||||
* Align a query sequence against an index
|
||||
*
|
||||
* This function possibly finds multiple alignments of the query sequence.
|
||||
* The returned array and the mm_reg1_t::p field of each element are allocated
|
||||
* with malloc().
|
||||
*
|
||||
* @param mi minimap2 index
|
||||
* @param l_seq length of the query sequence
|
||||
* @param seq the query sequence
|
||||
* @param n_regs number of hits (out)
|
||||
* @param b thread-local buffer; two mm_map() calls shall not use one buffer at the same time!
|
||||
* @param opt mapping parameters
|
||||
* @param name query name, used for all-vs-all overlapping and debugging
|
||||
*
|
||||
* @return an array of hits which need to be deallocated with free() together
|
||||
* with mm_reg1_t::p of each element. The size is written to _n_regs_.
|
||||
*/
|
||||
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
|
||||
/**
|
||||
* Align a query sequence against an index
|
||||
*
|
||||
* This function possibly finds multiple alignments of the query sequence.
|
||||
* The returned array and the mm_reg1_t::p field of each element are allocated
|
||||
* with malloc().
|
||||
*
|
||||
* @param mi minimap2 index
|
||||
* @param l_seq length of the query sequence
|
||||
* @param seq the query sequence
|
||||
* @param n_regs number of hits (out)
|
||||
* @param b thread-local buffer; two mm_map() calls shall not use one buffer at the same time!
|
||||
* @param opt mapping parameters
|
||||
* @param name query name, used for all-vs-all overlapping and debugging
|
||||
*
|
||||
* @return an array of hits which need to be deallocated with free() together
|
||||
* with mm_reg1_t::p of each element. The size is written to _n_regs_.
|
||||
*/
|
||||
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
|
||||
|
||||
void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname);
|
||||
void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname);
|
||||
|
||||
/**
|
||||
* Align a fasta/fastq file and print alignments to stdout
|
||||
*
|
||||
* @param idx minimap2 index
|
||||
* @param fn fasta/fastq file name
|
||||
* @param opt mapping parameters
|
||||
* @param n_threads number of threads
|
||||
*
|
||||
* @return 0 on success; -1 if _fn_ can't be read
|
||||
*/
|
||||
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);
|
||||
/**
|
||||
* Align a fasta/fastq file and print alignments to stdout
|
||||
*
|
||||
* @param idx minimap2 index
|
||||
* @param fn fasta/fastq file name
|
||||
* @param opt mapping parameters
|
||||
* @param n_threads number of threads
|
||||
*
|
||||
* @return 0 on success; -1 if _fn_ can't be read
|
||||
*/
|
||||
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);
|
||||
|
||||
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
|
||||
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
|
||||
|
||||
/**
|
||||
* Generate the cs tag (new in 2.12)
|
||||
*
|
||||
* @param km memory blocks; set to NULL if unsure
|
||||
* @param buf buffer to write the cs/MD tag; typically NULL on the first call
|
||||
* @param max_len max length of the buffer; typically set to 0 on the first call
|
||||
* @param mi index
|
||||
* @param r alignment
|
||||
* @param seq query sequence
|
||||
* @param no_iden true to use : instead of =
|
||||
*
|
||||
* @return the length of cs
|
||||
*/
|
||||
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
|
||||
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);
|
||||
/**
|
||||
* Generate the cs tag (new in 2.12)
|
||||
*
|
||||
* @param km memory blocks; set to NULL if unsure
|
||||
* @param buf buffer to write the cs/MD tag; typically NULL on the first call
|
||||
* @param max_len max length of the buffer; typically set to 0 on the first call
|
||||
* @param mi index
|
||||
* @param r alignment
|
||||
* @param seq query sequence
|
||||
* @param no_iden true to use : instead of =
|
||||
*
|
||||
* @return the length of cs
|
||||
*/
|
||||
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
|
||||
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);
|
||||
|
||||
// query sequence name and sequence in the minimap2 index
|
||||
int mm_idx_index_name(mm_idx_t *mi);
|
||||
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
|
||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
||||
// query sequence name and sequence in the minimap2 index
|
||||
int mm_idx_index_name(mm_idx_t *mi);
|
||||
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
|
||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
||||
|
||||
int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
|
||||
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
|
||||
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
|
||||
int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
|
||||
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
|
||||
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
|
||||
|
||||
// deprecated APIs for backward compatibility
|
||||
void mm_mapopt_init(mm_mapopt_t *opt);
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
|
||||
// deprecated APIs for backward compatibility
|
||||
void mm_mapopt_init(mm_mapopt_t *opt);
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
102
seed.c
102
seed.c
|
|
@ -6,13 +6,16 @@ void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac)
|
|||
{
|
||||
mm128_t *a;
|
||||
size_t i, j, st;
|
||||
if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return;
|
||||
if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0)
|
||||
return;
|
||||
a = Kmalloc(km, mm128_t, mv->n);
|
||||
for (i = 0; i < mv->n; ++i)
|
||||
a[i].x = mv->a[i].x, a[i].y = i;
|
||||
radix_sort_128x(a, a + mv->n);
|
||||
for (st = 0, i = 1; i <= mv->n; ++i) {
|
||||
if (i == mv->n || a[i].x != a[st].x) {
|
||||
for (st = 0, i = 1; i <= mv->n; ++i)
|
||||
{
|
||||
if (i == mv->n || a[i].x != a[st].x)
|
||||
{
|
||||
int32_t cnt = i - st;
|
||||
if (cnt > q_occ_max && cnt > mv->n * q_occ_frac)
|
||||
for (j = st; j < i; ++j)
|
||||
|
|
@ -32,20 +35,24 @@ mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv,
|
|||
mm_seed_t *m;
|
||||
size_t i;
|
||||
int32_t k;
|
||||
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
|
||||
for (i = k = 0; i < mv->n; ++i) {
|
||||
m = (mm_seed_t *)kmalloc(km, mv->n * sizeof(mm_seed_t)); // 为每一个minimizer开辟一个mm_seed_t
|
||||
for (i = k = 0; i < mv->n; ++i)
|
||||
{
|
||||
const uint64_t *cr;
|
||||
mm_seed_t *q;
|
||||
mm128_t *p = &mv->a[i];
|
||||
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
||||
int t;
|
||||
cr = mm_idx_get(mi, p->x>>8, &t);
|
||||
if (t == 0) continue;
|
||||
int t; // t表示hash值的低32位,表示啥?
|
||||
cr = mm_idx_get(mi, p->x >> 8, &t); // cr是hash值的高32位,代表位置
|
||||
if (t == 0)
|
||||
continue;
|
||||
q = &m[k++];
|
||||
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
||||
q->is_tandem = q->flt = 0;
|
||||
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
||||
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
||||
if (i > 0 && p->x >> 8 == mv->a[i - 1].x >> 8)
|
||||
q->is_tandem = 1;
|
||||
if (i < mv->n - 1 && p->x >> 8 == mv->a[i + 1].x >> 8)
|
||||
q->is_tandem = 1;
|
||||
}
|
||||
*n_m_ = k;
|
||||
return m;
|
||||
|
|
@ -55,37 +62,48 @@ mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv,
|
|||
|
||||
void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
|
||||
{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
|
||||
extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
|
||||
extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
|
||||
extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t *);
|
||||
extern void ks_heapmake_uint64_t(size_t n, uint64_t *);
|
||||
int32_t i, last0, m;
|
||||
uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
|
||||
|
||||
if (n == 0 || n == 1) return;
|
||||
if (n == 0 || n == 1)
|
||||
return;
|
||||
for (i = m = 0; i < n; ++i)
|
||||
if (a[i].n > max_occ) ++m;
|
||||
if (m == 0) return; // no high-frequency k-mers; do nothing
|
||||
for (i = 0, last0 = -1; i <= n; ++i) {
|
||||
if (i == n || a[i].n <= max_occ) {
|
||||
if (i - last0 > 1) {
|
||||
int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1;
|
||||
int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
|
||||
if (a[i].n > max_occ)
|
||||
++m;
|
||||
if (m == 0)
|
||||
return; // no high-frequency k-mers; do nothing
|
||||
for (i = 0, last0 = -1; i <= n; ++i)
|
||||
{
|
||||
if (i == n || a[i].n <= max_occ)
|
||||
{
|
||||
if (i - last0 > 1)
|
||||
{
|
||||
int32_t ps = last0 < 0 ? 0 : (uint32_t)a[last0].q_pos >> 1;
|
||||
int32_t pe = i == n ? len : (uint32_t)a[i].q_pos >> 1;
|
||||
int32_t j, k, st = last0 + 1, en = i;
|
||||
int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
|
||||
if (max_high_occ > 0) {
|
||||
if (max_high_occ > 0)
|
||||
{
|
||||
if (max_high_occ > MAX_MAX_HIGH_OCC)
|
||||
max_high_occ = MAX_MAX_HIGH_OCC;
|
||||
for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
|
||||
b[k] = (uint64_t)a[j].n<<32 | j;
|
||||
b[k] = (uint64_t)a[j].n << 32 | j;
|
||||
ks_heapmake_uint64_t(k, b); // initialize the binomial heap
|
||||
for (; j < en; ++j) { // if there are more, choose top max_high_occ
|
||||
if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap
|
||||
b[0] = (uint64_t)a[j].n<<32 | j;
|
||||
for (; j < en; ++j)
|
||||
{ // if there are more, choose top max_high_occ
|
||||
if (a[j].n < (int32_t)(b[0] >> 32))
|
||||
{ // then update the heap
|
||||
b[0] = (uint64_t)a[j].n << 32 | j;
|
||||
ks_heapdown_uint64_t(0, k, b);
|
||||
}
|
||||
}
|
||||
for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
|
||||
for (j = 0; j < k; ++j)
|
||||
a[(uint32_t)b[j]].flt = 1;
|
||||
}
|
||||
for (j = st; j < en; ++j) a[j].flt ^= 1;
|
||||
for (j = st; j < en; ++j)
|
||||
a[j].flt ^= 1;
|
||||
for (j = st; j < en; ++j)
|
||||
if (a[j].n > max_max_occ)
|
||||
a[j].flt = 1;
|
||||
|
|
@ -101,27 +119,37 @@ mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int ma
|
|||
size_t i;
|
||||
mm_seed_t *m;
|
||||
*n_mini_pos = 0;
|
||||
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
|
||||
*mini_pos = (uint64_t *)kmalloc(km, mv->n * sizeof(uint64_t));
|
||||
m = mm_seed_collect_all(km, mi, mv, &n_m0);
|
||||
if (dist > 0 && max_max_occ > max_occ) {
|
||||
if (dist > 0 && max_max_occ > max_occ)
|
||||
{
|
||||
mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = 0; i < n_m0; ++i)
|
||||
if (m[i].n > max_occ)
|
||||
m[i].flt = 1;
|
||||
}
|
||||
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
|
||||
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i)
|
||||
{
|
||||
mm_seed_t *q = &m[i];
|
||||
//fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
|
||||
if (q->flt) {
|
||||
// fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
|
||||
if (q->flt)
|
||||
{
|
||||
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
|
||||
if (st > rep_en) {
|
||||
if (st > rep_en)
|
||||
{
|
||||
*rep_len += rep_en - rep_st;
|
||||
rep_st = st, rep_en = en;
|
||||
} else rep_en = en;
|
||||
} else {
|
||||
}
|
||||
else
|
||||
rep_en = en;
|
||||
}
|
||||
else
|
||||
{
|
||||
*n_a += q->n;
|
||||
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
|
||||
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span << 32 | q->q_pos >> 1;
|
||||
m[n_m++] = *q;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
126
sketch.c
126
sketch.c
|
|
@ -7,23 +7,22 @@
|
|||
#include "mmpriv.h"
|
||||
|
||||
unsigned char seq_nt4_table[256] = {
|
||||
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
|
||||
};
|
||||
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
|
||||
|
||||
static inline uint64_t hash64(uint64_t key, uint64_t mask)
|
||||
{
|
||||
|
|
@ -37,7 +36,8 @@ static inline uint64_t hash64(uint64_t key, uint64_t mask)
|
|||
return key;
|
||||
}
|
||||
|
||||
typedef struct { // a simplified version of kdq
|
||||
typedef struct
|
||||
{ // a simplified version of kdq
|
||||
int front, count;
|
||||
int a[32];
|
||||
} tiny_queue_t;
|
||||
|
|
@ -50,7 +50,8 @@ static inline void tq_push(tiny_queue_t *q, int x)
|
|||
static inline int tq_shift(tiny_queue_t *q)
|
||||
{
|
||||
int x;
|
||||
if (q->count == 0) return -1;
|
||||
if (q->count == 0)
|
||||
return -1;
|
||||
x = q->a[q->front++];
|
||||
q->front &= 0x1f;
|
||||
--q->count;
|
||||
|
|
@ -76,24 +77,28 @@ static inline int tq_shift(tiny_queue_t *q)
|
|||
*/
|
||||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p)
|
||||
{
|
||||
uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
|
||||
uint64_t shift1 = 2 * (k - 1), mask = (1ULL << 2 * k) - 1, kmer[2] = {0, 0};
|
||||
int i, j, l, buf_pos, min_pos, kmer_span = 0;
|
||||
mm128_t buf[256], min = { UINT64_MAX, UINT64_MAX };
|
||||
mm128_t buf[256], min = {UINT64_MAX, UINT64_MAX};
|
||||
tiny_queue_t tq;
|
||||
|
||||
assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
|
||||
memset(buf, 0xff, w * 16);
|
||||
memset(&tq, 0, sizeof(tiny_queue_t));
|
||||
kv_resize(mm128_t, km, *p, p->n + len/w);
|
||||
kv_resize(mm128_t, km, *p, p->n + len / w); // 扩充p,将新生成len/w个minimizer
|
||||
|
||||
for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
|
||||
for (i = l = buf_pos = min_pos = 0; i < len; ++i)
|
||||
{
|
||||
int c = seq_nt4_table[(uint8_t)str[i]];
|
||||
mm128_t info = { UINT64_MAX, UINT64_MAX };
|
||||
if (c < 4) { // not an ambiguous base
|
||||
mm128_t info = {UINT64_MAX, UINT64_MAX};
|
||||
if (c < 4)
|
||||
{ // not an ambiguous base
|
||||
int z;
|
||||
if (is_hpc) {
|
||||
if (is_hpc)
|
||||
{
|
||||
int skip_len = 1;
|
||||
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
|
||||
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c)
|
||||
{
|
||||
for (skip_len = 2; i + skip_len < len; ++skip_len)
|
||||
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
|
||||
break;
|
||||
|
|
@ -101,42 +106,63 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i
|
|||
}
|
||||
tq_push(&tq, skip_len);
|
||||
kmer_span += skip_len;
|
||||
if (tq.count > k) kmer_span -= tq_shift(&tq);
|
||||
} else kmer_span = l + 1 < k? l + 1 : k;
|
||||
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
||||
kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
|
||||
if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
|
||||
z = kmer[0] < kmer[1]? 0 : 1; // strand // kmer的strand到底是什么意思?为什么通过比较就能确定正反?
|
||||
if (tq.count > k)
|
||||
kmer_span -= tq_shift(&tq);
|
||||
}
|
||||
else
|
||||
kmer_span = l + 1 < k ? l + 1 : k;
|
||||
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
||||
kmer[1] = (kmer[1] >> 2) | (3ULL ^ c) << shift1; // reverse k-mer
|
||||
if (kmer[0] == kmer[1])
|
||||
continue; // skip "symmetric k-mers" as we don't know it strand
|
||||
z = kmer[0] < kmer[1] ? 0 : 1; // strand // 选取小的那个kmer,kmer的strand到底是什么意思?为什么通过比较就能确定正反?
|
||||
++l;
|
||||
if (l >= k && kmer_span < 256) {
|
||||
if (l >= k && kmer_span < 256)
|
||||
{
|
||||
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
|
||||
info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
|
||||
info.y = (uint64_t)rid << 32 | (uint32_t)i << 1 | z;
|
||||
}
|
||||
} else l = 0, tq.count = tq.front = 0, kmer_span = 0;
|
||||
}
|
||||
else
|
||||
l = 0, tq.count = tq.front = 0, kmer_span = 0;
|
||||
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
|
||||
if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
|
||||
if (l == w + k - 1 && min.x != UINT64_MAX)
|
||||
{ // special case for the first window - because identical k-mers are not stored yet
|
||||
for (j = buf_pos + 1; j < w; ++j)
|
||||
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
||||
if (min.x == buf[j].x && buf[j].y != min.y)
|
||||
kv_push(mm128_t, km, *p, buf[j]);
|
||||
for (j = 0; j < buf_pos; ++j)
|
||||
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
||||
if (min.x == buf[j].x && buf[j].y != min.y)
|
||||
kv_push(mm128_t, km, *p, buf[j]);
|
||||
}
|
||||
if (info.x <= min.x) { // a new minimum; then write the old min
|
||||
if (l >= w + k && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
|
||||
if (info.x <= min.x)
|
||||
{ // a new minimum; then write the old min
|
||||
if (l >= w + k && min.x != UINT64_MAX)
|
||||
kv_push(mm128_t, km, *p, min);
|
||||
min = info, min_pos = buf_pos;
|
||||
} else if (buf_pos == min_pos) { // old min has moved outside the window
|
||||
if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
|
||||
}
|
||||
else if (buf_pos == min_pos)
|
||||
{ // old min has moved outside the window
|
||||
if (l >= w + k - 1 && min.x != UINT64_MAX)
|
||||
kv_push(mm128_t, km, *p, min);
|
||||
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
|
||||
if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
|
||||
if (min.x >= buf[j].x)
|
||||
min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
|
||||
for (j = 0; j <= buf_pos; ++j)
|
||||
if (min.x >= buf[j].x) min = buf[j], min_pos = j;
|
||||
if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
|
||||
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
|
||||
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
||||
if (min.x >= buf[j].x)
|
||||
min = buf[j], min_pos = j; // 如果有多个min相同,取离当前位置最近的
|
||||
if (l >= w + k - 1 && min.x != UINT64_MAX) // 往回找相同值的kmer,放进p里
|
||||
{ // write identical k-mers
|
||||
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
|
||||
if (min.x == buf[j].x && min.y != buf[j].y)
|
||||
kv_push(mm128_t, km, *p, buf[j]);
|
||||
for (j = 0; j <= buf_pos; ++j)
|
||||
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
||||
if (min.x == buf[j].x && min.y != buf[j].y)
|
||||
kv_push(mm128_t, km, *p, buf[j]);
|
||||
}
|
||||
}
|
||||
if (++buf_pos == w) buf_pos = 0;
|
||||
if (++buf_pos == w)
|
||||
buf_pos = 0;
|
||||
}
|
||||
if (min.x != UINT64_MAX)
|
||||
kv_push(mm128_t, km, *p, min);
|
||||
|
|
|
|||
Loading…
Reference in New Issue