hybrid index实现seeding过程的初始代码

This commit is contained in:
zzh 2025-06-16 10:11:29 +08:00
parent b50360bb48
commit fb5279120b
17 changed files with 302 additions and 83 deletions

5
.vscode/launch.json vendored
View File

@ -18,11 +18,10 @@
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta",
"~/data/fastq/dataset/na12878_wes_144/1w_1.fq",
"~/data/fastq/dataset/na12878_wes_144/1w_2.fq",
"~/data/dataset/real/D3/n1.fq",
"~/data/dataset/real/D3/n2.fq",
"-o",
"/dev/null",
"-Z"
],
"cwd": "${workspaceFolder}", //
},

View File

@ -42,6 +42,8 @@
"ertindex.h": "c",
"ertseeding.h": "c",
"algorithm": "c",
"filesystem": "c"
"filesystem": "c",
"chrono": "c",
"queue": "c"
}
}

View File

@ -1,5 +1,5 @@
CC= gcc
CFLAGS= -g -Wall -Wno-unused-function -mavx2 #-O2
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= -DSHOW_PERF

2
bwa.h
View File

@ -95,7 +95,7 @@ extern "C" {
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size);
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size, int num_thread);
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);

View File

@ -684,6 +684,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
uint8_t *rseq = 0;
uint64_t *srt;
smem_aux_t *aux = (smem_aux_t*)buf;
kvec_t(int) qlenv;
kv_init(qlenv);
if (c->n == 0) return;
// get the max possible span
@ -763,6 +765,10 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name);
if (s->qbeg) { // left extension
// s-qbeg是seed左侧需要扩展的长度
//tdat[s->qbeg][tid]++;
kv_push(int, qlenv, s->qbeg);
fprintf(stderr, "%d,\n", s->qbeg);
int qle, tle, gtle, gscore;
tmp = s->rbeg - rmax[0];
#ifndef USE_AVX2
@ -807,6 +813,9 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
int qle, tle, qe, re, gtle, gscore, sc0 = a->score;
qe = s->qbeg + s->len;
re = s->rbeg + s->len - rmax[0];
kv_push(int, qlenv, l_query - qe);
fprintf(stderr, "%d,\n", l_query - qe);
// tdat[l_query - qe][tid]++;
assert(re >= 0);
for (i = 0; i < MAX_BAND_TRY; ++i) {
int prev = a->score;
@ -848,6 +857,21 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
a->frac_rep = c->frac_rep;
}
tdat[qlenv.n][tid] ++;
// 计算标准差
double avgl = 0, stdl = 0;
for (i=0; i<qlenv.n; ++i) {
avgl += qlenv.a[i];
}
avgl = avgl / qlenv.n;
for (i=0; i<qlenv.n; ++i) {
stdl += pow((avgl - qlenv.a[i]), 2);
}
stdl = sqrt(stdl);
kv_destroy(qlenv);
free(srt);
free(rseq);
}
@ -1304,16 +1328,20 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
// 1. first pass: find all SMEMs
// goto third_seed;
PROF_START(seed_1);
while (x < len) {
__sync_fetch_and_add(&gdat[0], 1);
while (x < len) {
if (seq[x] < 4) {
start_flag = 0;
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
//x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
x = fmt_smem_new(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
max_seed_len = MAX(max_seed_len, slen);
if (slen >= opt->min_seed_len) {
kv_push(bwtintv_t, smem->mem, *p);
//fprintf(gf[0], "seed: %ld\t%d\t%ld\t%ld\n", p->info >> 32, (uint32_t)p->info, (-(p->info >> 32) + (uint32_t)p->info), p->x[2]);
//fprintf(stderr, "seed: %ld\t%d\t%ld\t%ld\n", p->info >> 32, (uint32_t)p->info, (-(p->info >> 32) + (uint32_t)p->info), p->x[2]);
}
}
// break; // for test
@ -1322,30 +1350,40 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
if (start_flag) ++start_N_num;
}
}
PROF_END(tprof[T_SEED_1][tid], seed_1);
// && smem->mem.a[0].x[2] == 1
if (max_seed_len == len - start_N_num && smem->mem.n > 0 && smem->mem.a[0].x[2] == 1) {
__sync_fetch_and_add(&gdat[1], 1);
}
PROF_END(tprof[T_SEED_1][tid], seed_1);
//goto third_seed;
#ifdef FILTER_FULL_MATCH
if (max_seed_len == len - start_N_num) {
goto collect_intv_end;
}
#endif
// 2. second pass: find MEMs inside a long SMEM
PROF_START(seed_2);
//fprintf(gf[0], "second %ld\n", ++debug_num);
// 2. second pass: find MEMs inside a long SMEM
// fprintf(stderr, "%ld\n", debug_num++);
PROF_START(seed_2);
old_n = smem->mem.n;
for (k = 0; k < old_n; ++k) {
bwtintv_t *p = &smem->mem.a[k];
int start = p->info>>32, end = (int32_t)p->info;
if (end - start < split_len || p->x[2] > opt->split_width) continue;
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
for (i = 0; i < a->mem1.n; ++i) {
//fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
fmt_smem_new(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
// fprintf(stderr, "%ld\n", debug_num++);
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info >> 32);
if (slen >= opt->min_seed_len) {
kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
}
//fprintf(gf[0], "seed: %ld\t%d\t%ld\t%ld\n", p->info >> 32, (uint32_t)p->info, (-(p->info >> 32) + (uint32_t)p->info), p->x[2]);
}
}
}
PROF_END(tprof[T_SEED_2][tid], seed_2);
//fprintf(gf[0], "\n");
PROF_END(tprof[T_SEED_2][tid], seed_2);
//third_seed:
// 3. third pass: LAST-like
@ -1358,7 +1396,8 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
if (m.x[2] > 0) {
kv_push(bwtintv_t, smem->mem, m);
}
//fprintf(stderr, "seed_3: %ld %d %ld %ld\n", m.info >> 32, (uint32_t)m.info, (-(m.info >> 32) + (uint32_t)m.info), m.x[2]);
}
} else ++x;
}
}
@ -1369,6 +1408,11 @@ collect_intv_end:
#endif
// sort
ks_introsort(mem_intv, smem->mem.n, smem->mem.a);
// for (i = 0; i < smem->mem.n; ++i) {
// bwtintv_t *p = &smem->mem.a[i];
// fprintf(gf[0], "seed: %ld\t%d\t%ld\t%ld\n", p->info >> 32, (uint32_t)p->info, (-(p->info >> 32) + (uint32_t)p->info), p->x[2]);
// }
// fprintf(gf[0], "\n");
}
void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv, int tid)

18
bwt.c
View File

@ -62,7 +62,8 @@ static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inver
// 设置某一行的排序值sa的索引有效值从1开始0设置为-1, 小端模式)
void inline bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val)
{
const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组共享33个字节
//const bwtint_t block_idx = (k >> 3) * 33;
const bwtint_t block_idx = ((k >> 3) << 5) + (k>>3); // 8个数为一组共享33个字节
const int val_idx_in_block = k & 7;
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
bwtint_t *sa_addr = (bwtint_t *)(sa_arr + start_byte_idx);
@ -73,13 +74,20 @@ void inline bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val)
// 获取某一行的排序值(小端模式)
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k)
{
const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组共享33个字节
const int val_idx_in_block = k & 7;
// const bwtint_t block_idx = (k >> 3) * 33;
const bwtint_t block_idx = ((k >> 3) << 5) + (k >> 3); // 8个数为一组共享33个字节
const int val_idx_in_block = k & 7;
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
bwtint_t val = *(bwtint_t *)(sa_arr + start_byte_idx);
val = (val >> val_idx_in_block) & 8589934591;
return val;
}
bwtint_t bwt_get_sa_2(uint8_t *sa_arr, bwtint_t k) {
const bwtint_t start_byte = ((k << 5) + k) >> 3; // 存储这个sa数据的起始字节
bwtint_t val = *(bwtint_t *)(sa_arr + start_byte);
val = (val >> (k & 7)) & 8589934591;
return val;
}
// 获取kmer的fmt匹配信息
inline void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos)
@ -147,7 +155,7 @@ inline void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit
}
// bwt->bwt and bwt->occ must be precalculated
void bwt_cal_byte_sa(bwt_t *bwt, int intv)
void bwt_cal_byte_sa(bwt_t *bwt, int intv, int num_thread)
{
bwtint_t isa, sa, i, block_size; // S(isa) = sa isa是后缀数组的索引sa表示位置
double tmp_time, elapsed_time;
@ -184,7 +192,7 @@ void bwt_cal_byte_sa(bwt_t *bwt, int intv)
}
if (isa % intv == 0) bwt_set_sa(bwt->byte_sa, isa / intv, sa);
// bwt_set_sa(bwt->byte_sa, 0, (bwtint_t)-1); // 赋值成-1也没问题set_sa那里已经修正了
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // 2的33次方-1 before this line, bwt->sa[0] = bwt->seq_len
}
// bwt->bwt and bwt->occ must be precalculated

4
bwt.h
View File

@ -103,7 +103,7 @@ typedef struct {
typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)
#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)/* 8字节对齐*/
/* For general OCC_INTERVAL, the following is correct:
#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16])
@ -137,7 +137,7 @@ extern "C" {
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
void bwt_cal_sa(bwt_t *bwt, int intv);
void bwt_cal_byte_sa(bwt_t *bwt, int intv);
void bwt_cal_byte_sa(bwt_t *bwt, int intv, int num_thread);
void bwt_bwtupdate_core(bwt_t *bwt);

View File

@ -261,10 +261,14 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2bytesa" command
{
bwt_t *bwt;
int c, sa_intv = 32;
int c, sa_intv = 32, num_thread = 1;
while ((c = getopt(argc, argv, "i:")) >= 0) {
switch (c) {
case 'i': sa_intv = atoi(optarg); break;
case 't':
num_thread = atoi(optarg);
if (num_thread < 1) num_thread = 1;
break;
default: return 1;
}
}
@ -273,7 +277,7 @@ int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2bytesa" command
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
bwt_cal_byte_sa(bwt, sa_intv);
bwt_cal_byte_sa(bwt, sa_intv, num_thread);
bwt_dump_byte_sa(argv[optind+1], bwt);
bwt_destroy(bwt);
return 0;
@ -389,12 +393,12 @@ int bwa_index(int argc, char *argv[]) // the "index" command
strcpy(prefix, argv[optind]);
if (is_64) strcat(prefix, ".64");
}
bwa_idx_build(argv[optind], prefix, algo_type, block_size);
bwa_idx_build(argv[optind], prefix, algo_type, block_size, num_threads);
free(prefix);
return 0;
}
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size)
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size, int num_threads)
{
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
@ -475,7 +479,7 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
// create byte sa
t = clock();
bwt_cal_byte_sa(bwt, 4);
bwt_cal_byte_sa(bwt, 4, num_threads);
strcpy(str, prefix); strcat(str, ".bytesa");
bwt_dump_byte_sa(str, bwt);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);

View File

@ -13,6 +13,8 @@ FILE *gf[4] = {0},
*gft[4] = {0},
*gfi[4] = {0};
uint64_t debug_num = 0;
int open_qti_files()
{
char fn[1024] = {0};

View File

@ -6,6 +6,7 @@
Date : 2024/04/09
***********************************************************************************************/
#include <stdio.h>
#include <stdint.h>
////////////////// for debug and test //////////////////////////
@ -21,6 +22,8 @@
extern FILE *gf[4], *gfq[4], *gft[4], *gfi[4];
extern uint64_t debug_num;
int open_qti_files();
int open_debug_files();

View File

@ -344,6 +344,7 @@ static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
int main_mem(int argc, char *argv[])
{
open_debug_files();
#ifdef DEBUG_FILE_OUTPUT
open_debug_files();
// open_debug_files
@ -676,8 +677,9 @@ int main_mem(int argc, char *argv[])
#ifdef DEBUG_FILE_OUTPUT
close_files();
#endif
close_files();
return 0;
return 0;
}
int main_fastmap(int argc, char *argv[])

167
fmt_idx.c
View File

@ -1104,6 +1104,171 @@ fmt_smem_end:
return ret;
}
// 找smemseed(lm: long_smem)
int fmt_smem_new(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm,
bwtintv_v *mem, bwtintv_v *tmpvec) {
if (x == 0 || q[x - 1] > 3)
return fmt_smem_forward(fmt, len, q, x, min_intv, min_seed_len, mem);
// int flag = 0;
int i, ret = x+1, kmer_len;
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
bwtintv_t mt = {0};
uint32_t qbit = 0;
mem->n = 0;
if (q[x] > 3)
return x + 1;
if (min_intv < 1)
min_intv = 1; // the interval size should be at least 1
#define CHECK_INTV_UPDATE(iv, ov, end_pos) \
if (ov.x[2] != iv.x[2]) { \
if (ov.x[2] < min_intv) \
break; \
} \
iv = ov; \
iv.info = end_pos
// 通过反复用forward和backward搜索来找到每个SMEM直到不包含pivot即x
int start = x;
while (1) {
qbit = build_forward_kmer(&q[start], len - start, HASH_KMER_LEN, &kmer_len);
if (kmer_len < HASH_KMER_LEN) {
// 扩展到最远的位置
if (start < x)
break; // 遇到左侧N了
ret = start + kmer_len;
start = ret - HASH_KMER_LEN;
if (start < 0 || q[x-1] > 3)
break;
continue;
}
// forward search
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - 1); // 初始碱基位置
ik.info = start + kmer_len;
if (ik.x[2] < min_intv) {
if (start == x)
ret = ik.info;
if (start > 0) {
--start;
continue;
} else
break;
}
for (i = (int)ik.info; i + 1 < len; i += 2) {
if (q[i] < 4 && q[i + 1] < 4) {
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_UPDATE(ik, ok1, i + 1);
CHECK_INTV_UPDATE(ik, ok2, i + 2);
#if 0
// 在这里进行判断是否只有一个候选了
if (min_intv == 1 && ok2.x[2] == min_intv) {
direct_extend(fmt, len, q, start, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
if (mt.rm[0].qe <= x)
goto smem_end;
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
if (start == x)
ret = (uint32_t)mt.info;
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto smem_end;
i = mt.rm[0].qs - 1;
goto smem_next;
}
#endif
} else if (q[i] < 4) { // q[i+1] >= 4
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
CHECK_INTV_UPDATE(ik, ok1, i + 1);
goto smem_backward_search;
} else {
goto smem_backward_search;
}
}
for (; i == len - 1; ++i) { // 扩展到了最后一个碱基
if (q[i] < 4) {
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
CHECK_INTV_UPDATE(ik, ok1, i + 1);
} else {
goto smem_backward_search;
}
}
if (start == x) ret = ik.info;
if (ik.info <= x) // 不再包含pivot退出
goto smem_end;
// backward search
smem_backward_search:
#define CHECK_INTV_BREAK(ik, ok, spos) \
if (ok.x[2] < min_intv) { \
i = spos; \
goto smem_next; \
} \
ok.info = ik.info; \
ik = ok;
#define CHECK_INTV_END(ik, ok) \
if (ok.x[2] < min_intv) { \
CHECK_ADD_MEM(i + 1, ik, mem); \
goto smem_end; \
} \
ok.info = ik.info; \
ik = ok;
i = start - 1;
for (; i > 0; i -= 2) {
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
{
fmt_direct2_extend2(fmt, &ik, &ok1, &ok2, 1, q[i], q[i - 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_BREAK(ik, ok1, i);
CHECK_INTV_BREAK(ik, ok2, i-1);
} else if (q[i] < 4) // 只能扩展一个
{
fmt_direct_extend1(fmt, &ik, &ok1, 1, q[i]);
CHECK_INTV_END(ik, ok1);
--i; // 避免进行i-2
break;
} else { // 不能扩展
CHECK_ADD_MEM(i + 1, ik, mem);
goto smem_end;
}
}
for (; i == 0; --i) { // 扩展到了第一个碱基
if (q[i] < 4) {
fmt_direct_extend1(fmt, &ik, &ok1, 1, q[i]);
CHECK_INTV_BREAK(ik, ok1, i);
} else {
CHECK_ADD_MEM(i + 1, ik, mem);
goto smem_end;
}
}
if (i == -1) {
CHECK_ADD_MEM(i + 1, ik, mem);
goto smem_end;
}
smem_next:
ik.info |= (uint64_t)(i+1) << 32;
CHECK_ADD_MEM(i + 1, ik, mem);
if (start < 0 || q[start] > 3) {
break;
}
start = i;
}
smem_end:
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
// printf("%d\n", ret);
return ret;
}
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
{
int i, kmer_len, first_extend_len;
@ -1310,4 +1475,4 @@ bwtint_t fmt_sa_offset(const FMTIndex *fmt, bwtint_t k)
}
sa = (sa << 48) | (k / fmt->sa_intv);
return sa;
}
}

View File

@ -110,6 +110,7 @@ void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos);
void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos);
// 找smemseed
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec);
int fmt_smem_new(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec);
int fmt_smem_2(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec, uint32_v qev);
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);

View File

@ -2,10 +2,6 @@
#include <stdio.h>
#include "kstring.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
int ksprintf(kstring_t *s, const char *fmt, ...)
{
va_list ap;

View File

@ -23,10 +23,20 @@ tdat[0]: read nums
tdat[1]: seed-1 full match nums
*/
int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
int64_t t_sd[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0};
#endif
int64_t sum(int64_t *a, int len) {
int64_t res = 0;
int i = 0;
for (i=0; i<len; ++i) {
res += a[i];
}
return res;
}
int find_opt(uint64_t *a, int len, double *max, double *min, double *avg)
{
int i = 0;
@ -47,6 +57,7 @@ int display_stats(int nthreads)
{
#ifdef SHOW_PERF
double avg, max, min;
int i;
fprintf(stderr, "[steps in main_mem]\n");
fprintf(stderr, "time_parse_arg: %0.2lf s\n", gprof[G_PREPARE] * 1.0 / proc_freq);
@ -104,8 +115,18 @@ int display_stats(int nthreads)
fprintf(stderr, "time_ksw_loop: %0.2lf s\n", gprof[G_KSW_LOOP] * 1.0 / proc_freq);
fprintf(stderr, "time_ksw_end_loop: %0.2lf s\n", gprof[G_KSW_END_LOOP] * 1.0 / proc_freq);
fprintf(stderr, "seq num: %ld\n", gdat[0]);
fprintf(stderr, "full num: %ld\n", gdat[1]);
fprintf(stderr, "percent: %0.2lf%c\n", (double)gdat[1] / gdat[0] * 100, '%');
//for (i=0; i<LIM_THREAD_DATA_TYPE; ++i) {
for (i=1; i<=132; ++i) {
//fprintf(stderr, "len: %d, sum: %ld\n", i, sum(tdat[i], nthreads));
fprintf(stderr, "%ld,\n", sum(tdat[i], nthreads));
}
fprintf(stderr, "\n");
#endif
return 1;
}
}

View File

@ -16,8 +16,8 @@ Date : 2024/04/06
#define LIM_THREAD 128
#define LIM_THREAD_PROF_TYPE 128
#define LIM_GLOBAL_PROF_TYPE 128
#define LIM_THREAD_DATA_TYPE 128
#define LIM_GLOBAL_DATA_TYPE 128
#define LIM_THREAD_DATA_TYPE 256
#define LIM_GLOBAL_DATA_TYPE 256
#ifdef SHOW_PERF
extern uint64_t proc_freq;
@ -27,6 +27,7 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
#ifdef SHOW_DATA_PERF
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD];
extern int64_t t_sd[LIM_THREAD_DATA_TYPE][LIM_THREAD];
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE];
#endif
@ -89,4 +90,4 @@ enum
int display_stats(int);
#endif
#endif

61
run.sh
View File

@ -1,53 +1,24 @@
thread=64
thread=1
## d1 k18<=4 89%
#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq
#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq
#n_r1=~/data/fastq/dataset/na12878_wes_144/1w_1.fq
#n_r2=~/data/fastq/dataset/na12878_wes_144/1w_2.fq
#n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq
#n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq
#n_r1=~/data/fastq/dataset/na12878_wes_144/45m_r1.fq
#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq
#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz
## d2 <= 4 87%
#n_r1=~/data/fastq/dataset/na12878_wgs_101/na12878_r1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_101/na12878_r2.fq
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
#n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq
# d3 <= 4 77%
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
#n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz
## d4 <= 4 93%
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
#n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz
## d5 <= 4 80%
#n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
#n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq
n_r1=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq
n_r2=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq
n_r1=~/data/dataset/real/D1/n1.fq.gz
n_r2=~/data/dataset/real/D1/n2.fq.gz
#n_r1=~/data/dataset/real/D3/n1.fq.gz
#n_r2=~/data/dataset/real/D3/n2.fq.gz
#n_r1=~/data/dataset/real/D3/n1.fq
#n_r2=~/data/dataset/real/D3/n2.fq
#n_r1=~/data/t120w-1.fq
#n_r2=~/data/t120w-2.fq
#n_r1=~/data/1w-1.fq
#n_r2=~/data/1w-2.fq
reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta
#reference=~/reference/bwa/human_g1k_v37_decoy.fasta
#reference=~/data/reference/human_g1k_v37_decoy.fasta
#out=~/data1/out-u8-1.sam
#out=~/data1/out-i16.sam
#out=~/data1/fmt-out.sam
out=~/data/fastbwa.ert.sam
#out=/dev/null
#out=new-wgs.sam
out=/dev/null
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
$reference \
$n_r1 \
$n_r2 \
-o $out -2 -Z
-o $out -2 #> seed.txt #-Z