hybrid index实现seeding过程的初始代码
This commit is contained in:
parent
b50360bb48
commit
fb5279120b
|
|
@ -18,11 +18,10 @@
|
|||
"-R",
|
||||
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta",
|
||||
"~/data/fastq/dataset/na12878_wes_144/1w_1.fq",
|
||||
"~/data/fastq/dataset/na12878_wes_144/1w_2.fq",
|
||||
"~/data/dataset/real/D3/n1.fq",
|
||||
"~/data/dataset/real/D3/n2.fq",
|
||||
"-o",
|
||||
"/dev/null",
|
||||
"-Z"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
},
|
||||
|
|
|
|||
|
|
@ -42,6 +42,8 @@
|
|||
"ertindex.h": "c",
|
||||
"ertseeding.h": "c",
|
||||
"algorithm": "c",
|
||||
"filesystem": "c"
|
||||
"filesystem": "c",
|
||||
"chrono": "c",
|
||||
"queue": "c"
|
||||
}
|
||||
}
|
||||
2
Makefile
2
Makefile
|
|
@ -1,5 +1,5 @@
|
|||
CC= gcc
|
||||
CFLAGS= -g -Wall -Wno-unused-function -mavx2 #-O2
|
||||
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2
|
||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||
|
||||
SHOW_PERF= -DSHOW_PERF
|
||||
|
|
|
|||
2
bwa.h
2
bwa.h
|
|
@ -95,7 +95,7 @@ extern "C" {
|
|||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
|
||||
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size);
|
||||
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size, int num_thread);
|
||||
|
||||
char *bwa_idx_infer_prefix(const char *hint);
|
||||
bwt_t *bwa_idx_load_bwt(const char *hint);
|
||||
|
|
|
|||
64
bwamem.c
64
bwamem.c
|
|
@ -684,6 +684,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
|||
uint8_t *rseq = 0;
|
||||
uint64_t *srt;
|
||||
smem_aux_t *aux = (smem_aux_t*)buf;
|
||||
kvec_t(int) qlenv;
|
||||
kv_init(qlenv);
|
||||
|
||||
if (c->n == 0) return;
|
||||
// get the max possible span
|
||||
|
|
@ -763,6 +765,10 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
|||
|
||||
if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name);
|
||||
if (s->qbeg) { // left extension
|
||||
// s-qbeg是seed左侧需要扩展的长度
|
||||
//tdat[s->qbeg][tid]++;
|
||||
kv_push(int, qlenv, s->qbeg);
|
||||
fprintf(stderr, "%d,\n", s->qbeg);
|
||||
int qle, tle, gtle, gscore;
|
||||
tmp = s->rbeg - rmax[0];
|
||||
#ifndef USE_AVX2
|
||||
|
|
@ -807,6 +813,9 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
|||
int qle, tle, qe, re, gtle, gscore, sc0 = a->score;
|
||||
qe = s->qbeg + s->len;
|
||||
re = s->rbeg + s->len - rmax[0];
|
||||
kv_push(int, qlenv, l_query - qe);
|
||||
fprintf(stderr, "%d,\n", l_query - qe);
|
||||
// tdat[l_query - qe][tid]++;
|
||||
assert(re >= 0);
|
||||
for (i = 0; i < MAX_BAND_TRY; ++i) {
|
||||
int prev = a->score;
|
||||
|
|
@ -848,6 +857,21 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
|||
|
||||
a->frac_rep = c->frac_rep;
|
||||
}
|
||||
|
||||
tdat[qlenv.n][tid] ++;
|
||||
// 计算标准差
|
||||
double avgl = 0, stdl = 0;
|
||||
for (i=0; i<qlenv.n; ++i) {
|
||||
avgl += qlenv.a[i];
|
||||
}
|
||||
avgl = avgl / qlenv.n;
|
||||
for (i=0; i<qlenv.n; ++i) {
|
||||
stdl += pow((avgl - qlenv.a[i]), 2);
|
||||
}
|
||||
stdl = sqrt(stdl);
|
||||
|
||||
kv_destroy(qlenv);
|
||||
|
||||
free(srt);
|
||||
free(rseq);
|
||||
}
|
||||
|
|
@ -1304,16 +1328,20 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
// 1. first pass: find all SMEMs
|
||||
// goto third_seed;
|
||||
PROF_START(seed_1);
|
||||
while (x < len) {
|
||||
__sync_fetch_and_add(&gdat[0], 1);
|
||||
while (x < len) {
|
||||
if (seq[x] < 4) {
|
||||
start_flag = 0;
|
||||
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
|
||||
//x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
|
||||
x = fmt_smem_new(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||
max_seed_len = MAX(max_seed_len, slen);
|
||||
if (slen >= opt->min_seed_len) {
|
||||
kv_push(bwtintv_t, smem->mem, *p);
|
||||
//fprintf(gf[0], "seed: %ld\t%d\t%ld\t%ld\n", p->info >> 32, (uint32_t)p->info, (-(p->info >> 32) + (uint32_t)p->info), p->x[2]);
|
||||
//fprintf(stderr, "seed: %ld\t%d\t%ld\t%ld\n", p->info >> 32, (uint32_t)p->info, (-(p->info >> 32) + (uint32_t)p->info), p->x[2]);
|
||||
}
|
||||
}
|
||||
// break; // for test
|
||||
|
|
@ -1322,30 +1350,40 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
if (start_flag) ++start_N_num;
|
||||
}
|
||||
}
|
||||
PROF_END(tprof[T_SEED_1][tid], seed_1);
|
||||
// && smem->mem.a[0].x[2] == 1
|
||||
if (max_seed_len == len - start_N_num && smem->mem.n > 0 && smem->mem.a[0].x[2] == 1) {
|
||||
__sync_fetch_and_add(&gdat[1], 1);
|
||||
}
|
||||
PROF_END(tprof[T_SEED_1][tid], seed_1);
|
||||
//goto third_seed;
|
||||
#ifdef FILTER_FULL_MATCH
|
||||
if (max_seed_len == len - start_N_num) {
|
||||
goto collect_intv_end;
|
||||
}
|
||||
#endif
|
||||
// 2. second pass: find MEMs inside a long SMEM
|
||||
PROF_START(seed_2);
|
||||
//fprintf(gf[0], "second %ld\n", ++debug_num);
|
||||
// 2. second pass: find MEMs inside a long SMEM
|
||||
// fprintf(stderr, "%ld\n", debug_num++);
|
||||
PROF_START(seed_2);
|
||||
old_n = smem->mem.n;
|
||||
for (k = 0; k < old_n; ++k) {
|
||||
bwtintv_t *p = &smem->mem.a[k];
|
||||
int start = p->info>>32, end = (int32_t)p->info;
|
||||
if (end - start < split_len || p->x[2] > opt->split_width) continue;
|
||||
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
//fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
|
||||
fmt_smem_new(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
|
||||
// fprintf(stderr, "%ld\n", debug_num++);
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
int slen = (uint32_t)p->info - (p->info >> 32);
|
||||
if (slen >= opt->min_seed_len) {
|
||||
kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
|
||||
}
|
||||
//fprintf(gf[0], "seed: %ld\t%d\t%ld\t%ld\n", p->info >> 32, (uint32_t)p->info, (-(p->info >> 32) + (uint32_t)p->info), p->x[2]);
|
||||
}
|
||||
}
|
||||
}
|
||||
PROF_END(tprof[T_SEED_2][tid], seed_2);
|
||||
//fprintf(gf[0], "\n");
|
||||
PROF_END(tprof[T_SEED_2][tid], seed_2);
|
||||
|
||||
//third_seed:
|
||||
// 3. third pass: LAST-like
|
||||
|
|
@ -1358,7 +1396,8 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||||
if (m.x[2] > 0) {
|
||||
kv_push(bwtintv_t, smem->mem, m);
|
||||
}
|
||||
//fprintf(stderr, "seed_3: %ld %d %ld %ld\n", m.info >> 32, (uint32_t)m.info, (-(m.info >> 32) + (uint32_t)m.info), m.x[2]);
|
||||
}
|
||||
} else ++x;
|
||||
}
|
||||
}
|
||||
|
|
@ -1369,6 +1408,11 @@ collect_intv_end:
|
|||
#endif
|
||||
// sort
|
||||
ks_introsort(mem_intv, smem->mem.n, smem->mem.a);
|
||||
// for (i = 0; i < smem->mem.n; ++i) {
|
||||
// bwtintv_t *p = &smem->mem.a[i];
|
||||
// fprintf(gf[0], "seed: %ld\t%d\t%ld\t%ld\n", p->info >> 32, (uint32_t)p->info, (-(p->info >> 32) + (uint32_t)p->info), p->x[2]);
|
||||
// }
|
||||
// fprintf(gf[0], "\n");
|
||||
}
|
||||
|
||||
void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv, int tid)
|
||||
|
|
|
|||
18
bwt.c
18
bwt.c
|
|
@ -62,7 +62,8 @@ static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inver
|
|||
// 设置某一行的排序值,sa的索引有效值从1开始,(0设置为-1, 小端模式)
|
||||
void inline bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val)
|
||||
{
|
||||
const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组,共享33个字节
|
||||
//const bwtint_t block_idx = (k >> 3) * 33;
|
||||
const bwtint_t block_idx = ((k >> 3) << 5) + (k>>3); // 8个数为一组,共享33个字节
|
||||
const int val_idx_in_block = k & 7;
|
||||
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
|
||||
bwtint_t *sa_addr = (bwtint_t *)(sa_arr + start_byte_idx);
|
||||
|
|
@ -73,13 +74,20 @@ void inline bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val)
|
|||
// 获取某一行的排序值(小端模式)
|
||||
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k)
|
||||
{
|
||||
const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组,共享33个字节
|
||||
const int val_idx_in_block = k & 7;
|
||||
// const bwtint_t block_idx = (k >> 3) * 33;
|
||||
const bwtint_t block_idx = ((k >> 3) << 5) + (k >> 3); // 8个数为一组,共享33个字节
|
||||
const int val_idx_in_block = k & 7;
|
||||
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
|
||||
bwtint_t val = *(bwtint_t *)(sa_arr + start_byte_idx);
|
||||
val = (val >> val_idx_in_block) & 8589934591;
|
||||
return val;
|
||||
}
|
||||
bwtint_t bwt_get_sa_2(uint8_t *sa_arr, bwtint_t k) {
|
||||
const bwtint_t start_byte = ((k << 5) + k) >> 3; // 存储这个sa数据的起始字节
|
||||
bwtint_t val = *(bwtint_t *)(sa_arr + start_byte);
|
||||
val = (val >> (k & 7)) & 8589934591;
|
||||
return val;
|
||||
}
|
||||
|
||||
// 获取kmer的fmt匹配信息
|
||||
inline void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos)
|
||||
|
|
@ -147,7 +155,7 @@ inline void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit
|
|||
}
|
||||
|
||||
// bwt->bwt and bwt->occ must be precalculated
|
||||
void bwt_cal_byte_sa(bwt_t *bwt, int intv)
|
||||
void bwt_cal_byte_sa(bwt_t *bwt, int intv, int num_thread)
|
||||
{
|
||||
bwtint_t isa, sa, i, block_size; // S(isa) = sa isa是后缀数组的索引,sa表示位置
|
||||
double tmp_time, elapsed_time;
|
||||
|
|
@ -184,7 +192,7 @@ void bwt_cal_byte_sa(bwt_t *bwt, int intv)
|
|||
}
|
||||
if (isa % intv == 0) bwt_set_sa(bwt->byte_sa, isa / intv, sa);
|
||||
// bwt_set_sa(bwt->byte_sa, 0, (bwtint_t)-1); // 赋值成-1也没问题,set_sa那里已经修正了
|
||||
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
|
||||
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // 2的33次方-1 before this line, bwt->sa[0] = bwt->seq_len
|
||||
}
|
||||
|
||||
// bwt->bwt and bwt->occ must be precalculated
|
||||
|
|
|
|||
4
bwt.h
4
bwt.h
|
|
@ -103,7 +103,7 @@ typedef struct {
|
|||
|
||||
typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
|
||||
|
||||
#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)
|
||||
#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)/* 8字节对齐*/
|
||||
|
||||
/* For general OCC_INTERVAL, the following is correct:
|
||||
#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16])
|
||||
|
|
@ -137,7 +137,7 @@ extern "C" {
|
|||
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
|
||||
void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
|
||||
void bwt_cal_sa(bwt_t *bwt, int intv);
|
||||
void bwt_cal_byte_sa(bwt_t *bwt, int intv);
|
||||
void bwt_cal_byte_sa(bwt_t *bwt, int intv, int num_thread);
|
||||
|
||||
void bwt_bwtupdate_core(bwt_t *bwt);
|
||||
|
||||
|
|
|
|||
14
bwtindex.c
14
bwtindex.c
|
|
@ -261,10 +261,14 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
|
|||
int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2bytesa" command
|
||||
{
|
||||
bwt_t *bwt;
|
||||
int c, sa_intv = 32;
|
||||
int c, sa_intv = 32, num_thread = 1;
|
||||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'i': sa_intv = atoi(optarg); break;
|
||||
case 't':
|
||||
num_thread = atoi(optarg);
|
||||
if (num_thread < 1) num_thread = 1;
|
||||
break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
|
|
@ -273,7 +277,7 @@ int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2bytesa" command
|
|||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
bwt_cal_byte_sa(bwt, sa_intv);
|
||||
bwt_cal_byte_sa(bwt, sa_intv, num_thread);
|
||||
bwt_dump_byte_sa(argv[optind+1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
|
|
@ -389,12 +393,12 @@ int bwa_index(int argc, char *argv[]) // the "index" command
|
|||
strcpy(prefix, argv[optind]);
|
||||
if (is_64) strcat(prefix, ".64");
|
||||
}
|
||||
bwa_idx_build(argv[optind], prefix, algo_type, block_size);
|
||||
bwa_idx_build(argv[optind], prefix, algo_type, block_size, num_threads);
|
||||
free(prefix);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size)
|
||||
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size, int num_threads)
|
||||
{
|
||||
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
|
||||
|
||||
|
|
@ -475,7 +479,7 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
|
|||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
// create byte sa
|
||||
t = clock();
|
||||
bwt_cal_byte_sa(bwt, 4);
|
||||
bwt_cal_byte_sa(bwt, 4, num_threads);
|
||||
strcpy(str, prefix); strcat(str, ".bytesa");
|
||||
bwt_dump_byte_sa(str, bwt);
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
|
|
|
|||
2
debug.c
2
debug.c
|
|
@ -13,6 +13,8 @@ FILE *gf[4] = {0},
|
|||
*gft[4] = {0},
|
||||
*gfi[4] = {0};
|
||||
|
||||
uint64_t debug_num = 0;
|
||||
|
||||
int open_qti_files()
|
||||
{
|
||||
char fn[1024] = {0};
|
||||
|
|
|
|||
3
debug.h
3
debug.h
|
|
@ -6,6 +6,7 @@
|
|||
Date : 2024/04/09
|
||||
***********************************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
////////////////// for debug and test //////////////////////////
|
||||
|
||||
|
|
@ -21,6 +22,8 @@
|
|||
|
||||
extern FILE *gf[4], *gfq[4], *gft[4], *gfi[4];
|
||||
|
||||
extern uint64_t debug_num;
|
||||
|
||||
int open_qti_files();
|
||||
|
||||
int open_debug_files();
|
||||
|
|
|
|||
|
|
@ -344,6 +344,7 @@ static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
|
|||
|
||||
int main_mem(int argc, char *argv[])
|
||||
{
|
||||
open_debug_files();
|
||||
#ifdef DEBUG_FILE_OUTPUT
|
||||
open_debug_files();
|
||||
// open_debug_files
|
||||
|
|
@ -676,8 +677,9 @@ int main_mem(int argc, char *argv[])
|
|||
#ifdef DEBUG_FILE_OUTPUT
|
||||
close_files();
|
||||
#endif
|
||||
close_files();
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main_fastmap(int argc, char *argv[])
|
||||
|
|
|
|||
167
fmt_idx.c
167
fmt_idx.c
|
|
@ -1104,6 +1104,171 @@ fmt_smem_end:
|
|||
return ret;
|
||||
}
|
||||
|
||||
// 找smem(seed)(lm: long_smem)
|
||||
int fmt_smem_new(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm,
|
||||
bwtintv_v *mem, bwtintv_v *tmpvec) {
|
||||
if (x == 0 || q[x - 1] > 3)
|
||||
return fmt_smem_forward(fmt, len, q, x, min_intv, min_seed_len, mem);
|
||||
|
||||
// int flag = 0;
|
||||
int i, ret = x+1, kmer_len;
|
||||
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
|
||||
bwtintv_t mt = {0};
|
||||
uint32_t qbit = 0;
|
||||
mem->n = 0;
|
||||
if (q[x] > 3)
|
||||
return x + 1;
|
||||
|
||||
if (min_intv < 1)
|
||||
min_intv = 1; // the interval size should be at least 1
|
||||
|
||||
|
||||
#define CHECK_INTV_UPDATE(iv, ov, end_pos) \
|
||||
if (ov.x[2] != iv.x[2]) { \
|
||||
if (ov.x[2] < min_intv) \
|
||||
break; \
|
||||
} \
|
||||
iv = ov; \
|
||||
iv.info = end_pos
|
||||
|
||||
// 通过反复用forward和backward搜索来找到每个SMEM,直到不包含pivot(即x)
|
||||
int start = x;
|
||||
while (1) {
|
||||
qbit = build_forward_kmer(&q[start], len - start, HASH_KMER_LEN, &kmer_len);
|
||||
if (kmer_len < HASH_KMER_LEN) {
|
||||
// 扩展到最远的位置
|
||||
if (start < x)
|
||||
break; // 遇到左侧N了
|
||||
ret = start + kmer_len;
|
||||
start = ret - HASH_KMER_LEN;
|
||||
if (start < 0 || q[x-1] > 3)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
// forward search
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - 1); // 初始碱基位置
|
||||
ik.info = start + kmer_len;
|
||||
if (ik.x[2] < min_intv) {
|
||||
if (start == x)
|
||||
ret = ik.info;
|
||||
if (start > 0) {
|
||||
--start;
|
||||
continue;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
|
||||
for (i = (int)ik.info; i + 1 < len; i += 2) {
|
||||
if (q[i] < 4 && q[i + 1] < 4) {
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
CHECK_INTV_UPDATE(ik, ok1, i + 1);
|
||||
CHECK_INTV_UPDATE(ik, ok2, i + 2);
|
||||
#if 0
|
||||
// 在这里进行判断是否只有一个候选了
|
||||
if (min_intv == 1 && ok2.x[2] == min_intv) {
|
||||
direct_extend(fmt, len, q, start, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
|
||||
if (mt.rm[0].qe <= x)
|
||||
goto smem_end;
|
||||
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
||||
if (start == x)
|
||||
ret = (uint32_t)mt.info;
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto smem_end;
|
||||
i = mt.rm[0].qs - 1;
|
||||
goto smem_next;
|
||||
}
|
||||
#endif
|
||||
} else if (q[i] < 4) { // q[i+1] >= 4
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
CHECK_INTV_UPDATE(ik, ok1, i + 1);
|
||||
goto smem_backward_search;
|
||||
} else {
|
||||
goto smem_backward_search;
|
||||
}
|
||||
}
|
||||
for (; i == len - 1; ++i) { // 扩展到了最后一个碱基
|
||||
if (q[i] < 4) {
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
CHECK_INTV_UPDATE(ik, ok1, i + 1);
|
||||
} else {
|
||||
goto smem_backward_search;
|
||||
}
|
||||
}
|
||||
if (start == x) ret = ik.info;
|
||||
if (ik.info <= x) // 不再包含pivot,退出
|
||||
goto smem_end;
|
||||
|
||||
// backward search
|
||||
smem_backward_search:
|
||||
|
||||
#define CHECK_INTV_BREAK(ik, ok, spos) \
|
||||
if (ok.x[2] < min_intv) { \
|
||||
i = spos; \
|
||||
goto smem_next; \
|
||||
} \
|
||||
ok.info = ik.info; \
|
||||
ik = ok;
|
||||
|
||||
#define CHECK_INTV_END(ik, ok) \
|
||||
if (ok.x[2] < min_intv) { \
|
||||
CHECK_ADD_MEM(i + 1, ik, mem); \
|
||||
goto smem_end; \
|
||||
} \
|
||||
ok.info = ik.info; \
|
||||
ik = ok;
|
||||
|
||||
i = start - 1;
|
||||
for (; i > 0; i -= 2) {
|
||||
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
|
||||
{
|
||||
fmt_direct2_extend2(fmt, &ik, &ok1, &ok2, 1, q[i], q[i - 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
|
||||
CHECK_INTV_BREAK(ik, ok1, i);
|
||||
CHECK_INTV_BREAK(ik, ok2, i-1);
|
||||
} else if (q[i] < 4) // 只能扩展一个
|
||||
{
|
||||
fmt_direct_extend1(fmt, &ik, &ok1, 1, q[i]);
|
||||
CHECK_INTV_END(ik, ok1);
|
||||
--i; // 避免进行i-2
|
||||
break;
|
||||
} else { // 不能扩展
|
||||
CHECK_ADD_MEM(i + 1, ik, mem);
|
||||
goto smem_end;
|
||||
}
|
||||
}
|
||||
for (; i == 0; --i) { // 扩展到了第一个碱基
|
||||
if (q[i] < 4) {
|
||||
fmt_direct_extend1(fmt, &ik, &ok1, 1, q[i]);
|
||||
CHECK_INTV_BREAK(ik, ok1, i);
|
||||
} else {
|
||||
CHECK_ADD_MEM(i + 1, ik, mem);
|
||||
goto smem_end;
|
||||
}
|
||||
}
|
||||
if (i == -1) {
|
||||
CHECK_ADD_MEM(i + 1, ik, mem);
|
||||
goto smem_end;
|
||||
}
|
||||
|
||||
smem_next:
|
||||
ik.info |= (uint64_t)(i+1) << 32;
|
||||
CHECK_ADD_MEM(i + 1, ik, mem);
|
||||
if (start < 0 || q[start] > 3) {
|
||||
break;
|
||||
}
|
||||
start = i;
|
||||
}
|
||||
|
||||
smem_end:
|
||||
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
||||
// printf("%d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
|
||||
{
|
||||
int i, kmer_len, first_extend_len;
|
||||
|
|
@ -1310,4 +1475,4 @@ bwtint_t fmt_sa_offset(const FMTIndex *fmt, bwtint_t k)
|
|||
}
|
||||
sa = (sa << 48) | (k / fmt->sa_intv);
|
||||
return sa;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -110,6 +110,7 @@ void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos);
|
|||
void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos);
|
||||
// 找smem(seed)
|
||||
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec);
|
||||
int fmt_smem_new(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec);
|
||||
int fmt_smem_2(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec, uint32_v qev);
|
||||
|
||||
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
|
||||
|
|
|
|||
|
|
@ -2,10 +2,6 @@
|
|||
#include <stdio.h>
|
||||
#include "kstring.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
int ksprintf(kstring_t *s, const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
|
|
|||
23
profiling.c
23
profiling.c
|
|
@ -23,10 +23,20 @@ tdat[0]: read nums
|
|||
tdat[1]: seed-1 full match nums
|
||||
*/
|
||||
int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
|
||||
int64_t t_sd[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
|
||||
int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0};
|
||||
|
||||
#endif
|
||||
|
||||
int64_t sum(int64_t *a, int len) {
|
||||
int64_t res = 0;
|
||||
int i = 0;
|
||||
for (i=0; i<len; ++i) {
|
||||
res += a[i];
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
int find_opt(uint64_t *a, int len, double *max, double *min, double *avg)
|
||||
{
|
||||
int i = 0;
|
||||
|
|
@ -47,6 +57,7 @@ int display_stats(int nthreads)
|
|||
{
|
||||
#ifdef SHOW_PERF
|
||||
double avg, max, min;
|
||||
int i;
|
||||
|
||||
fprintf(stderr, "[steps in main_mem]\n");
|
||||
fprintf(stderr, "time_parse_arg: %0.2lf s\n", gprof[G_PREPARE] * 1.0 / proc_freq);
|
||||
|
|
@ -104,8 +115,18 @@ int display_stats(int nthreads)
|
|||
fprintf(stderr, "time_ksw_loop: %0.2lf s\n", gprof[G_KSW_LOOP] * 1.0 / proc_freq);
|
||||
fprintf(stderr, "time_ksw_end_loop: %0.2lf s\n", gprof[G_KSW_END_LOOP] * 1.0 / proc_freq);
|
||||
|
||||
fprintf(stderr, "seq num: %ld\n", gdat[0]);
|
||||
fprintf(stderr, "full num: %ld\n", gdat[1]);
|
||||
fprintf(stderr, "percent: %0.2lf%c\n", (double)gdat[1] / gdat[0] * 100, '%');
|
||||
|
||||
//for (i=0; i<LIM_THREAD_DATA_TYPE; ++i) {
|
||||
for (i=1; i<=132; ++i) {
|
||||
//fprintf(stderr, "len: %d, sum: %ld\n", i, sum(tdat[i], nthreads));
|
||||
fprintf(stderr, "%ld,\n", sum(tdat[i], nthreads));
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,8 +16,8 @@ Date : 2024/04/06
|
|||
#define LIM_THREAD 128
|
||||
#define LIM_THREAD_PROF_TYPE 128
|
||||
#define LIM_GLOBAL_PROF_TYPE 128
|
||||
#define LIM_THREAD_DATA_TYPE 128
|
||||
#define LIM_GLOBAL_DATA_TYPE 128
|
||||
#define LIM_THREAD_DATA_TYPE 256
|
||||
#define LIM_GLOBAL_DATA_TYPE 256
|
||||
|
||||
#ifdef SHOW_PERF
|
||||
extern uint64_t proc_freq;
|
||||
|
|
@ -27,6 +27,7 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
|
|||
|
||||
#ifdef SHOW_DATA_PERF
|
||||
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD];
|
||||
extern int64_t t_sd[LIM_THREAD_DATA_TYPE][LIM_THREAD];
|
||||
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE];
|
||||
#endif
|
||||
|
||||
|
|
@ -89,4 +90,4 @@ enum
|
|||
|
||||
int display_stats(int);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
|||
61
run.sh
61
run.sh
|
|
@ -1,53 +1,24 @@
|
|||
thread=64
|
||||
thread=1
|
||||
|
||||
## d1 k18<=4 89%
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/1w_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/1w_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/45m_r1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz
|
||||
## d2 <= 4 87%
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/na12878_r1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/na12878_r2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq
|
||||
# d3 <= 4 77%
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz
|
||||
## d4 <= 4 93%
|
||||
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
|
||||
#n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz
|
||||
#n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz
|
||||
## d5 <= 4 80%
|
||||
#n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz
|
||||
#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz
|
||||
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq
|
||||
n_r1=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq
|
||||
n_r2=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq
|
||||
n_r1=~/data/dataset/real/D1/n1.fq.gz
|
||||
n_r2=~/data/dataset/real/D1/n2.fq.gz
|
||||
#n_r1=~/data/dataset/real/D3/n1.fq.gz
|
||||
#n_r2=~/data/dataset/real/D3/n2.fq.gz
|
||||
#n_r1=~/data/dataset/real/D3/n1.fq
|
||||
#n_r2=~/data/dataset/real/D3/n2.fq
|
||||
|
||||
#n_r1=~/data/t120w-1.fq
|
||||
#n_r2=~/data/t120w-2.fq
|
||||
|
||||
#n_r1=~/data/1w-1.fq
|
||||
#n_r2=~/data/1w-2.fq
|
||||
|
||||
reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta
|
||||
#reference=~/reference/bwa/human_g1k_v37_decoy.fasta
|
||||
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||
|
||||
#out=~/data1/out-u8-1.sam
|
||||
#out=~/data1/out-i16.sam
|
||||
#out=~/data1/fmt-out.sam
|
||||
out=~/data/fastbwa.ert.sam
|
||||
#out=/dev/null
|
||||
|
||||
#out=new-wgs.sam
|
||||
out=/dev/null
|
||||
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
$reference \
|
||||
$n_r1 \
|
||||
$n_r2 \
|
||||
-o $out -2 -Z
|
||||
-o $out -2 #> seed.txt #-Z
|
||||
|
|
|
|||
Loading…
Reference in New Issue