所有代码都合并了,还差一点建立索引的时候,一起都建立了

This commit is contained in:
zzh 2024-04-02 07:42:37 +08:00
parent dd03596997
commit 20e072f6af
31 changed files with 359 additions and 121 deletions

1
.gitignore vendored
View File

@ -5,6 +5,7 @@
*.fa
dataset/
bwa
fastbwa
test
test64
.*.swp

19
.vscode/launch.json vendored
View File

@ -9,7 +9,7 @@
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/bwa",
"program": "${workspaceRoot}/fastbwa",
"args": [
"mem",
"-t",
@ -30,10 +30,23 @@
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/bwa",
"program": "${workspaceRoot}/fastbwa",
"args": [
"index",
"/mnt/d/data/reference/human_g1k_v37_decoy.fasta"
"~/data/reference/human_g1k_v37_decoy.fasta"
],
"cwd": "${workspaceFolder}", //
},
{
"name": "buildkmer",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/fastbwa",
"args": [
"buildkmer",
"~/data/reference/human_g1k_v37_decoy.fasta.256.64.fmt",
"~/data/reference/human_g1k_v37_decoy.fasta.kmer"
],
"cwd": "${workspaceFolder}", //
}

View File

@ -16,6 +16,12 @@
"emmintrin.h": "c",
"bwamem.h": "c",
"utils.h": "c",
"stdio.h": "c"
"stdio.h": "c",
"kvec.h": "c",
"string.h": "c",
"stdlib.h": "c",
"array": "c",
"initializer_list": "c",
"utility": "c"
}
}

View File

@ -6,13 +6,11 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= -DSHOW_PERF
SHOW_DATA_PERF= #-DSHOW_DATA_PERF
DEBUG_OUTPUT= #-DDEBUG_OUTPUT
DEBUG_SW_EXTEND= #-DDEBUG_SW_EXTEND
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
USE_MT_READ= #-DUSE_MT_READ
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(DEBUG_OUTPUT) $(DEBUG_SW_EXTEND) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o yarn.o
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
@ -20,11 +18,11 @@ AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o \
fmt_idx.o ksw_extend2_avx2.o ksw_extend2_avx2_u8.o
PROG= bwa
PROG= fastbwa
INCLUDES=
LIBS= -lm -lz -lpthread -ldl
SUBDIRS= .
JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a # -ljemalloc -L/home/zzh/work/jemalloc/lib/
JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a
ifeq ($(shell uname -s),Linux)
LIBS += -lrt
@ -40,7 +38,7 @@ all:$(PROG)
#bwa:libbwa.a $(AOBJS) main.o
# $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
bwa:libbwa.a $(AOBJS) main.o
$(PROG):libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) $(JE_MALLOC) main.o -o $@ -L. -lbwa $(LIBS)
bwamem-lite:libbwa.a example.o

View File

@ -342,7 +342,7 @@ int bwa_fa2pac(int argc, char *argv[])
}
}
if (argc == optind) {
fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
fprintf(stderr, "Usage: fastbwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
return 1;
}
fp = xzopen(argv[optind], "r");

3
bwa.c
View File

@ -455,9 +455,11 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
//kmer_bit_fn = malloc(l_hint + 32);
sa_fn = malloc(l_hint + 32);
sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL);
// sprintf(suffix, ".fmt");
strcpy(fmt_idx_fn, hint);
strcpy(fmt_idx_fn + l_hint, suffix);
sprintf(suffix, ".%d.kmer", HASH_KMER_LEN);
// sprintf(suffix, ".kmer");
strcpy(kmer_idx_fn, hint);
strcpy(kmer_idx_fn + l_hint, suffix);
@ -473,6 +475,7 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
strcpy(sa_fn, hint);
sprintf(suffix, ".33.%d.sa", SA_INTV);
// sprintf(suffix, ".bytesa");
strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA)
fmt_restore_sa(sa_fn, fmt);

View File

@ -1447,9 +1447,17 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
while (x < len) {
if (seq[x] < 4) {
start_flag = 0;
//int last_x = x;
#ifdef DEBUG_OUTPUT
#ifdef COUNT_SEED_LENGTH
int last_x = x;
#endif
#endif
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
//fprintf(gfp1, "%d\n", x - last_x);
#ifdef DEBUG_OUTPUT
#ifdef COUNT_SEED_LENGTH
fprintf(gfp1, "%d\n", x - last_x);
#endif
#endif
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
@ -1458,7 +1466,9 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
kv_push(bwtintv_t, smem->mem, *p);
}
}
//break; // for test full match time
#ifdef COUNT_SEED_LENGTH
break; // for test full match time
#endif
} else {
++x;
if (start_flag) ++start_N_num;
@ -1486,9 +1496,11 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
#ifdef DEBUG_OUTPUT
if (start_N_num == 0) {
#ifdef GET_FULL_MATCH_READ
for (i = 0; i < len; ++i)
fprintf(gfp1, "%c", "ACGT"[seq[i]]);
fprintf(gfp1, "\n");
#endif
#ifdef SHOW_DATA_PERF
gdat[2]++;
if (gdat[2] % 100 == 0) {

View File

@ -757,7 +757,7 @@ int bwa_sai2sam_pe(int argc, char *argv[])
if (optind + 5 > argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
fprintf(stderr, "Usage: fastbwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);

View File

@ -593,7 +593,7 @@ int bwa_sai2sam_se(int argc, char *argv[])
}
if (optind + 3 > argc) {
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
fprintf(stderr, "Usage: fastbwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
return 1;
}
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {

View File

@ -186,7 +186,7 @@ int main_shm(int argc, char *argv[])
else if (c == 'f') tmpfn = optarg;
}
if (optind == argc && !to_list && !to_drop) {
fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
fprintf(stderr, "\nUsage: fastbwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
fprintf(stderr, " -l list names of indices in shared memory\n");
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");

View File

@ -273,7 +273,7 @@ int bwa_aln(int argc, char *argv[])
if (optind + 2 > argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
fprintf(stderr, "Usage: fastbwa aln [options] <prefix> <in.fq>\n\n");
fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
BWA_AVG_ERR, opt->fnr);
fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);

View File

@ -136,7 +136,7 @@ int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt
}
}
if (optind + 2 > argc) {
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
fprintf(stderr, "Usage: fastbwa pac2bwt [-d] <in.pac> <out.bwt>\n");
return 1;
}
bwt = bwt_pac2bwt(argv[optind], use_is);
@ -175,7 +175,7 @@ int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
{
bwt_t *bwt;
if (argc != 2) {
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
fprintf(stderr, "Usage: fastbwa bwtupdate <the.bwt>\n");
return 1;
}
bwt = bwt_restore_bwt(argv[1]);
@ -196,7 +196,7 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
}
}
if (optind + 2 > argc) {
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
fprintf(stderr, "Usage: fastbwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
@ -206,7 +206,7 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
return 0;
}
int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command
int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2bytesa" command
{
bwt_t *bwt;
int c, sa_intv = 32;
@ -217,7 +217,7 @@ int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command
}
}
if (optind + 2 > argc) {
fprintf(stderr, "Usage: bwa bwt2bytesa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
fprintf(stderr, "Usage: fastbwa bwt2bytesa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
@ -227,6 +227,37 @@ int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command
return 0;
}
/*
 * The "bwt2fmt" command: read a BWT file and write the FMT index built from it.
 *
 * Expects two positional arguments, <in.bwt> and <out.fmt>, starting at
 * argv[optind].  No options are parsed here, so optind keeps whatever value
 * the caller left it at.
 *
 * Returns 0 on success, 1 (after printing the usage line) when either
 * positional argument is missing.
 */
int bwa_bwt2fmt(int argc, char *argv[]) // create fmt index
{
	bwt_t *bwt;
	FMTIndex *fmt;

	/* Both argv[optind] and argv[optind + 1] are read below, so two
	 * positional arguments are required, not one. */
	if (optind + 2 > argc) {
		fprintf(stderr, "Usage: fastbwa bwt2fmt <in.bwt> <out.fmt>\n");
		return 1;
	}
	bwt = bwt_restore_bwt(argv[optind]);
	fmt = create_fmt_from_bwt(bwt);
	dump_fmt(argv[optind + 1], fmt);
	/* NOTE(review): bwt and fmt are not released here; whether fmt shares
	 * storage with bwt is not visible from this function -- confirm before
	 * adding cleanup. */
	return 0;
}
/*
 * The "buildkmer" command: load an FMT index, build its k-mer hash tables
 * and write them to a file.
 *
 * Expects two positional arguments, <in.fmt> and <out.kmerhash>, starting at
 * argv[optind].
 *
 * Returns 0 on success, 1 (after printing the usage line) when either
 * positional argument is missing.
 */
int bwa_build_kmer(int argc, char *argv[])
{
	FMTIndex *fmt;

	/* argv[optind + 1] is read below, so two positional arguments are needed. */
	if (optind + 2 > argc) {
		/* The sub-command is dispatched as "buildkmer" in main(). */
		fprintf(stderr, "Usage: fastbwa buildkmer <in.fmt> <out.kmerhash>\n");
		return 1;
	}
	fmt = fmt_restore_fmt(argv[optind]);
	fmt_create_kmer_index(fmt);
	/* Pass the output path straight through: copying it into a fixed-size
	 * stack buffer would overflow on paths of 1024 bytes or more. */
	fmt_dump_kmer_idx(argv[optind + 1], &fmt->kmer_hash);
	return 0;
}
int bwa_index(int argc, char *argv[]) // the "index" command
{
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
@ -253,7 +284,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
if (optind + 1 > argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
fprintf(stderr, "Usage: fastbwa index [options] <in.fasta>\n\n");
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
@ -338,6 +369,15 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
bwt_dump_sa(str3, bwt);
bwt_destroy(bwt);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
{
// build FMT-Index
FMTIndex *fmt;
strcpy(str, prefix); strcat(str, ".fmt");
fmt = create_fmt_from_bwt(bwt);
dump_fmt(str, fmt);
// create Kmer-Hash
}
}
free(str3); free(str2); free(str);
return 0;

View File

@ -44,7 +44,7 @@ int bwa_bwtsw2(int argc, char *argv[])
if (optind + 2 > argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa bwasw [options] <target.prefix> <query.fa> [query2.fa]\n\n");
fprintf(stderr, "Usage: fastbwa bwasw [options] <target.prefix> <query.fa> [query2.fa]\n\n");
fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a);
fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b);
fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q);

View File

@ -35,7 +35,7 @@ out=./out.sam
# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \
# -o /dev/null
time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
$reference \
$n_r1 \
$n_r2 \

View File

@ -570,7 +570,7 @@ int main_mem(int argc, char *argv[])
if (optind + 1 >= argc || optind + 3 < argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
fprintf(stderr, "Usage: fastbwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
fprintf(stderr, "Algorithm options:\n\n");
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
fprintf(stderr, " -b INT batch size of reads to process at one time [%d]\n", opt->batch_size);
@ -849,7 +849,7 @@ int main_fastmap(int argc, char *argv[])
}
if (optind + 1 >= argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
fprintf(stderr, "Usage: fastbwa fastmap [options] <idxbase> <in.fq>\n\n");
fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth);
fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);

197
fmt_idx.c
View File

@ -15,22 +15,23 @@ Date : 2023/12/24
#include "utils.h"
#include "bntseq.h"
#include "kvec.h"
#include "kstring.h"
const static char BASE[4] = {'A', 'C', 'G', 'T'};
//const static char BASE[4] = {'A', 'C', 'G', 'T'};
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(char **seq_arr, int kmer_len)
{
uint32_t seq_up_val = (1 << (kmer_len << 1));
for (uint32_t i = 0; i < seq_up_val; ++i)
{
seq_arr[i] = (char *)malloc(kmer_len);
for (int j = kmer_len - 1; j >= 0; --j)
{
seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
}
}
}
//void gen_all_seq(char **seq_arr, int kmer_len)
//{
// uint32_t seq_up_val = (1 << (kmer_len << 1));
// for (uint32_t i = 0; i < seq_up_val; ++i)
// {
// seq_arr[i] = (char *)malloc(kmer_len);
// for (int j = kmer_len - 1; j >= 0; --j)
// {
// seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
// }
// }
//}
// 生成occ每个字节对应一个pattern
void fmt_gen_cnt_occ(FMTIndex *fmt)
@ -123,8 +124,12 @@ void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh)
err_fwrite(kh->ke10, 1, (1 << (10 << 1)) * sizeof(KmerEntryArr), fp);
err_fwrite(kh->ke11, 1, (1 << (11 << 1)) * sizeof(KmerEntry), fp);
err_fwrite(kh->ke12, 1, (1 << (12 << 1)) * sizeof(KmerEntry), fp);
#if HASH_KMER_LEN > 12
err_fwrite(kh->ke13, 1, (1 << (13 << 1)) * sizeof(KmerEntry), fp);
#endif
#if HASH_KMER_LEN > 13
err_fwrite(kh->ke14, 1, (1 << (14 << 1)) * sizeof(KmerEntry), fp);
#endif
err_fflush(fp);
err_fclose(fp);
}
@ -158,6 +163,156 @@ KmerHash fmt_restore_kmer_idx(const char *fn)
return khash;
}
/*
 * Enumerate every k-mer of length kmer_len as an array of 2-bit base codes.
 *
 * seq_arr       in/out: *seq_arr is resized with realloc() (it may be NULL on
 *               entry) to hold 4^kmer_len k-mers of kmer_len bytes each; the
 *               caller owns the buffer and must free() it.
 * kmer_arr_size out: number of k-mers generated (4^kmer_len).
 * kmer_len      k-mer length; must be in 0..31 so that 4^kmer_len fits in
 *               64 bits.
 *
 * K-mer number i occupies bytes [i * kmer_len, (i + 1) * kmer_len); its first
 * byte holds the two most significant bits of i and its last byte the two
 * least significant bits, each byte being a value in 0..3.
 *
 * Aborts the process if the buffer cannot be allocated.
 */
void gen_kmer_base(uint8_t **seq_arr, uint64_t *kmer_arr_size, int kmer_len)
{
	/* Shift in 64-bit arithmetic: a plain int `1 << n` is undefined once the
	 * shift count reaches 31 bits, i.e. for kmer_len >= 16. */
	const uint64_t kmer_num = (uint64_t)1 << (kmer_len << 1);
	const uint64_t n_bytes = kmer_num * (uint64_t)kmer_len;
	uint8_t *bases;
	uint64_t i;

	/* Keep the old pointer until realloc() is known to have succeeded. */
	bases = realloc(*seq_arr, n_bytes);
	if (bases == NULL && n_bytes != 0) {
		fprintf(stderr, "[%s] failed to allocate %llu bytes\n", __func__,
				(unsigned long long)n_bytes);
		abort();
	}
	*seq_arr = bases;
	*kmer_arr_size = kmer_num;

	for (i = 0; i < kmer_num; ++i) {
		uint8_t *kmer = bases + i * (uint64_t)kmer_len;
		int pos;
		for (pos = 0; pos < kmer_len; ++pos) {
			/* Position 0 takes the highest 2-bit digit of i. */
			const int shift = (kmer_len - 1 - pos) << 1;
			kmer[pos] = (uint8_t)((i >> shift) & 3);
		}
	}
}
uint64_t global_num = 0;
/*
 * Search a seed with the FMT index, extending in a single direction only.
 *
 * fmt   index to search.
 * q     query bases; a value >= 4 stops the extension at that base.
 * qlen  number of bases in q.
 *
 * The interval is seeded from q[0], then extended two bases per step with
 * fmt_extend2() while both of the next two bases are < 4, and finally by one
 * more base with fmt_extend1() if a usable base remains.  Returns the last
 * interval obtained.
 *
 * NOTE(review): after a two-base step at position pos, info is set to
 * pos + 1 (not pos + 2), so for odd qlen the returned info is qlen - 1 while
 * for even qlen it is qlen -- confirm whether any caller relies on info.
 */
bwtintv_t fmt_search(FMTIndex *fmt, const uint8_t *q, int qlen)
{
	bwtintv_t cur;
	bwtintv_t step1;
	bwtintv_t step2;
	int pos = 1;

	fmt_set_intv(fmt, q[0], cur);
	cur.info = 1;

	/* Two bases per iteration while a full, valid pair is available. */
	while (pos + 1 < qlen && q[pos] < 4 && q[pos + 1] < 4) {
		const int b1 = 3 - q[pos];
		const int b2 = 3 - q[pos + 1];
		fmt_extend2(fmt, &cur, &step1, &step2, 0, b1, b2);
		cur = step2;
		cur.info = pos + 1;
		pos += 2;
	}

	/* One trailing single-base extension, if the base at pos is usable. */
	if (pos < qlen && q[pos] < 4) {
		const int b1 = 3 - q[pos];
		fmt_extend1(fmt, &cur, &step1, 0, b1);
		cur = step1;
		cur.info = pos + 1;
	}
	return cur;
}
/*
 * Build the k-mer hash index stored in fmt->kmer_hash.
 *
 * Allocates the ke10..ke14 tables (4^k entries each) and fills them:
 *  - ke10: for every 10-mer, the interval after each of its 10 bases is
 *    stored at slots 0..9 of the entry via kmer_setval_at();
 *  - ke11..ke14: for every k-mer, the interval returned by fmt_search() is
 *    stored at slot 0 of the entry.
 *
 * NOTE(review): ke13/ke14 are always built here, whereas fmt_dump_kmer_idx()
 * writes them only when HASH_KMER_LEN > 12 / > 13 -- confirm whether the
 * extra work is intended for smaller HASH_KMER_LEN.
 */
void fmt_create_kmer_index(FMTIndex *fmt) {
	uint64_t n_kmers = 0;
	uint8_t *kmers = 0;
	gen_kmer_base(&kmers, &n_kmers, 10);

	const int base_len = 10;
	KmerHash *hash = &fmt->kmer_hash;
	bwtintv_t intv;
	bwtint_t occ_lo[4], occ_hi[4];
	uint64_t idx;
	int c1, c2, t;

	hash->ke10 = (KmerEntryArr *)malloc((1 << (10 << 1)) * sizeof(KmerEntryArr));
	hash->ke11 = (KmerEntry *)malloc((1 << (11 << 1)) * sizeof(KmerEntry));
	hash->ke12 = (KmerEntry *)malloc((1 << (12 << 1)) * sizeof(KmerEntry));
	hash->ke13 = (KmerEntry *)malloc((1 << (13 << 1)) * sizeof(KmerEntry));
	hash->ke14 = (KmerEntry *)malloc((1 << (14 << 1)) * sizeof(KmerEntry));

	/* k-mers of length 10: record the interval after every base. */
	for (idx = 0; idx < n_kmers; ++idx) {
		uint8_t *kmer = &kmers[idx * 10];
		uint8_t *slot = hash->ke10[idx].intv_arr;
		int pos = 1;
		int crosses_primary;

		fmt_set_intv(fmt, kmer[0], intv);
		kmer_setval_at(slot, intv, 0);

		/* Extend two bases per iteration. */
		while (pos + 1 < base_len) {
			c1 = 3 - kmer[pos];
			c2 = 3 - kmer[pos + 1];
			fmt_e2_occ(fmt, intv.x[1] - 1, c1, c2, occ_lo);
			fmt_e2_occ(fmt, intv.x[1] - 1 + intv.x[2], c1, c2, occ_hi);

			/* Result of the first extension. */
			crosses_primary = (intv.x[1] <= fmt->primary && intv.x[1] + intv.x[2] - 1 >= fmt->primary);
			intv.x[0] = intv.x[0] + crosses_primary + occ_hi[0] - occ_lo[0];
			intv.x[1] = fmt->L2[c1] + 1 + occ_lo[1];
			intv.x[2] = occ_hi[1] - occ_lo[1];
			kmer_setval_at(slot, intv, pos);

			/* Result of the second extension (uses the updated interval). */
			crosses_primary = (intv.x[1] <= fmt->primary && intv.x[1] + intv.x[2] - 1 >= fmt->primary);
			intv.x[0] = intv.x[0] + crosses_primary + occ_hi[2] - occ_lo[2];
			intv.x[1] = fmt->L2[c2] + 1 + occ_lo[3];
			intv.x[2] = occ_hi[3] - occ_lo[3];
			kmer_setval_at(slot, intv, pos + 1);

			pos += 2;
		}

		/* Last extension: only the first base of the pair is real; the
		 * second is fixed to 3 and its result is not used. */
		c1 = 3 - kmer[pos];
		c2 = 3;
		fmt_e2_occ(fmt, intv.x[1] - 1, c1, c2, occ_lo);
		fmt_e2_occ(fmt, intv.x[1] - 1 + intv.x[2], c1, c2, occ_hi);
		crosses_primary = (intv.x[1] <= fmt->primary && intv.x[1] + intv.x[2] - 1 >= fmt->primary);
		intv.x[0] = intv.x[0] + crosses_primary + occ_hi[0] - occ_lo[0];
		intv.x[1] = fmt->L2[c1] + 1 + occ_lo[1];
		intv.x[2] = occ_hi[1] - occ_lo[1];
		kmer_setval_at(slot, intv, pos);
	}

	/* k-mers of length 11, 12, 13 and 14: one full search per k-mer. */
	KmerEntry *const tables[4] = { hash->ke11, hash->ke12, hash->ke13, hash->ke14 };
	for (t = 0; t < 4; ++t) {
		const int len = 11 + t;
		KmerEntry *table = tables[t];

		gen_kmer_base(&kmers, &n_kmers, len);
		for (idx = 0; idx < n_kmers; ++idx) {
			bwtintv_t hit = fmt_search(fmt, &kmers[idx * len], len);
			kmer_setval_at(table[idx].intv_arr, hit, 0);
		}
	}

	free(kmers);
}
// 读取sa数据
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
{
@ -241,7 +396,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
if (i % 16 == 0) // 每个32位整数可以包含16个碱基每次需要处理16个碱基也就是间隔最小可以设置为16
{
uint32_t pre_bwt_16_seq = 0; // 16个pre-bwt碱基串
uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 4; // 这里加4还是加8要看保存occ的是是uint32还是uint64bwt字符串i对应的基准行因为原始的bwt-cpcheck point包含由4个uint32_t(8个uint32_t)组成的occ信息
uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 8;//4; // 这里加4还是加8要看保存occ的是是uint32还是uint64bwt字符串i对应的基准行因为原始的bwt-cpcheck point包含由4个uint32_t(8个uint32_t)组成的occ信息
int offset = (i % OCC_INTERVAL) / 16; // 每OCC_INTERVAL个碱基共享同一个基准地址每16个碱基共用一个uint32整型因此需要偏移量来获取当前碱基串的首地址
uint32_t bwt_16_seq = *(bwt_addr + offset); // 待处理的当前16个碱基串的首地址
for (j = 0; j < 16; ++j) // 对于bwt碱基串一个一个碱基分别处理
@ -335,7 +490,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
k += 4;
memcpy(buf + k, c2, sizeof(uint32_t) * 16);
k += 16;
xassert(k == fmt->bwt_size, "inconsistent bwt_size");
xassert(k == fmt->bwt_size, "inconsistent fmt_size");
// update fmt
fmt->bwt = buf;
return fmt;
@ -708,10 +863,18 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int
#if 1
if (min_intv == 1 && ok2.x[2] == min_intv)
{
// fprintf(gfp1, "%d\t", i + 2 - x);
#ifdef DEBUG_OUTPUT
#ifdef COUNT_SEED_LENGTH
fprintf(gfp1, "%d\t", i + 2 - x);
#endif
#endif
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
//mt.x[0] = ok2.x[0];
//fprintf(gfp1, "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
#ifdef DEBUG_OUTPUT
#if 0
fprintf(gfp1, "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
#endif
#endif
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
goto fmt_smem_forward_end;

View File

@ -80,6 +80,7 @@ typedef struct
void dump_fmt(const char *fn, const FMTIndex *fmt);
// 从文件中读取fmt结构数据
FMTIndex *fmt_restore_fmt(const char *fn);
void fmt_create_kmer_index(FMTIndex *fmt);
// 将kmer hash数据写入到文件
void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh);
// 从文件中读取kmer hash信息
@ -100,7 +101,7 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *
void fmt_direct_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1);
void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1);
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(char **seq_arr, int kmer_len);
// void gen_all_seq(char **seq_arr, int kmer_len);
// 设置kmer第pos个碱基对应的fmt匹配信息
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
// 获取kmer的fmt匹配信息

View File

@ -5,6 +5,7 @@
#include <stdio.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "utils.h"
#ifdef DEBUG_OUTPUT
extern FILE *gfp1, *gfp2, *gfp3;
@ -21,13 +22,13 @@ extern FILE *gfq[4], *gft[4], *gfi[4];
#define UNLIKELY(x) (x)
#endif
#undef MAX
#undef MIN
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
//#undef MAX
//#undef MIN
//#define MAX(x, y) ((x) > (y) ? (x) : (y))
//#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define SIMD_WIDTH 16
typedef struct { size_t m; uint8_t *addr; } buf_t;
// typedef struct { size_t m; uint8_t *addr; } buf_t;
extern int ksw_extend2_avx2_u8(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del,
int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf);
@ -175,6 +176,7 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_OUTPUT
// 写到三个文件里query.fatarget.fa每行一个序列info.txt包含前缀得分h0和长度信息qlentlen
FILE *query_f = gfq[fnum],
*target_f = gft[fnum],
@ -191,6 +193,7 @@ void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const
fprintf(target_f, "\n");
// 处理其他信息
fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
#endif
}
int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度
@ -221,6 +224,7 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度
#ifdef DEBUG_OUTPUT
//fprintf(gfp1, "%d\n", qlen);
#ifdef GET_DIFFERENT_EXTENSION_LENGTH
if (qlen <= 30) {
write_query_target_sequence(qlen, query, tlen, target, h0, 0);
} else if (qlen < 60) {
@ -230,6 +234,7 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度
} else {
write_query_target_sequence(qlen, query, tlen, target, h0, 3);
}
#endif
#endif
if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf);
@ -654,7 +659,7 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
#endif
#ifdef DEBUG_OUTPUT
// fprintf(gfp1, "start\n");
#ifdef COUNT_CALC_NUM
int bsw_cal_num = 0;
int real_cal_num = 0;
for (i = 0; i < tlen; ++i)
@ -664,7 +669,12 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
if (beg >= end) break;
bsw_cal_num += end - beg;
}
// fprintf(gfp1, "%d\n", bsw_cal_num);
fprintf(gfp1, "start\n%d\n", bsw_cal_num);
#endif
#endif
#ifdef ELIMINATE_DIFF_3
int prun_end = qlen; // for test diff_3
#endif
for (i = 0; LIKELY(i < tlen); ++i) {
@ -683,8 +693,10 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
for (j = beg; LIKELY(j < end); ++j) {
#ifdef DEBUG_OUTPUT
#ifdef COUNT_CALC_NUM
real_cal_num++;
#endif
#endif
#ifdef DEBUG_SW_EXTEND
ins[i+1][j+1] = f;
@ -700,6 +712,9 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M",保证分值不小于0sw和nw的区别
h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0
h = h > f? h : f;
#ifdef ELIMINATE_DIFF_3
if (j >= prun_end && h==0) break; // for test diff_3
#endif
h1 = h; // save H(i,j) to h1 for the next column
#ifdef DEBUG_SW_EXTEND
@ -710,10 +725,13 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
t = M - oe_del;
t = t > 0? t : 0;
e -= e_del;
#ifdef DEBUG_SW_EXTEND
del[i + 1][j + 1] = e;
#endif
e = e > t? e : t; // computed E(i+1,j)
#ifdef DEBUG_SW_EXTEND
del[i+1][j+1] = e;
// del[i+1][j+1] = e;
#endif
p->e = e; // save E(i+1,j) for the next row
t = M - oe_ins;
@ -742,11 +760,15 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); // 这里为什么不考虑finsert score
beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
#ifdef ELIMINATE_DIFF_3
prun_end = j + 2 < qlen ? j + 2 : qlen; end = qlen; // for test diff_3
#else
end = j + 2 < qlen? j + 2 : qlen;
//beg = 0; end = qlen; // uncomment this line for debugging
//if (print_flag) {
#endif
// beg = 0; end = qlen; // uncomment this line for debugging
// if (print_flag) {
// fprintf(stderr, "beg: %d; end: %d\n", beg, end);
//}
// }
}
#ifdef DEBUG_OUTPUT
#ifdef DEBUG_SW_EXTEND
@ -797,7 +819,9 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
#endif
#ifdef DEBUG_OUTPUT
//fprintf(gfp1, "%d\nend\n", real_cal_num);
#ifdef COUNT_CALC_NUM
fprintf(gfp1, "%d\nend\n", real_cal_num);
#endif
#endif
free(eh); free(qp); free(qmem);

13
main.c
View File

@ -30,7 +30,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.17-r1198-dirty"
#define PACKAGE_VERSION "v1.0-0.7.17"
#endif
int bwa_fa2pac(int argc, char *argv[]);
@ -38,6 +38,8 @@ int bwa_pac2bwt(int argc, char *argv[]);
int bwa_bwtupdate(int argc, char *argv[]);
int bwa_bwt2sa(int argc, char *argv[]);
int bwa_bwt2bytesa(int argc, char *argv[]);
int bwa_bwt2fmt(int argc, char *argv[]);
int bwa_build_kmer(int argc, char *argv[]);
int bwa_index(int argc, char *argv[]);
int bwt_bwtgen_main(int argc, char *argv[]);
@ -57,10 +59,11 @@ int main_maxk(int argc, char *argv[]);
static int usage()
{
fprintf(stderr, "\n");
fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n");
fprintf(stderr, "Program: fast-bwa-mem (alignment via Burrows-Wheeler transformation)\n");
fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
fprintf(stderr, "Contact: Heng Li <hli@ds.dfci.harvard.edu>\n\n");
fprintf(stderr, "Usage: bwa <command> [options]\n\n");
fprintf(stderr, "Contact: Heng Li <hli@ds.dfci.harvard.edu> (for bwa)\n\n");
fprintf(stderr, "Contact: Zhonghai Zhang <zhangzhonghai@ict.ac.cn> (for fast-bwa-mem)\n\n");
fprintf(stderr, "Usage: fastbwa <command> [options]\n\n");
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
fprintf(stderr, " mem BWA-MEM algorithm\n");
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
@ -103,6 +106,8 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2fmt") == 0) ret = bwa_bwt2fmt(argc-1, argv+1);
else if (strcmp(argv[1], "buildkmer") == 0) ret = bwa_build_kmer(argc - 1, argv + 1);
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);

2
maxk.c
View File

@ -23,7 +23,7 @@ int main_maxk(int argc, char *argv[])
if (c == 's') self = 1;
}
if (optind + 2 > argc) {
fprintf(stderr, "Usage: bwa maxk [-s] <index.prefix> <seq.fa>\n");
fprintf(stderr, "Usage: fastbwa maxk [-s] <index.prefix> <seq.fa>\n");
return 1;
}
fp = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "rb") : gzdopen(fileno(stdin), "rb");

View File

@ -238,7 +238,7 @@ int main_pemerge(int argc, char *argv[])
if (optind == argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
fprintf(stderr, "Usage: fastbwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
fprintf(stderr, "Options: -m output merged reads only\n");
fprintf(stderr, " -u output unmerged reads only\n");
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);

74
run.sh
View File

@ -1,63 +1,23 @@
thread=1
#n_r1=~/data/fastq/dataset/na12878_wgs_150/bn1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_150/bn2.fq
#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq
#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq
## d1
n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq
n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq
## d2
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
# d3
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
#n_r1=~/data/fastq/dataset/na12878_wgs_101/na_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_101/na_2.fq
n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
#n_r1=~/data/fastq/dataset/zy_wes/bloodgDNA_r1.fastq
#n_r2=~/data/fastq/dataset/zy_wes/bloodgDNA_r2.fastq
## d4
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
## d5
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
#n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq
#n_r1=~/data/fastq/zy/wgs/n1.fq
#n_r2=~/data/fastq/zy/wgs/n2.fq
#n_r1=~/data/fastq/ds/n1.fq
#n_r2=~/data/fastq/ds/n2.fq
#n_r1=~/data/fastq/platinum/n1.fq
#n_r2=~/data/fastq/platinum/n2.fq
#n_r1=~/data/fastq/zy/t1.fq
#n_r2=~/data/fastq/zy/t2.fq
#n_r1=~/data/fastq/zy/n1.fq
#n_r2=~/data/fastq/zy/n2.fq
#n_r1=~/data/fastq/ds/d1_1.fq
#n_r2=~/data/fastq/ds/d1_2.fq
#n_r1=~/data/fastq/ds/d2_1.fq
#n_r2=~/data/fastq/ds/d2_2.fq
#n_r1=~/data/fastq/ds/wes/n1.fq
#n_r2=~/data/fastq/ds/wes/n2.fq
#n_r1=~/fastq/ZY2105177532213010_L4_1.fq
#n_r2=~/fastq/ZY2105177532213010_L4_2.fq
#n_r1=~/data/fastq/na12878/nas_1.fq
#n_r2=~/data/fastq/na12878/nas_2.fq
#n_r1=~/data/fastq/na12878/na_1.fq
#n_r2=~/data/fastq/na12878/na_2.fq
#n_r1=~/fastq/na12878/na12878_r1.fq
#n_r2=~/fastq/na12878/na12878_r2.fq
#n_r1=~/fastq/n_r1.fq
#n_r2=~/fastq/n_r2.fq
#n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq
#n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
#reference=~/data/reference/human_g1k_v37_decoy.fasta
#n_r1=~/data/fastq/sn_r1.fq
#n_r2=~/data/fastq/sn_r2.fq
#n_r1=~/data/fastq/ssn_r1.fq
#n_r2=~/data/fastq/ssn_r2.fq
#n_r1=~/fastq/ERR194147_1_120w.fastq
#n_r2=~/fastq/ERR194147_2_120w.fastq
#n_r1=~/fastq/tiny_n_r1.fq
#n_r2=~/fastq/tiny_n_r2.fq
#n_r1=~/fastq/diff_r1.fq
#n_r2=~/fastq/diff_r2.fq
#n_r1=~/fastq/d_r1.fq
#n_r2=~/fastq/d_r2.fq
reference=~/reference/human_g1k_v37_decoy.fasta
#reference=~/data/reference/human_g1k_v37_decoy.fasta
#reference=~/reference/bwa/human_g1k_v37_decoy.fasta
reference=~/data/reference/human_g1k_v37_decoy.fasta
#out=./all.sam
#out=./sn.sam
#out=./ssn-x1.sam
@ -75,7 +35,7 @@ out=/dev/null
# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \
# -o /dev/null
time ./bwa mem -t $thread -b 256 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
time ./fastbwa mem -t $thread -b 256 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
$reference \
$n_r1 \
$n_r2 \

12
utils.h
View File

@ -31,6 +31,17 @@
#include <stdio.h>
#include <zlib.h>
// for debug and test
//#define DEBUG_OUTPUT // 打开gfp1-4文件并记录debug信息
//#define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度以及最终扩展的长度
//#define GET_FULL_MATCH_READ // 获取完全匹配的reads
//#define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量
// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的querytarget和其他用于计算的数据
//#define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里
////////////////////
#define USE_RDTSC 1
#ifdef SHOW_PERF
@ -96,6 +107,7 @@ typedef struct {
} pair64_t;
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
typedef struct { size_t n, m; uint32_t *a; } uint32_v;
typedef struct { size_t n, m; pair64_t *a; } pair64_v;
typedef struct { size_t m; uint8_t *addr; } buf_t;