From 20e072f6af4f23feebc50647eb3d351c4b866281 Mon Sep 17 00:00:00 2001 From: zzh Date: Tue, 2 Apr 2024 07:42:37 +0800 Subject: [PATCH] =?UTF-8?q?=E6=89=80=E6=9C=89=E4=BB=A3=E7=A0=81=E9=83=BD?= =?UTF-8?q?=E5=90=88=E5=B9=B6=E4=BA=86=EF=BC=8C=E8=BF=98=E5=B7=AE=E4=B8=80?= =?UTF-8?q?=E7=82=B9=E5=BB=BA=E7=AB=8B=E7=B4=A2=E5=BC=95=E7=9A=84=E6=97=B6?= =?UTF-8?q?=E5=80=99=EF=BC=8C=E4=B8=80=E8=B5=B7=E9=83=BD=E5=BB=BA=E7=AB=8B?= =?UTF-8?q?=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .vscode/launch.json | 19 +++- .vscode/settings.json | 8 +- Makefile | 10 +- bntseq.c | 2 +- bwa.c | 3 + bwamem.c | 18 +++- bwape.c | 2 +- bwase.c | 2 +- bwashm.c | 2 +- bwtaln.c | 2 +- bwtindex.c | 52 +++++++-- bwtsw2_main.c | 2 +- debug.sh | 2 +- fastmap.c | 4 +- fmt_idx.c | 197 ++++++++++++++++++++++++++++++++--- fmt_idx.h | 3 +- ksw_extend2_avx2.c | 48 ++++++--- main.c | 13 ++- maxk.c | 2 +- pemerge.c | 2 +- run.sh | 74 +++---------- bwa.sh => scripts/bwa.sh | 0 ert.sh => scripts/ert.sh | 0 mem2.sh => scripts/mem2.sh | 0 na.sh => scripts/na.sh | 0 sbwa.sh => scripts/sbwa.sh | 0 seed1.py => scripts/seed1.py | 0 seed2.py => scripts/seed2.py | 0 sw1.py => scripts/sw1.py | 0 utils.h | 12 +++ 31 files changed, 359 insertions(+), 121 deletions(-) rename bwa.sh => scripts/bwa.sh (100%) rename ert.sh => scripts/ert.sh (100%) rename mem2.sh => scripts/mem2.sh (100%) rename na.sh => scripts/na.sh (100%) rename sbwa.sh => scripts/sbwa.sh (100%) rename seed1.py => scripts/seed1.py (100%) rename seed2.py => scripts/seed2.py (100%) rename sw1.py => scripts/sw1.py (100%) diff --git a/.gitignore b/.gitignore index fe123fb..47e7738 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ *.fa dataset/ bwa +fastbwa test test64 .*.swp diff --git a/.vscode/launch.json b/.vscode/launch.json index 6bff252..b71f59c 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -9,7 +9,7 @@ "preLaunchTask": "Build", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/bwa", + "program": "${workspaceRoot}/fastbwa", "args": [ "mem", "-t", @@ -30,10 +30,23 @@ "preLaunchTask": "Build", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/bwa", + "program": "${workspaceRoot}/fastbwa", "args": [ "index", - "/mnt/d/data/reference/human_g1k_v37_decoy.fasta" + "~/data/reference/human_g1k_v37_decoy.fasta" + ], + "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 + }, + { + "name": "buildkmer", + "preLaunchTask": "Build", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceRoot}/fastbwa", + "args": [ + "buildkmer", + "~/data/reference/human_g1k_v37_decoy.fasta.256.64.fmt", + "~/data/reference/human_g1k_v37_decoy.fasta.kmer" ], "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 } diff --git a/.vscode/settings.json b/.vscode/settings.json index 9fbfc77..0121c4d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -16,6 +16,12 @@ "emmintrin.h": "c", "bwamem.h": "c", "utils.h": "c", - "stdio.h": "c" + "stdio.h": "c", + "kvec.h": "c", + "string.h": "c", + "stdlib.h": "c", + "array": "c", + "initializer_list": "c", + "utility": "c" } } \ No newline at end of file diff --git a/Makefile b/Makefile index 4ed8d89..461441c 100644 --- a/Makefile +++ b/Makefile @@ -6,13 +6,11 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF SHOW_DATA_PERF= #-DSHOW_DATA_PERF -DEBUG_OUTPUT= #-DDEBUG_OUTPUT -DEBUG_SW_EXTEND= #-DDEBUG_SW_EXTEND FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH USE_MT_READ= #-DUSE_MT_READ AR= ar -DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(DEBUG_OUTPUT) $(DEBUG_SW_EXTEND) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL +DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \ QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o yarn.o AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ @@ -20,11 +18,11 @@ AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o \ fmt_idx.o ksw_extend2_avx2.o ksw_extend2_avx2_u8.o -PROG= bwa +PROG= fastbwa INCLUDES= LIBS= -lm -lz -lpthread -ldl SUBDIRS= . -JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a # -ljemalloc -L/home/zzh/work/jemalloc/lib/ +JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a ifeq ($(shell uname -s),Linux) LIBS += -lrt @@ -40,7 +38,7 @@ all:$(PROG) #bwa:libbwa.a $(AOBJS) main.o # $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) -bwa:libbwa.a $(AOBJS) main.o +$(PROG):libbwa.a $(AOBJS) main.o $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) $(JE_MALLOC) main.o -o $@ -L. -lbwa $(LIBS) bwamem-lite:libbwa.a example.o diff --git a/bntseq.c b/bntseq.c index c3c246a..6161aa7 100644 --- a/bntseq.c +++ b/bntseq.c @@ -342,7 +342,7 @@ int bwa_fa2pac(int argc, char *argv[]) } } if (argc == optind) { - fprintf(stderr, "Usage: bwa fa2pac [-f] []\n"); + fprintf(stderr, "Usage: fastbwa fa2pac [-f] []\n"); return 1; } fp = xzopen(argv[optind], "r"); diff --git a/bwa.c b/bwa.c index 4c43045..454bb8f 100644 --- a/bwa.c +++ b/bwa.c @@ -455,9 +455,11 @@ FMTIndex *bwa_idx_load_fmt(const char *hint) //kmer_bit_fn = malloc(l_hint + 32); sa_fn = malloc(l_hint + 32); sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL); +// sprintf(suffix, ".fmt"); strcpy(fmt_idx_fn, hint); strcpy(fmt_idx_fn + l_hint, suffix); sprintf(suffix, ".%d.kmer", HASH_KMER_LEN); +// sprintf(suffix, ".kmer"); strcpy(kmer_idx_fn, hint); strcpy(kmer_idx_fn + l_hint, suffix); @@ -473,6 +475,7 @@ FMTIndex *bwa_idx_load_fmt(const char *hint) strcpy(sa_fn, hint); sprintf(suffix, ".33.%d.sa", SA_INTV); +// sprintf(suffix, ".bytesa"); strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA) fmt_restore_sa(sa_fn, fmt); diff --git a/bwamem.c b/bwamem.c index ab32366..1844e59 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1447,9 +1447,17 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in while (x < len) { if (seq[x] < 4) { start_flag = 0; - //int last_x = x; +#ifdef DEBUG_OUTPUT +#ifdef COUNT_SEED_LENGTH + int last_x = x; +#endif +#endif x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]); - //fprintf(gfp1, "%d\n", x - last_x); +#ifdef DEBUG_OUTPUT +#ifdef COUNT_SEED_LENGTH + fprintf(gfp1, "%d\n", x - last_x); +#endif +#endif for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info >> 32); // seed length @@ -1458,7 +1466,9 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in kv_push(bwtintv_t, smem->mem, *p); } } - //break; // for test full match time +#ifdef COUNT_SEED_LENGTH + break; // for test full match time +#endif } else { ++x; if (start_flag) ++start_N_num; @@ -1486,9 +1496,11 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in #ifdef DEBUG_OUTPUT if (start_N_num == 0) { +#ifdef GET_FULL_MATCH_READ for (i = 0; i < len; ++i) fprintf(gfp1, "%c", "ACGT"[seq[i]]); fprintf(gfp1, "\n"); +#endif #ifdef SHOW_DATA_PERF gdat[2]++; if (gdat[2] % 100 == 0) { diff --git a/bwape.c b/bwape.c index fa4f7f7..7090485 100644 --- a/bwape.c +++ b/bwape.c @@ -757,7 +757,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) if (optind + 5 > argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa sampe [options] \n\n"); + fprintf(stderr, "Usage: fastbwa sampe [options] \n\n"); fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize); fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ); fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi); diff --git a/bwase.c b/bwase.c index 18e8671..017890b 100644 --- a/bwase.c +++ b/bwase.c @@ -593,7 +593,7 @@ int bwa_sai2sam_se(int argc, char *argv[]) } if (optind + 3 > argc) { - fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); + fprintf(stderr, "Usage: fastbwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); return 1; } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { diff --git a/bwashm.c b/bwashm.c index 163f764..e3f765c 100644 --- a/bwashm.c +++ b/bwashm.c @@ -186,7 +186,7 @@ int main_shm(int argc, char *argv[]) else if (c == 'f') tmpfn = optarg; } if (optind == argc && !to_list && !to_drop) { - fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n"); + fprintf(stderr, "\nUsage: fastbwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n"); fprintf(stderr, "Options: -d destroy all indices in shared memory\n"); fprintf(stderr, " -l list names of indices in shared memory\n"); fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n"); diff --git a/bwtaln.c b/bwtaln.c index a348fdf..a3ccc51 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -273,7 +273,7 @@ int bwa_aln(int argc, char *argv[]) if (optind + 2 > argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa aln [options] \n\n"); + fprintf(stderr, "Usage: fastbwa aln [options] \n\n"); fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", BWA_AVG_ERR, opt->fnr); fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); diff --git a/bwtindex.c b/bwtindex.c index 70379d8..5973332 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -136,7 +136,7 @@ int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt } } if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); + fprintf(stderr, "Usage: fastbwa pac2bwt [-d] \n"); return 1; } bwt = bwt_pac2bwt(argv[optind], use_is); @@ -175,7 +175,7 @@ int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command { bwt_t *bwt; if (argc != 2) { - fprintf(stderr, "Usage: bwa bwtupdate \n"); + fprintf(stderr, "Usage: fastbwa bwtupdate \n"); return 1; } bwt = bwt_restore_bwt(argv[1]); @@ -196,7 +196,7 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command } } if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); + fprintf(stderr, "Usage: fastbwa bwt2sa [-i %d] \n", sa_intv); return 1; } bwt = bwt_restore_bwt(argv[optind]); @@ -206,7 +206,7 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command return 0; } -int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command +int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2bytesa" command { bwt_t *bwt; int c, sa_intv = 32; @@ -217,7 +217,7 @@ int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command } } if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa bwt2bytesa [-i %d] \n", sa_intv); + fprintf(stderr, "Usage: fastbwa bwt2bytesa [-i %d] \n", sa_intv); return 1; } bwt = bwt_restore_bwt(argv[optind]); @@ -227,6 +227,37 @@ int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command return 0; } +int bwa_bwt2fmt(int argc, char *argv[]) // create fmt index +{ + bwt_t *bwt; + char buf[1024]; + if (optind + 1 > argc) { + fprintf(stderr, "Usage: fastbwa bwt2fmt \n"); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + FMTIndex *fmt; + fmt = create_fmt_from_bwt(bwt); + dump_fmt(argv[optind + 1], fmt); + // sprintf(buf, "%s.kmer", argv[optind + 1]); + return 0; +} + +int bwa_build_kmer(int argc, char *argv[]) +{ + char buf[1024]; + if (optind + 1 > argc) + { + fprintf(stderr, "Usage: fastbwa build_kmerhash \n"); + return 1; + } + FMTIndex *fmt = fmt_restore_fmt(argv[optind]); + fmt_create_kmer_index(fmt); + sprintf(buf, "%s", argv[optind + 1]); + fmt_dump_kmer_idx(buf, &fmt->kmer_hash); + return 0; +} + int bwa_index(int argc, char *argv[]) // the "index" command { int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000; @@ -253,7 +284,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command if (optind + 1 > argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa index [options] \n\n"); + fprintf(stderr, "Usage: fastbwa index [options] \n\n"); fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size); @@ -338,6 +369,15 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s bwt_dump_sa(str3, bwt); bwt_destroy(bwt); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + { + // build FMT-Index + FMTIndex *fmt; + strcpy(str, prefix); strcat(str, ".fmt"); + fmt = create_fmt_from_bwt(bwt); + dump_fmt(str, fmt); + // create Kmer-Hash + + } } free(str3); free(str2); free(str); return 0; diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 40a9e0a..19d58ba 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -44,7 +44,7 @@ int bwa_bwtsw2(int argc, char *argv[]) if (optind + 2 > argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa bwasw [options] [query2.fa]\n\n"); + fprintf(stderr, "Usage: fastbwa bwasw [options] [query2.fa]\n\n"); fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a); fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); diff --git a/debug.sh b/debug.sh index 0534657..77d8bc6 100755 --- a/debug.sh +++ b/debug.sh @@ -35,7 +35,7 @@ out=./out.sam # /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \ # -o /dev/null -time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ +time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ $reference \ $n_r1 \ $n_r2 \ diff --git a/fastmap.c b/fastmap.c index af5ce14..13025c2 100644 --- a/fastmap.c +++ b/fastmap.c @@ -570,7 +570,7 @@ int main_mem(int argc, char *argv[]) if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); + fprintf(stderr, "Usage: fastbwa mem [options] [in2.fq]\n\n"); fprintf(stderr, "Algorithm options:\n\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -b INT batch size of reads to process at one time [%d]\n", opt->batch_size); @@ -849,7 +849,7 @@ int main_fastmap(int argc, char *argv[]) } if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa fastmap [options] \n\n"); + fprintf(stderr, "Usage: fastbwa fastmap [options] \n\n"); fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len); fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth); fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv); diff --git a/fmt_idx.c b/fmt_idx.c index d27d0f3..ed9b1f0 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -15,22 +15,23 @@ Date : 2023/12/24 #include "utils.h" #include "bntseq.h" #include "kvec.h" +#include "kstring.h" -const static char BASE[4] = {'A', 'C', 'G', 'T'}; +//const static char BASE[4] = {'A', 'C', 'G', 'T'}; // 生成所有KMER_LEN长度的序列,字符串表示 -void gen_all_seq(char **seq_arr, int kmer_len) -{ - uint32_t seq_up_val = (1 << (kmer_len << 1)); - for (uint32_t i = 0; i < seq_up_val; ++i) - { - seq_arr[i] = (char *)malloc(kmer_len); - for (int j = kmer_len - 1; j >= 0; --j) - { - seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3]; - } - } -} +//void gen_all_seq(char **seq_arr, int kmer_len) +//{ +// uint32_t seq_up_val = (1 << (kmer_len << 1)); +// for (uint32_t i = 0; i < seq_up_val; ++i) +// { +// seq_arr[i] = (char *)malloc(kmer_len); +// for (int j = kmer_len - 1; j >= 0; --j) +// { +// seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3]; +// } +// } +//} // 生成occ,每个字节对应一个pattern void fmt_gen_cnt_occ(FMTIndex *fmt) @@ -123,8 +124,12 @@ void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh) err_fwrite(kh->ke10, 1, (1 << (10 << 1)) * sizeof(KmerEntryArr), fp); err_fwrite(kh->ke11, 1, (1 << (11 << 1)) * sizeof(KmerEntry), fp); err_fwrite(kh->ke12, 1, (1 << (12 << 1)) * sizeof(KmerEntry), fp); +#if HASH_KMER_LEN > 12 err_fwrite(kh->ke13, 1, (1 << (13 << 1)) * sizeof(KmerEntry), fp); +#endif +#if HASH_KMER_LEN > 13 err_fwrite(kh->ke14, 1, (1 << (14 << 1)) * sizeof(KmerEntry), fp); +#endif err_fflush(fp); err_fclose(fp); } @@ -158,6 +163,156 @@ KmerHash fmt_restore_kmer_idx(const char *fn) return khash; } +// 生成所有KMER_LEN长度的序列,字符串表示 +void gen_kmer_base(uint8_t **seq_arr, uint64_t *kmer_arr_size, int kmer_len) +{ + uint64_t i; + uint64_t seq_up_val = (1 << (kmer_len << 1)); + *kmer_arr_size = (uint64_t)seq_up_val; + *seq_arr = realloc(*seq_arr, seq_up_val * (uint64_t)kmer_len); + for (i = 0; i < seq_up_val; ++i) + { + const uint64_t base_idx = i * kmer_len; + for (int j = kmer_len - 1; j >= 0; --j) + { + (*seq_arr)[base_idx + kmer_len - 1 - j] = (i >> (j << 1)) & 3; + } + } +} + +uint64_t global_num = 0; + +// 利用fmt搜索seed,完整搜索,只需要单向搜索 +bwtintv_t fmt_search(FMTIndex *fmt, const uint8_t *q, int qlen) +{ + bwtintv_t ik; + bwtintv_t ok1; + bwtintv_t ok2; + int i, c1, c2, x = 0; + + fmt_set_intv(fmt, q[x], ik); + ik.info = x + 1; + for (i = x + 1; i + 1 < qlen; i += 2) + { + if (q[i] < 4 && q[i + 1] < 4) + { + c1 = 3 - q[i]; + c2 = 3 - q[i + 1]; + fmt_extend2(fmt, &ik, &ok1, &ok2, 0, c1, c2); + ik = ok2; + ik.info = i + 1; + } + else + { + break; + } + } + + if (i < qlen && q[i] < 4) + { // 最后一次扩展 + c1 = 3 - q[i]; + fmt_extend1(fmt, &ik, &ok1, 0, c1); + //if (qlen == 14) fprintf(stderr, "%ld %ld %ld\n", ok1.x[0], ok1.x[1], ok1.x[2]); + //if (qlen == 14) ++global_num; + //if (qlen == 14 && global_num % 10000 == 0) fprintf(stderr, "%ld\n", global_num); + ik = ok1; + ik.info = i + 1; + } + return ik; +} + +// 创建xmer hash索引 +void fmt_create_kmer_index(FMTIndex *fmt) { + uint64_t kmer_arr_size = 0; + uint8_t *seq_arr = 0; + gen_kmer_base(&seq_arr, &kmer_arr_size, 10); + bwtintv_t ik; + uint64_t j; + int i, c1, c2; + int qlen = 10; + bwtint_t tk[4], tl[4]; + KmerHash *kh = &fmt->kmer_hash; + + kh->ke10 = (KmerEntryArr *)malloc((1 << (10 << 1)) * sizeof(KmerEntryArr)); + kh->ke11 = (KmerEntry *)malloc((1 << (11 << 1)) * sizeof(KmerEntry)); + kh->ke12 = (KmerEntry *)malloc((1 << (12 << 1)) * sizeof(KmerEntry)); + kh->ke13 = (KmerEntry *)malloc((1 << (13 << 1)) * sizeof(KmerEntry)); + kh->ke14 = (KmerEntry *)malloc((1 << (14 << 1)) * sizeof(KmerEntry)); + + // 长度为10的kmer + for (j = 0; j < kmer_arr_size; ++j) + { + uint8_t *q = &seq_arr[j * 10]; + uint8_t *mem_addr = kh->ke10[j].intv_arr; + fmt_set_intv(fmt, q[0], ik); + kmer_setval_at(mem_addr, ik, 0); + + // 每次扩展两个碱基 + for (i = 1; i + 1 < qlen; i += 2) + { + c1 = 3 - q[i]; + c2 = 3 - q[i + 1]; + + fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk); + fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl); + // 第一次扩展的结果 + ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; + ik.x[1] = fmt->L2[c1] + 1 + tk[1]; + ik.x[2] = tl[1] - tk[1]; + kmer_setval_at(mem_addr, ik, i); + + // 第二次扩展的结果 + ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2]; + ik.x[1] = fmt->L2[c2] + 1 + tk[3]; + ik.x[2] = tl[3] - tk[3]; + kmer_setval_at(mem_addr, ik, i + 1); + } + { // 最后一次扩展 + c1 = 3 - q[i]; + c2 = 3; + fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk); + fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl); + // 第一次扩展的结果 + ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; + ik.x[1] = fmt->L2[c1] + 1 + tk[1]; + ik.x[2] = tl[1] - tk[1]; + kmer_setval_at(mem_addr, ik, i); + } + } + // 长度11的kmer + gen_kmer_base(&seq_arr, &kmer_arr_size, 11); + for (j = 0; j < kmer_arr_size; ++j) + { + bwtintv_t p = fmt_search(fmt, &seq_arr[j * 11], 11); + kmer_setval_at(fmt->kmer_hash.ke11[j].intv_arr, p, 0); + } + + // 长度12的kmer + gen_kmer_base(&seq_arr, &kmer_arr_size, 12); + for (j = 0; j < kmer_arr_size; ++j) + { + bwtintv_t p = fmt_search(fmt, &seq_arr[j * 12], 12); + kmer_setval_at(fmt->kmer_hash.ke12[j].intv_arr, p, 0); + } + gen_kmer_base(&seq_arr, &kmer_arr_size, 13); + for (j = 0; j < kmer_arr_size; ++j) + { + bwtintv_t p = fmt_search(fmt, &seq_arr[j * 13], 13); + kmer_setval_at(fmt->kmer_hash.ke13[j].intv_arr, p, 0); + } + + // 长度14的kmer + gen_kmer_base(&seq_arr, &kmer_arr_size, 14); + //fprintf(stderr, "14-size:%ld\n", kmer_arr_size); + for (j = 0; j < kmer_arr_size; ++j) + { + //if (j % 10000 == 0) fprintf(stderr, "arr_size: %ld, %ld\n", j, kmer_arr_size); + bwtintv_t p = fmt_search(fmt, &seq_arr[j * 14], 14); + kmer_setval_at(fmt->kmer_hash.ke14[j].intv_arr, p, 0); + } + free(seq_arr); +} + // 读取sa数据 void fmt_restore_sa(const char *fn, FMTIndex *fmt) { @@ -241,7 +396,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt) if (i % 16 == 0) // 每个32位整数可以包含16个碱基,每次需要处理16个碱基,也就是间隔最小可以设置为16 { uint32_t pre_bwt_16_seq = 0; // 16个pre-bwt碱基串 - uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 4; // 这里加4还是加8要看保存occ的是是uint32还是uint64,bwt字符串i对应的基准行,因为原始的bwt-cp(check point)包含由4个uint32_t(8个uint32_t)组成的occ信息 + uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 8;//4; // 这里加4还是加8要看保存occ的是是uint32还是uint64,bwt字符串i对应的基准行,因为原始的bwt-cp(check point)包含由4个uint32_t(8个uint32_t)组成的occ信息 int offset = (i % OCC_INTERVAL) / 16; // 每OCC_INTERVAL个碱基共享同一个基准地址,每16个碱基共用一个uint32整型,因此需要偏移量来获取当前碱基串的首地址 uint32_t bwt_16_seq = *(bwt_addr + offset); // 待处理的当前16个碱基串的首地址 for (j = 0; j < 16; ++j) // 对于bwt碱基串,一个一个碱基分别处理 @@ -335,7 +490,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt) k += 4; memcpy(buf + k, c2, sizeof(uint32_t) * 16); k += 16; - xassert(k == fmt->bwt_size, "inconsistent bwt_size"); + xassert(k == fmt->bwt_size, "inconsistent fmt_size"); // update fmt fmt->bwt = buf; return fmt; @@ -708,10 +863,18 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int #if 1 if (min_intv == 1 && ok2.x[2] == min_intv) { - // fprintf(gfp1, "%d\t", i + 2 - x); +#ifdef DEBUG_OUTPUT +#ifdef COUNT_SEED_LENGTH + fprintf(gfp1, "%d\t", i + 2 - x); +#endif +#endif direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); //mt.x[0] = ok2.x[0]; - //fprintf(gfp1, "mt %ld %ld\n", ok2.x[0], ok2.x[1]); +#ifdef DEBUG_OUTPUT +#if 0 + fprintf(gfp1, "mt %ld %ld\n", ok2.x[0], ok2.x[1]); +#endif +#endif kv_push(bwtintv_t, *mem, mt); ret = (uint32_t)mt.info; goto fmt_smem_forward_end; diff --git a/fmt_idx.h b/fmt_idx.h index 0d6a5f9..3f07851 100644 --- a/fmt_idx.h +++ b/fmt_idx.h @@ -80,6 +80,7 @@ typedef struct void dump_fmt(const char *fn, const FMTIndex *fmt); // 从文件中读取fmt结构数据 FMTIndex *fmt_restore_fmt(const char *fn); +void fmt_create_kmer_index(FMTIndex *fmt); // 将kmer hash数据写入到文件 void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh); // 从文件中读取kmer hash信息 @@ -100,7 +101,7 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t * void fmt_direct_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1); void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1); // 生成所有KMER_LEN长度的序列,字符串表示 -void gen_all_seq(char **seq_arr, int kmer_len); +// void gen_all_seq(char **seq_arr, int kmer_len); // 设置kmer第pos个碱基对应的fmt匹配信息 void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos); // 获取kmer的fmt匹配信息 diff --git a/ksw_extend2_avx2.c b/ksw_extend2_avx2.c index 7c3c1ca..5903b5c 100644 --- a/ksw_extend2_avx2.c +++ b/ksw_extend2_avx2.c @@ -5,6 +5,7 @@ #include #include #include +#include "utils.h" #ifdef DEBUG_OUTPUT extern FILE *gfp1, *gfp2, *gfp3; @@ -21,13 +22,13 @@ extern FILE *gfq[4], *gft[4], *gfi[4]; #define UNLIKELY(x) (x) #endif -#undef MAX -#undef MIN -#define MAX(x, y) ((x) > (y) ? (x) : (y)) -#define MIN(x, y) ((x) < (y) ? (x) : (y)) +//#undef MAX +//#undef MIN +//#define MAX(x, y) ((x) > (y) ? (x) : (y)) +//#define MIN(x, y) ((x) < (y) ? (x) : (y)) #define SIMD_WIDTH 16 -typedef struct { size_t m; uint8_t *addr; } buf_t; +// typedef struct { size_t m; uint8_t *addr; } buf_t; extern int ksw_extend2_avx2_u8(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf); @@ -175,6 +176,7 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum) { +#ifdef DEBUG_OUTPUT // 写到三个文件里,query.fa,target.fa,每行一个序列,info.txt,包含前缀得分h0,和长度信息qlen,tlen FILE *query_f = gfq[fnum], *target_f = gft[fnum], @@ -191,6 +193,7 @@ void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const fprintf(target_f, "\n"); // 处理其他信息 fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0); +#endif } int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 @@ -221,6 +224,7 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 #ifdef DEBUG_OUTPUT //fprintf(gfp1, "%d\n", qlen); +#ifdef GET_DIFFERENT_EXTENSION_LENGTH if (qlen <= 30) { write_query_target_sequence(qlen, query, tlen, target, h0, 0); } else if (qlen < 60) { @@ -230,6 +234,7 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 } else { write_query_target_sequence(qlen, query, tlen, target, h0, 3); } +#endif #endif if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf); @@ -654,7 +659,7 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 #endif #ifdef DEBUG_OUTPUT - // fprintf(gfp1, "start\n"); +#ifdef COUNT_CALC_NUM int bsw_cal_num = 0; int real_cal_num = 0; for (i = 0; i < tlen; ++i) @@ -664,7 +669,12 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 if (beg >= end) break; bsw_cal_num += end - beg; } - // fprintf(gfp1, "%d\n", bsw_cal_num); + fprintf(gfp1, "start\n%d\n", bsw_cal_num); +#endif +#endif + +#ifdef ELIMINATE_DIFF_3 + int prun_end = qlen; // for test diff_3 #endif for (i = 0; LIKELY(i < tlen); ++i) { @@ -683,8 +693,10 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 for (j = beg; LIKELY(j < end); ++j) { #ifdef DEBUG_OUTPUT +#ifdef COUNT_CALC_NUM real_cal_num++; #endif +#endif #ifdef DEBUG_SW_EXTEND ins[i+1][j+1] = f; @@ -700,6 +712,9 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M",保证分值不小于0,sw和nw的区别 h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 h = h > f? h : f; +#ifdef ELIMINATE_DIFF_3 + if (j >= prun_end && h==0) break; // for test diff_3 +#endif h1 = h; // save H(i,j) to h1 for the next column #ifdef DEBUG_SW_EXTEND @@ -710,10 +725,13 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 t = M - oe_del; t = t > 0? t : 0; e -= e_del; +#ifdef DEBUG_SW_EXTEND + del[i + 1][j + 1] = e; +#endif e = e > t? e : t; // computed E(i+1,j) #ifdef DEBUG_SW_EXTEND - del[i+1][j+1] = e; +// del[i+1][j+1] = e; #endif p->e = e; // save E(i+1,j) for the next row t = M - oe_ins; @@ -742,11 +760,15 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); // 这里为什么不考虑f(insert score) beg = j; for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j); +#ifdef ELIMINATE_DIFF_3 + prun_end = j + 2 < qlen ? j + 2 : qlen; end = qlen; // for test diff_3 +#else end = j + 2 < qlen? j + 2 : qlen; - //beg = 0; end = qlen; // uncomment this line for debugging - //if (print_flag) { +#endif + // beg = 0; end = qlen; // uncomment this line for debugging + // if (print_flag) { // fprintf(stderr, "beg: %d; end: %d\n", beg, end); - //} + // } } #ifdef DEBUG_OUTPUT #ifdef DEBUG_SW_EXTEND @@ -797,7 +819,9 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 #endif #ifdef DEBUG_OUTPUT - //fprintf(gfp1, "%d\nend\n", real_cal_num); +#ifdef COUNT_CALC_NUM + fprintf(gfp1, "%d\nend\n", real_cal_num); +#endif #endif free(eh); free(qp); free(qmem); diff --git a/main.c b/main.c index 25d5dc7..86b8ab1 100644 --- a/main.c +++ b/main.c @@ -30,7 +30,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.17-r1198-dirty" +#define PACKAGE_VERSION "v1.0-0.7.17" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -38,6 +38,8 @@ int bwa_pac2bwt(int argc, char *argv[]); int bwa_bwtupdate(int argc, char *argv[]); int bwa_bwt2sa(int argc, char *argv[]); int bwa_bwt2bytesa(int argc, char *argv[]); +int bwa_bwt2fmt(int argc, char *argv[]); +int bwa_build_kmer(int argc, char *argv[]); int bwa_index(int argc, char *argv[]); int bwt_bwtgen_main(int argc, char *argv[]); @@ -57,10 +59,11 @@ int main_maxk(int argc, char *argv[]); static int usage() { fprintf(stderr, "\n"); - fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n"); + fprintf(stderr, "Program: fast-bwa-mem (alignment via Burrows-Wheeler transformation)\n"); fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); - fprintf(stderr, "Contact: Heng Li \n\n"); - fprintf(stderr, "Usage: bwa [options]\n\n"); + fprintf(stderr, "Contact: Heng Li (for bwa)\n\n"); + fprintf(stderr, "Contact: Zhonghai Zhang (for fast-bwa-mem)\n\n"); + fprintf(stderr, "Usage: fastbwa [options]\n\n"); fprintf(stderr, "Command: index index sequences in the FASTA format\n"); fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); @@ -103,6 +106,8 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc-1, argv+1); + else if (strcmp(argv[1], "bwt2fmt") == 0) ret = bwa_bwt2fmt(argc-1, argv+1); + else if (strcmp(argv[1], "buildkmer") == 0) ret = bwa_build_kmer(argc - 1, argv + 1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); diff --git a/maxk.c b/maxk.c index fee5765..8c86dc7 100644 --- a/maxk.c +++ b/maxk.c @@ -23,7 +23,7 @@ int main_maxk(int argc, char *argv[]) if (c == 's') self = 1; } if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa maxk [-s] \n"); + fprintf(stderr, "Usage: fastbwa maxk [-s] \n"); return 1; } fp = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "rb") : gzdopen(fileno(stdin), "rb"); diff --git a/pemerge.c b/pemerge.c index 197e87f..ca42aca 100644 --- a/pemerge.c +++ b/pemerge.c @@ -238,7 +238,7 @@ int main_pemerge(int argc, char *argv[]) if (optind == argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa pemerge [-mu] [read2.fq]\n\n"); + fprintf(stderr, "Usage: fastbwa pemerge [-mu] [read2.fq]\n\n"); fprintf(stderr, "Options: -m output merged reads only\n"); fprintf(stderr, " -u output unmerged reads only\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); diff --git a/run.sh b/run.sh index 038603d..2c93b5c 100755 --- a/run.sh +++ b/run.sh @@ -1,63 +1,23 @@ thread=1 -#n_r1=~/data/fastq/dataset/na12878_wgs_150/bn1.fq -#n_r2=~/data/fastq/dataset/na12878_wgs_150/bn2.fq -#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq -#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq + +## d1 +n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq +n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq +## d2 +#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq +#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq +# d3 #n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq #n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq -#n_r1=~/data/fastq/dataset/na12878_wgs_101/na_1.fq -#n_r2=~/data/fastq/dataset/na12878_wgs_101/na_2.fq -n_r1=~/data/fastq/dataset/zy_wes/s_1.fq -n_r2=~/data/fastq/dataset/zy_wes/s_2.fq -#n_r1=~/data/fastq/dataset/zy_wes/bloodgDNA_r1.fastq -#n_r2=~/data/fastq/dataset/zy_wes/bloodgDNA_r2.fastq +## d4 +#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq +#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq +## d5 +#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq +#n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq - -#n_r1=~/data/fastq/zy/wgs/n1.fq -#n_r2=~/data/fastq/zy/wgs/n2.fq -#n_r1=~/data/fastq/ds/n1.fq -#n_r2=~/data/fastq/ds/n2.fq -#n_r1=~/data/fastq/platinum/n1.fq -#n_r2=~/data/fastq/platinum/n2.fq -#n_r1=~/data/fastq/zy/t1.fq -#n_r2=~/data/fastq/zy/t2.fq -#n_r1=~/data/fastq/zy/n1.fq -#n_r2=~/data/fastq/zy/n2.fq -#n_r1=~/data/fastq/ds/d1_1.fq -#n_r2=~/data/fastq/ds/d1_2.fq -#n_r1=~/data/fastq/ds/d2_1.fq -#n_r2=~/data/fastq/ds/d2_2.fq -#n_r1=~/data/fastq/ds/wes/n1.fq -#n_r2=~/data/fastq/ds/wes/n2.fq -#n_r1=~/fastq/ZY2105177532213010_L4_1.fq -#n_r2=~/fastq/ZY2105177532213010_L4_2.fq -#n_r1=~/data/fastq/na12878/nas_1.fq -#n_r2=~/data/fastq/na12878/nas_2.fq -#n_r1=~/data/fastq/na12878/na_1.fq -#n_r2=~/data/fastq/na12878/na_2.fq -#n_r1=~/fastq/na12878/na12878_r1.fq -#n_r2=~/fastq/na12878/na12878_r2.fq -#n_r1=~/fastq/n_r1.fq -#n_r2=~/fastq/n_r2.fq -#n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq -#n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq -#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq -#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq -#reference=~/data/reference/human_g1k_v37_decoy.fasta -#n_r1=~/data/fastq/sn_r1.fq -#n_r2=~/data/fastq/sn_r2.fq -#n_r1=~/data/fastq/ssn_r1.fq -#n_r2=~/data/fastq/ssn_r2.fq -#n_r1=~/fastq/ERR194147_1_120w.fastq -#n_r2=~/fastq/ERR194147_2_120w.fastq -#n_r1=~/fastq/tiny_n_r1.fq -#n_r2=~/fastq/tiny_n_r2.fq -#n_r1=~/fastq/diff_r1.fq -#n_r2=~/fastq/diff_r2.fq -#n_r1=~/fastq/d_r1.fq -#n_r2=~/fastq/d_r2.fq -reference=~/reference/human_g1k_v37_decoy.fasta -#reference=~/data/reference/human_g1k_v37_decoy.fasta +#reference=~/reference/bwa/human_g1k_v37_decoy.fasta +reference=~/data/reference/human_g1k_v37_decoy.fasta #out=./all.sam #out=./sn.sam #out=./ssn-x1.sam @@ -75,7 +35,7 @@ out=/dev/null # /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \ # -o /dev/null -time ./bwa mem -t $thread -b 256 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ +time ./fastbwa mem -t $thread -b 256 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ $reference \ $n_r1 \ $n_r2 \ diff --git a/bwa.sh b/scripts/bwa.sh similarity index 100% rename from bwa.sh rename to scripts/bwa.sh diff --git a/ert.sh b/scripts/ert.sh similarity index 100% rename from ert.sh rename to scripts/ert.sh diff --git a/mem2.sh b/scripts/mem2.sh similarity index 100% rename from mem2.sh rename to scripts/mem2.sh diff --git a/na.sh b/scripts/na.sh similarity index 100% rename from na.sh rename to scripts/na.sh diff --git a/sbwa.sh b/scripts/sbwa.sh similarity index 100% rename from sbwa.sh rename to scripts/sbwa.sh diff --git a/seed1.py b/scripts/seed1.py similarity index 100% rename from seed1.py rename to scripts/seed1.py diff --git a/seed2.py b/scripts/seed2.py similarity index 100% rename from seed2.py rename to scripts/seed2.py diff --git a/sw1.py b/scripts/sw1.py similarity index 100% rename from sw1.py rename to scripts/sw1.py diff --git a/utils.h b/utils.h index 621e9ee..b752e71 100644 --- a/utils.h +++ b/utils.h @@ -31,6 +31,17 @@ #include #include +// for debug and test + +//#define DEBUG_OUTPUT // 打开gfp1-4文件,并记录debug信息 +//#define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度,以及最终扩展的长度 +//#define GET_FULL_MATCH_READ // 获取完全匹配的reads +//#define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量 +// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的query,target,和其他用于计算的数据 +//#define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里 + +//////////////////// + #define USE_RDTSC 1 #ifdef SHOW_PERF @@ -96,6 +107,7 @@ typedef struct { } pair64_t; typedef struct { size_t n, m; uint64_t *a; } uint64_v; +typedef struct { size_t n, m; uint32_t *a; } uint32_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; typedef struct { size_t m; uint8_t *addr; } buf_t;