From 3e20d7ee0fcb00e2f27ce3c9621333d8bdfb71ae Mon Sep 17 00:00:00 2001 From: zzh Date: Sat, 6 Apr 2024 15:05:20 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E4=BA=86=E5=88=9B=E5=BB=BAbw?= =?UTF-8?q?t=E7=B4=A2=E5=BC=95=E6=97=B6=EF=BC=8C=E4=B8=80=E5=90=8C?= =?UTF-8?q?=E5=88=9B=E5=BB=BAfmt=E7=9B=B8=E5=85=B3=E7=9A=84=E7=B4=A2?= =?UTF-8?q?=E5=BC=95=E7=9B=B8=E5=85=B3=E7=9A=84bug=EF=BC=8C=E7=8E=B0?= =?UTF-8?q?=E5=9C=A8=E5=8F=AF=E4=BB=A5=E6=AD=A3=E5=B8=B8=E4=B8=80=E8=B5=B7?= =?UTF-8?q?=E5=88=9B=E5=BB=BA=E7=B4=A2=E5=BC=95=E4=BA=86=EF=BC=8C=E6=8E=A5?= =?UTF-8?q?=E4=B8=8B=E6=9D=A5=E8=BF=98=E5=8F=AF=E4=BB=A5=E5=B0=86sa?= =?UTF-8?q?=E5=92=8Cbytesa=E4=B8=80=E8=B5=B7=E5=88=9B=E5=BB=BA=E6=9D=A5?= =?UTF-8?q?=E5=87=8F=E5=B0=8F=E6=97=B6=E9=97=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 2 +- Makefile | 4 +-- bwtindex.c | 8 +++-- fmt_idx.c | 75 ++++++++++++++++++++++++++++++++++++++++++++- run.sh | 28 +++++++++++++---- 5 files changed, 105 insertions(+), 12 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index b71f59c..7a727e3 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -17,7 +17,7 @@ "-M", "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", - "~/reference/human_g1k_v37_decoy.fasta", + "~/reference/bwa/human_g1k_v37_decoy.fasta", "~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq", "~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq", "-o", diff --git a/Makefile b/Makefile index 461441c..10e9363 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF SHOW_DATA_PERF= #-DSHOW_DATA_PERF FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH -USE_MT_READ= #-DUSE_MT_READ +USE_MT_READ= -DUSE_MT_READ AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL @@ -22,7 +22,7 @@ PROG= fastbwa INCLUDES= LIBS= -lm -lz -lpthread -ldl SUBDIRS= . -JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a +JE_MALLOC=#/home/zzh/work/jemalloc/lib/libjemalloc.a ifeq ($(shell uname -s),Linux) LIBS += -lrt diff --git a/bwtindex.c b/bwtindex.c index cf07e8a..c8a1e75 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -367,24 +367,28 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s bwt = bwt_restore_bwt(str); bwt_cal_sa(bwt, 32); bwt_dump_sa(str3, bwt); - bwt_destroy(bwt); +// bwt_destroy(bwt); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); { - t = clock(); // build FMT-Index + t = clock(); FMTIndex *fmt; strcpy(str, prefix); strcat(str, ".fmt"); fmt = create_fmt_from_bwt(bwt); dump_fmt(str, fmt); + if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); // create Kmer-Hash + t = clock(); fmt_create_kmer_index(fmt); strcpy(str, prefix); strcat(str, ".kmer"); fmt_dump_kmer_idx(str, &fmt->kmer_hash); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); // create byte sa + t = clock(); bwt_cal_byte_sa(bwt, 4); strcpy(str, prefix); strcat(str, ".bytesa"); bwt_dump_byte_sa(str, bwt); + if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } } free(str3); free(str2); free(str); diff --git a/fmt_idx.c b/fmt_idx.c index ed9b1f0..706dce2 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -1258,7 +1258,7 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_ } // k, k1, k2都是bwt矩阵对应的行 -inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2) +inline static void fmt_previous_line_old(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2) { uint8_t b1, b2; bwtint_t tk[4], kk; @@ -1269,6 +1269,68 @@ inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t * *k2 = fmt->L2[b2] + tk[3]; } +inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2) +{ + uint8_t b1, b2; + uint32_t x = 0; + bwtint_t cnt[4]; + k = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1) + uint32_t *p, *pocc, *ptocc, tmp; + uint8_t base2; + bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK); + // 第一步,找到check point位置 + pocc = fmt_occ_intv(fmt, k); // check point起始位置 + p = pocc + 20; // bwt碱基起始位置 + // 第二步,找到mid check point位置 + int mk = k & FMT_OCC_INTV_MASK; + int n_mintv = mk >> FMT_MID_INTV_SHIFT; + p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // 跳过mid间隔的bwt碱基位置 + ptocc = p; + // 第三步,找到具体的uint32_t + p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基(和8个倒数第二bwt碱基) + // 第四步,获取碱基 + base2 = *p >> ((~(k) & 0x7) << 2) & 0xf; + b2 = base2 >> 2 & 3; + b1 = base2 & 3; + + cnt[1] = pocc[b1]; + cnt[3] = (pocc + 4 + b1 * 4)[b2]; + if (n_mintv > 0) { + ptocc -= 4; + x = *(ptocc + b1); + cnt[1] += __fmt_mid_sum(x); + cnt[3] += x >> (b2 << 3) & 0xff; + x = 0; + ptocc += 4; + } + + uint32_t *end = ptocc + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3)); + int ti = b1 << 2 | b2; + for (; ptocc < end; ++ptocc) + { + x += __fmt_occ_e2_aux2(fmt, ti, *ptocc); + } + + tmp = *ptocc & ~((1U << ((~k & 7) << 2)) - 1); + x += __fmt_occ_e2_aux2(fmt, ti, tmp); + + if (b1 == 0) + { + x -= (~k & 7) << 8; + if (b2 == 0) + x -= (~k & 7) << 24; + } + if (b1 == fmt->first_base && b2 == fmt->last_base && cp_line < fmt->sec_primary && str_line >= fmt->sec_primary) + { + cnt[3] -= 1; + } + cnt[1] += x >> 8 & 0xff; + cnt[3] += x >> 24 & 0xff; + + *k1 = fmt->L2[b1] + cnt[1]; + *k2 = fmt->L2[b2] + cnt[3]; +} + bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k) { bwtint_t sa = 0, mask = fmt->sa_intv - 1; @@ -1277,6 +1339,7 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k) { ++sa; fmt_previous_line(fmt, k, &k1, &k2); + //fmt_previous_line_old(fmt, k, &k1, &k2); __builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2); if (!(k1 & mask)) { @@ -1284,6 +1347,11 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k) break; } ++sa; + if (!(k2 & mask)) + { + k = k2; + break; + } k = k2; } sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv); @@ -1305,6 +1373,11 @@ bwtint_t fmt_sa_offset(const FMTIndex *fmt, bwtint_t k) break; } ++sa; + if (!(k2 & mask)) + { + k = k2; + break; + } k = k2; } sa = (sa << 48) | (k / fmt->sa_intv); diff --git a/run.sh b/run.sh index 2c93b5c..0fd12e7 100755 --- a/run.sh +++ b/run.sh @@ -1,28 +1,44 @@ -thread=1 +thread=64 ## d1 -n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq -n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq +#n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq +#n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq +#n_r1=~/data/fastq/dataset/na12878_wes_144/45m_r1.fq +#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq +#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz +#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz ## d2 #n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq #n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq +n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq +n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq # d3 #n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq #n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq +#n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz +#n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz ## d4 #n_r1=~/data/fastq/dataset/zy_wes/s_1.fq #n_r2=~/data/fastq/dataset/zy_wes/s_2.fq +#n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz +#n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz ## d5 +#n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz +#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz #n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq #n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq +#n_r1=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq +#n_r2=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq +reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta #reference=~/reference/bwa/human_g1k_v37_decoy.fasta -reference=~/data/reference/human_g1k_v37_decoy.fasta +#reference=~/data/reference/human_g1k_v37_decoy.fasta #out=./all.sam #out=./sn.sam #out=./ssn-x1.sam -#out=./out.sam -out=/dev/null +out=~/data1/d2-45m.sam +#out=~/data/out1.sam +#out=/dev/null #out=./na12878.sam #time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ # /home/zzh/data/reference/human_g1k_v37_decoy.fasta \