From 2eaeb26858a6cf27447c29abf4a5d2d452c57b20 Mon Sep 17 00:00:00 2001 From: zzh Date: Thu, 14 Mar 2024 15:32:34 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86=E7=9B=B4=E6=8E=A5?= =?UTF-8?q?=E6=89=A9=E5=B1=95=E4=B8=A4=E4=B8=AA=E7=A2=B1=E5=9F=BA=E7=9A=84?= =?UTF-8?q?=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 4 +- bwamem.c | 4 ++ fastmap.c | 1 + fmt_idx.c | 119 +++++++++++++++++++++++++++++++++++++++----- run.sh | 32 ++++++++---- 5 files changed, 137 insertions(+), 23 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index abe1e3d..e19ece5 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -18,8 +18,8 @@ "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", "~/reference/human_g1k_v37_decoy.fasta", - "~/fastq/ssn_r1.fq", - "~/fastq/ssn_r2.fq", + "~/data/fastq/ds/d1_1.fq", + "~/data/fastq/ds/d1_2.fq", "-o", "/dev/null" ], diff --git a/bwamem.c b/bwamem.c index 621786d..b287346 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1407,6 +1407,10 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in tmp_time = realtime_msec(); #endif + s2n += 1; + if (max_seed_len == len - start_N_num) + s1n += 1; + #ifdef FILTER_FULL_MATCH if (max_seed_len == len - start_N_num) goto collect_intv_end; #endif diff --git a/fastmap.c b/fastmap.c index c4422a3..a14dbd7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -458,6 +458,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "time_seed_3: %f s\n", avg_seed_3 / 1000.0 / opt->n_threads); fprintf(stderr, "time_bwt_sa: %f s\n", avg_sa / 1000.0 / opt->n_threads); fprintf(stderr, "time_bsw: %f s\n", avg_bsw / 1000.0 / opt->n_threads); + fprintf(stderr, "full_match: %ld, all: %ld\n", s1n, s2n); fprintf(stderr, "\n"); fclose(gfp1); diff --git a/fmt_idx.c b/fmt_idx.c index 4bc5eb1..c63b85a 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -669,18 +669,117 @@ static inline void fmt_reverse_intvs(bwtintv_v *p) } } +int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem) +{ + int i, ret, kmer_len; + bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0}; + bwtintv_t mt = {0}; + uint32_t qbit = 0; + mem->n = 0; + if (q[x] > 3) return x + 1; + + if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 + + qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); + bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len-1); // 初始碱基位置 + ik.info = x + kmer_len; + +// check change of the interval size and whether the interval size is too small to be extended further +#define PUSH_VAL_AND_SKIP_FORWARD(iv) \ + do \ + { \ + kv_push(bwtintv_t, *mem, iv); \ + goto fmt_smem_forward_end; \ + } while (0) + + if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后 + PUSH_VAL_AND_SKIP_FORWARD(ik); + + // 扩展kmer之后的碱基 + // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); + // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); + for (i = (int)ik.info; i + 1 < len; i += 2) + { // forward search + if (q[i] < 4 && q[i + 1] < 4) + { + fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); +#if 1 + if (min_intv == 1 && ok2.x[2] == min_intv) + { + direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); + kv_push(bwtintv_t, *mem, mt); + ret = (uint32_t)mt.info; + goto fmt_smem_forward_end; + } +#endif + if (ok2.x[2] < min_intv) { + if (ok1.x[2] < min_intv) { + PUSH_VAL_AND_SKIP_FORWARD(ik); + } else { + ok1.info = i + 1; + PUSH_VAL_AND_SKIP_FORWARD(ok1); + } + } else { + ik = ok2; + ik.info = i + 2; + } + } + else if (q[i] < 4) // q[i+1] >= 4 + { + fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); + if (ok1.x[2] < min_intv) { + PUSH_VAL_AND_SKIP_FORWARD(ik); + } else { + ok1.info = i + 1; + PUSH_VAL_AND_SKIP_FORWARD(ok1); + } + } + else // q[i] >= 4 + { + PUSH_VAL_AND_SKIP_FORWARD(ik); + } + } + for (; i == len - 1; ++i) // 扩展到了最后一个碱基 + { + if (q[i] < 4) + { + fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); + if (ok1.x[2] < min_intv) { + PUSH_VAL_AND_SKIP_FORWARD(ik); + } else { + ok1.info = i + 1; + PUSH_VAL_AND_SKIP_FORWARD(ok1); + } + } + else + PUSH_VAL_AND_SKIP_FORWARD(ik); + } + +fmt_smem_forward_end: + if (mem->n == 0) + kv_push(bwtintv_t, *mem, ik); + ret = mem->a[0].info; + mem->a[0].info |= (uint64_t)(x) << 32; + if (mt.num_match == 0) + ret = (uint32_t)mem->a[0].info; // this will be the returned value,扩展到的最远的位置 + return ret; +} + // 找smem(seed) int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec) { + if (x == 0 || q[x - 1] > 3) + return fmt_smem_forward(fmt, len, q, x, min_intv, min_seed_len, mem); + int i, j, ret, kmer_len; bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0}; bwtintv_t mt = {0}; bwtintv_v *curr; uint32_t qbit = 0; mem->n = 0; - int only_forward = x == 0 || q[x - 1] > 3; - if (q[x] > 3) - return x + 1; + if (q[x] > 3) return x + 1; if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 @@ -708,11 +807,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv } while (0) // 处理kmer对应的匹配信息 - if (only_forward) - j = kmer_len - 1; - else - j = 1; - for (curr->n = 0; j < kmer_len; ++j) + for (curr->n = 0, j = 1; j < kmer_len; ++j) { bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j); CHECK_INTV_CHANGE(ik, ok1, x + j + 1); @@ -739,7 +834,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); kv_push(bwtintv_t, *mem, mt); ret = (uint32_t)mt.info; - if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end; goto backward_search; } @@ -793,9 +888,9 @@ backward_search: for (j = 0; j < curr->n; ++j) { bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 - // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); - // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); - if (!only_forward && p->info - x < HASH_KMER_LEN) + // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); + // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); + if (p->info - x < HASH_KMER_LEN) { if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4) qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer diff --git a/run.sh b/run.sh index 7b75b29..b7a899f 100755 --- a/run.sh +++ b/run.sh @@ -1,10 +1,24 @@ -thread=32 +thread=1 +#n_r1=~/data/fastq/zy/wgs/n1.fq +#n_r2=~/data/fastq/zy/wgs/n2.fq +#n_r1=~/data/fastq/ds/n1.fq +#n_r2=~/data/fastq/ds/n2.fq +#n_r1=~/data/fastq/platinum/n1.fq +#n_r2=~/data/fastq/platinum/n2.fq +#n_r1=~/data/fastq/zy/t1.fq +#n_r2=~/data/fastq/zy/t2.fq +n_r1=~/data/fastq/zy/n1.fq +n_r2=~/data/fastq/zy/n2.fq +#n_r1=~/data/fastq/ds/d1_1.fq +#n_r2=~/data/fastq/ds/d1_2.fq +#n_r1=~/data/fastq/ds/wes/n1.fq +#n_r2=~/data/fastq/ds/wes/n2.fq #n_r1=~/fastq/ZY2105177532213010_L4_1.fq #n_r2=~/fastq/ZY2105177532213010_L4_2.fq -#n_r1=~/fastq/na12878/nas_1.fq -#n_r2=~/fastq/na12878/nas_2.fq -n_r1=~/data/fastq/na12878/na_1.fq -n_r2=~/data/fastq/na12878/na_2.fq +#n_r1=~/data/fastq/na12878/nas_1.fq +#n_r2=~/data/fastq/na12878/nas_2.fq +#n_r1=~/data/fastq/na12878/na_1.fq +#n_r2=~/data/fastq/na12878/na_2.fq #n_r1=~/fastq/na12878/na12878_r1.fq #n_r2=~/fastq/na12878/na12878_r2.fq #n_r1=~/fastq/n_r1.fq @@ -14,10 +28,10 @@ n_r2=~/data/fastq/na12878/na_2.fq #n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq #n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq #reference=~/data/reference/human_g1k_v37_decoy.fasta -#n_r1=~/fastq/sn_r1.fq -#n_r2=~/fastq/sn_r2.fq -#n_r1=~/fastq/ssn_r1.fq -#n_r2=~/fastq/ssn_r2.fq +#n_r1=~/data/fastq/sn_r1.fq +#n_r2=~/data/fastq/sn_r2.fq +#n_r1=~/data/fastq/ssn_r1.fq +#n_r2=~/data/fastq/ssn_r2.fq #n_r1=~/fastq/ERR194147_1_120w.fastq #n_r2=~/fastq/ERR194147_2_120w.fastq #n_r1=~/fastq/tiny_n_r1.fq