From 9d45fd02fbddee8fd4eca1b3d06bc9c0c12262b1 Mon Sep 17 00:00:00 2001 From: zzh Date: Fri, 16 Feb 2024 00:18:14 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86bit=E8=BF=87?= =?UTF-8?q?=E6=BB=A4=EF=BC=8C=E8=A7=A3=E5=86=B3=E4=BA=86=E4=B8=80=E4=BA=9B?= =?UTF-8?q?bug=EF=BC=8C=E7=8E=B0=E5=9C=A8seed1=E5=92=8Cseed2=E9=83=BD?= =?UTF-8?q?=E6=B2=A1=E9=97=AE=E9=A2=98=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 4 +-- bwa.c | 11 +++++++- bwamem.c | 16 ++++++------ fmt_idx.c | 62 ++++++++++++++++++++++++++++++++------------- fmt_idx.h | 13 +++------- run.sh | 10 ++++++-- 6 files changed, 76 insertions(+), 40 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 39dbc60..54910f9 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -18,8 +18,8 @@ "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", "~/reference/human_g1k_v37_decoy.fasta", - "~/fastq/sn_r1.fq", - "~/fastq/sn_r2.fq", + "~/fastq/d_r1.fq", + "~/fastq/d_r2.fq", "-o", "/dev/null" ], diff --git a/bwa.c b/bwa.c index 3a37a32..cc16c59 100644 --- a/bwa.c +++ b/bwa.c @@ -293,12 +293,13 @@ bwt_t *bwa_idx_load_bwt(const char *hint) FMTIndex *bwa_idx_load_fmt(const char *hint) { - char *fmt_idx_fn, *kmer_idx_fn, *sa_fn; + char *fmt_idx_fn, *kmer_idx_fn, *kmer_bit_fn, *sa_fn; FMTIndex *fmt; char suffix[32]; int l_hint = strlen(hint); fmt_idx_fn = malloc(l_hint + 32); kmer_idx_fn = malloc(l_hint + 32); + kmer_bit_fn = malloc(l_hint + 32); sa_fn = malloc(l_hint + 32); sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL); strcpy(fmt_idx_fn, hint); @@ -317,6 +318,14 @@ FMTIndex *bwa_idx_load_fmt(const char *hint) fprintf(stderr, "%s\n", kmer_idx_fn); fmt->kmer_hash = fmt_restore_kmer_idx(kmer_idx_fn); + // restore kmer bit + fmt->bit_kmer_len = 18; + sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len); + strcpy(kmer_bit_fn, hint); + strcpy(kmer_bit_fn + l_hint, suffix); + fprintf(stderr, "%s\n", kmer_bit_fn); + fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len); + strcpy(sa_fn, hint); sprintf(suffix, ".33.%d.sa", SA_INTV); strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA) diff --git a/bwamem.c b/bwamem.c index a4c8944..5a9d606 100644 --- a/bwamem.c +++ b/bwamem.c @@ -173,11 +173,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); int slen = (uint32_t)p->info - (p->info >> 32); // seed length - // if (slen < 16) ++n16; - // if (slen < 17) ++n17; - // if (slen < 18) ++n18; - // if (slen < 19) ++n19; - // ++nall; + if (slen < 16) ++n16; + if (slen < 17) ++n17; + if (slen < 18) ++n18; + if (slen < 19) ++n19; + ++nall; max_seed_len = fmax(max_seed_len, slen); if (slen >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, *p); @@ -186,7 +186,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn } // second pass: find MEMs inside a long SMEM //if (max_seed_len == len) - // goto collect_intv_end; + //goto collect_intv_end; old_n = a->mem.n; for (k = 0; k < old_n; ++k) { @@ -196,7 +196,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - // bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); + //bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv); #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; @@ -217,7 +217,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn } } - // goto collect_intv_end; + goto collect_intv_end; // third pass: LAST-like if (opt->max_mem_intv > 0) { diff --git a/fmt_idx.c b/fmt_idx.c index 078bc44..406a904 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -155,6 +155,16 @@ KmerHash fmt_restore_kmer_idx(const char *fn) return khash; } +uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len) +{ + uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t)); + FILE *fp; + fp = xopen(fn, "rb"); + fread_fix(fp, (1L << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit); + err_fclose(fp); + return kbit; +} + // 读取sa数据 void fmt_restore_sa(const char *fn, FMTIndex *fmt) { @@ -587,36 +597,36 @@ static void fmt_reverse_intvs(bwtintv_v *p) } // 创建正向的kmer -inline static uint32_t build_forward_kmer(const uint8_t *q, int qlen, int *base_consumed) +inline static uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed) { - uint32_t qbit = 0; + uint64_t qbit = 0; int i; - qlen = qlen < HASH_KMER_LEN ? qlen : HASH_KMER_LEN; + qlen = qlen < kmer_len ? qlen : kmer_len; for (i = 0; i < qlen; ++i) { if (q[i] > 3) // 要考虑碱基是N break; - qbit |= q[i] << ((HASH_KMER_LEN - 1 - i) << 1); + qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1); } *base_consumed = i; return qbit; } // 创建f反向的kmer -inline static uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int *base_consumed) +inline static uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed) { - uint32_t qbit = 0; + uint64_t qbit = 0; int i, j, end_pos; - end_pos = start_pos - HASH_KMER_LEN; + end_pos = start_pos - kmer_len; end_pos = end_pos < 0 ? -1 : end_pos; for (i = start_pos, j = 0; i > end_pos; --i, ++j) { if (q[i] > 3) // 要考虑碱基是N break; - qbit |= q[i] << ((HASH_KMER_LEN - 1 - j) << 1); + qbit |= ((uint64_t)q[i]) << ((kmer_len - 1 - j) << 1); } *base_consumed = start_pos - i; - return (~qbit) & ((1 << (HASH_KMER_LEN << 1)) - 1); + return (~qbit) & ((1L << (kmer_len << 1)) - 1); } // 当x为0,或者q[x-1]为N时,只需要前向搜索即可 @@ -630,7 +640,7 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti int i, j = 1, ret, kmer_len; const int min_intv = 1; bwtintv_t ik, ok1, ok2; - uint32_t qbit = build_forward_kmer(&q[x], len - x, &kmer_len); + uint32_t qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); mem->n = 0; fmt_kmer_get(fmt, &ik, qbit, kmer_len - j++); @@ -645,8 +655,8 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti // 继续向前扩展 for (i = x + kmer_len; i + 1 < len; i += 2) { - //if (ik.x[2] < 2) - // goto fmt_smem_forward_end; + if (ik.x[2] < 2) + goto fmt_smem_forward_end; if (q[i] < 4 && q[i + 1] < 4) // 两个都可以扩展 { fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); @@ -684,7 +694,8 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti fmt_smem_forward_end: ret = ik.info; ik.info |= (uint64_t)x << 32; - kv_push(bwtintv_t, *mem, ik); + if (fmt->bit_kmer_len <= (uint32_t)ik.info - (ik.info >> 32)) + kv_push(bwtintv_t, *mem, ik); #ifdef SHOW_PERF #if 1 tmp_time = realtime_msec() - tmp_time; @@ -703,15 +714,16 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv bwtintv_v a[1], *curr; uint32_t qbit = 0; mem->n = 0; + int only_forward = x == 0 || q[x - 1] > 3; if (q[x] > 3) return x + 1; - // if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展 + //if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展 if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 kv_init(a[0]); curr = tmpvec && tmpvec[0] ? tmpvec[0] : &a[0]; // use the temporary vector if provided - qbit = build_forward_kmer(&q[x], len - x, &kmer_len); + qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); fmt_kmer_get(fmt, &ik, qbit, 0); // 初始碱基位置 ik.info = x + 1; @@ -775,10 +787,22 @@ backward_search: for (j = 0; j < curr->n; ++j) { bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 - if (p->info - x < HASH_KMER_LEN) { - uint32_t qbit = build_backward_kmer(q, p->info - 1, &kmer_len); // 创建反向kmer + uint64_t qbit = 0; + // if (p->info - x < fmt->bit_kmer_len) { + // qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer + // if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf)))) + // continue; + // } + if (!only_forward && p->info - x < HASH_KMER_LEN) { + //qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer + qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer + //if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf)))) + // continue; + //qbit >>= (fmt->bit_kmer_len - HASH_KMER_LEN) << 1; + //kmer_len = kmer_len > HASH_KMER_LEN ? HASH_KMER_LEN : kmer_len; i = 1; do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv); + if (i > 2) continue; p->x[0] = ik.x[1]; p->x[1] = ik.x[0]; p->x[2] = ik.x[2]; i = p->info - (kmer_len - i + 3); } else { @@ -798,7 +822,9 @@ backward_search: { fmt_extend1(fmt, p, &ok1, 1, q[i]); CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); - ok1.info = p->info; *p = ok1; + ok1.info = p->info; + CHECK_ADD_MEM(i, ok1, mem); + goto fmt_smem_end; } else { // 不能扩展 diff --git a/fmt_idx.h b/fmt_idx.h index 7459a88..cbf3b5b 100644 --- a/fmt_idx.h +++ b/fmt_idx.h @@ -77,14 +77,6 @@ typedef struct KmerEntry *ke14; } KmerHash; -// 用来检测15,16,17这些长度的序列是否在bwt索引里有匹配 -typedef struct -{ - uint8_t *kb15; - uint8_t *kb16; - uint8_t *kb17; -} KmerBit; - // fm-index, extend twice in one search step (one memory access) typedef struct { @@ -101,7 +93,8 @@ typedef struct uint8_t last_base; // dollar转换成的base // 保存kmer对应的fmt位置信息 KmerHash kmer_hash; - KmerBit kmer_bit; // 用来 + uint16_t *kmer_bit; // 用来检测特定长度序列有没有fm-index匹配 + int bit_kmer_len; // suffix array int sa_intv; bwtint_t n_sa; @@ -116,6 +109,8 @@ FMTIndex *fmt_restore_fmt(const char *fn); void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh); // 从文件中读取kmer hash信息 KmerHash fmt_restore_kmer_idx(const char *fn); +// 读取kmer bit数据 +uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len); // 读取sa数据 void fmt_restore_sa(const char *fn, FMTIndex *fmt); // 根据interval-bwt创建fmt-index diff --git a/run.sh b/run.sh index f61e300..59d46db 100755 --- a/run.sh +++ b/run.sh @@ -7,8 +7,14 @@ thread=1 n_r1=~/fastq/sn_r1.fq n_r2=~/fastq/sn_r2.fq #n_r1=~/fastq/tiny_n_r1.fq -#n_r2=~/fastq/tiny_n_r2.fq +#n_r2=~/fastq/tiny_n_r3.fq +#n_r1=~/fastq/diff_r1.fq +#n_r2=~/fastq/diff_r2.fq +#n_r1=~/fastq/d_r1.fq +#n_r2=~/fastq/d_r2.fq reference=~/reference/human_g1k_v37_decoy.fasta +out=./fmt_all.sam +#out=/dev/null #time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ # /home/zzh/data/reference/human_g1k_v37_decoy.fasta \ # /home/zzh/data/fastq/nm1.fq \ @@ -24,6 +30,6 @@ time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:n $reference \ $n_r1 \ $n_r2 \ - -o /dev/null + -o $out