添加了bit过滤,解决了一些bug,现在seed1和seed2都没问题了

This commit is contained in:
zzh 2024-02-16 00:18:14 +08:00
parent 5d8ace386f
commit 9d45fd02fb
6 changed files with 76 additions and 40 deletions

4
.vscode/launch.json vendored
View File

@ -18,8 +18,8 @@
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/reference/human_g1k_v37_decoy.fasta",
"~/fastq/sn_r1.fq",
"~/fastq/sn_r2.fq",
"~/fastq/d_r1.fq",
"~/fastq/d_r2.fq",
"-o",
"/dev/null"
],

11
bwa.c
View File

@ -293,12 +293,13 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
FMTIndex *bwa_idx_load_fmt(const char *hint)
{
char *fmt_idx_fn, *kmer_idx_fn, *sa_fn;
char *fmt_idx_fn, *kmer_idx_fn, *kmer_bit_fn, *sa_fn;
FMTIndex *fmt;
char suffix[32];
int l_hint = strlen(hint);
fmt_idx_fn = malloc(l_hint + 32);
kmer_idx_fn = malloc(l_hint + 32);
kmer_bit_fn = malloc(l_hint + 32);
sa_fn = malloc(l_hint + 32);
sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL);
strcpy(fmt_idx_fn, hint);
@ -317,6 +318,14 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
fprintf(stderr, "%s\n", kmer_idx_fn);
fmt->kmer_hash = fmt_restore_kmer_idx(kmer_idx_fn);
// restore kmer bit
fmt->bit_kmer_len = 18;
sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len);
strcpy(kmer_bit_fn, hint);
strcpy(kmer_bit_fn + l_hint, suffix);
fprintf(stderr, "%s\n", kmer_bit_fn);
fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len);
strcpy(sa_fn, hint);
sprintf(suffix, ".33.%d.sa", SA_INTV);
strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA)

View File

@ -173,11 +173,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
// if (slen < 16) ++n16;
// if (slen < 17) ++n17;
// if (slen < 18) ++n18;
// if (slen < 19) ++n19;
// ++nall;
if (slen < 16) ++n16;
if (slen < 17) ++n17;
if (slen < 18) ++n18;
if (slen < 19) ++n19;
++nall;
max_seed_len = fmax(max_seed_len, slen);
if (slen >= opt->min_seed_len)
kv_push(bwtintv_t, a->mem, *p);
@ -186,7 +186,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
}
// second pass: find MEMs inside a long SMEM
//if (max_seed_len == len)
// goto collect_intv_end;
//goto collect_intv_end;
old_n = a->mem.n;
for (k = 0; k < old_n; ++k) {
@ -196,7 +196,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
#ifdef SHOW_PERF
int64_t tmp_time = realtime_msec();
#endif
// bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
//bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
@ -217,7 +217,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
}
}
// goto collect_intv_end;
goto collect_intv_end;
// third pass: LAST-like
if (opt->max_mem_intv > 0) {

View File

@ -155,6 +155,16 @@ KmerHash fmt_restore_kmer_idx(const char *fn)
return khash;
}
uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len)
{
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
FILE *fp;
fp = xopen(fn, "rb");
fread_fix(fp, (1L << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit);
err_fclose(fp);
return kbit;
}
// 读取sa数据
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
{
@ -587,36 +597,36 @@ static void fmt_reverse_intvs(bwtintv_v *p)
}
// 创建正向的kmer
inline static uint32_t build_forward_kmer(const uint8_t *q, int qlen, int *base_consumed)
inline static uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
{
uint32_t qbit = 0;
uint64_t qbit = 0;
int i;
qlen = qlen < HASH_KMER_LEN ? qlen : HASH_KMER_LEN;
qlen = qlen < kmer_len ? qlen : kmer_len;
for (i = 0; i < qlen; ++i)
{
if (q[i] > 3) // 要考虑碱基是N
break;
qbit |= q[i] << ((HASH_KMER_LEN - 1 - i) << 1);
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
}
*base_consumed = i;
return qbit;
}
// 创建f反向的kmer
inline static uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int *base_consumed)
inline static uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
{
uint32_t qbit = 0;
uint64_t qbit = 0;
int i, j, end_pos;
end_pos = start_pos - HASH_KMER_LEN;
end_pos = start_pos - kmer_len;
end_pos = end_pos < 0 ? -1 : end_pos;
for (i = start_pos, j = 0; i > end_pos; --i, ++j)
{
if (q[i] > 3) // 要考虑碱基是N
break;
qbit |= q[i] << ((HASH_KMER_LEN - 1 - j) << 1);
qbit |= ((uint64_t)q[i]) << ((kmer_len - 1 - j) << 1);
}
*base_consumed = start_pos - i;
return (~qbit) & ((1 << (HASH_KMER_LEN << 1)) - 1);
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
}
// 当x为0或者q[x-1]为N时只需要前向搜索即可
@ -630,7 +640,7 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti
int i, j = 1, ret, kmer_len;
const int min_intv = 1;
bwtintv_t ik, ok1, ok2;
uint32_t qbit = build_forward_kmer(&q[x], len - x, &kmer_len);
uint32_t qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
mem->n = 0;
fmt_kmer_get(fmt, &ik, qbit, kmer_len - j++);
@ -645,8 +655,8 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti
// 继续向前扩展
for (i = x + kmer_len; i + 1 < len; i += 2) {
//if (ik.x[2] < 2)
// goto fmt_smem_forward_end;
if (ik.x[2] < 2)
goto fmt_smem_forward_end;
if (q[i] < 4 && q[i + 1] < 4) // 两个都可以扩展
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
@ -684,7 +694,8 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti
fmt_smem_forward_end:
ret = ik.info;
ik.info |= (uint64_t)x << 32;
kv_push(bwtintv_t, *mem, ik);
if (fmt->bit_kmer_len <= (uint32_t)ik.info - (ik.info >> 32))
kv_push(bwtintv_t, *mem, ik);
#ifdef SHOW_PERF
#if 1
tmp_time = realtime_msec() - tmp_time;
@ -703,15 +714,16 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
bwtintv_v a[1], *curr;
uint32_t qbit = 0;
mem->n = 0;
int only_forward = x == 0 || q[x - 1] > 3;
if (q[x] > 3) return x + 1;
// if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展
//if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
kv_init(a[0]);
curr = tmpvec && tmpvec[0] ? tmpvec[0] : &a[0]; // use the temporary vector if provided
qbit = build_forward_kmer(&q[x], len - x, &kmer_len);
qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
fmt_kmer_get(fmt, &ik, qbit, 0); // 初始碱基位置
ik.info = x + 1;
@ -775,10 +787,22 @@ backward_search:
for (j = 0; j < curr->n; ++j)
{
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
if (p->info - x < HASH_KMER_LEN) {
uint32_t qbit = build_backward_kmer(q, p->info - 1, &kmer_len); // 创建反向kmer
uint64_t qbit = 0;
// if (p->info - x < fmt->bit_kmer_len) {
// qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer
// if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf))))
// continue;
// }
if (!only_forward && p->info - x < HASH_KMER_LEN) {
//qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
//if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf))))
// continue;
//qbit >>= (fmt->bit_kmer_len - HASH_KMER_LEN) << 1;
//kmer_len = kmer_len > HASH_KMER_LEN ? HASH_KMER_LEN : kmer_len;
i = 1;
do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv);
if (i > 2) continue;
p->x[0] = ik.x[1]; p->x[1] = ik.x[0]; p->x[2] = ik.x[2];
i = p->info - (kmer_len - i + 3);
} else {
@ -798,7 +822,9 @@ backward_search:
{
fmt_extend1(fmt, p, &ok1, 1, q[i]);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info; *p = ok1;
ok1.info = p->info;
CHECK_ADD_MEM(i, ok1, mem);
goto fmt_smem_end;
}
else
{ // 不能扩展

View File

@ -77,14 +77,6 @@ typedef struct
KmerEntry *ke14;
} KmerHash;
// 用来检测15,16,17这些长度的序列是否在bwt索引里有匹配
typedef struct
{
uint8_t *kb15;
uint8_t *kb16;
uint8_t *kb17;
} KmerBit;
// fm-index, extend twice in one search step (one memory access)
typedef struct
{
@ -101,7 +93,8 @@ typedef struct
uint8_t last_base; // dollar转换成的base
// 保存kmer对应的fmt位置信息
KmerHash kmer_hash;
KmerBit kmer_bit; // 用来
uint16_t *kmer_bit; // 用来检测特定长度序列有没有fm-index匹配
int bit_kmer_len;
// suffix array
int sa_intv;
bwtint_t n_sa;
@ -116,6 +109,8 @@ FMTIndex *fmt_restore_fmt(const char *fn);
void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh);
// 从文件中读取kmer hash信息
KmerHash fmt_restore_kmer_idx(const char *fn);
// 读取kmer bit数据
uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len);
// 读取sa数据
void fmt_restore_sa(const char *fn, FMTIndex *fmt);
// 根据interval-bwt创建fmt-index

10
run.sh
View File

@ -7,8 +7,14 @@ thread=1
n_r1=~/fastq/sn_r1.fq
n_r2=~/fastq/sn_r2.fq
#n_r1=~/fastq/tiny_n_r1.fq
#n_r2=~/fastq/tiny_n_r2.fq
#n_r2=~/fastq/tiny_n_r3.fq
#n_r1=~/fastq/diff_r1.fq
#n_r2=~/fastq/diff_r2.fq
#n_r1=~/fastq/d_r1.fq
#n_r2=~/fastq/d_r2.fq
reference=~/reference/human_g1k_v37_decoy.fasta
out=./fmt_all.sam
#out=/dev/null
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
# /home/zzh/data/fastq/nm1.fq \
@ -24,6 +30,6 @@ time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:n
$reference \
$n_r1 \
$n_r2 \
-o /dev/null
-o $out