添加了直接扩展两个碱基的函数

This commit is contained in:
zzh 2024-03-14 15:32:34 +08:00
parent 856a0e0c01
commit 2eaeb26858
5 changed files with 137 additions and 23 deletions

4
.vscode/launch.json vendored
View File

@ -18,8 +18,8 @@
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/reference/human_g1k_v37_decoy.fasta",
"~/fastq/ssn_r1.fq",
"~/fastq/ssn_r2.fq",
"~/data/fastq/ds/d1_1.fq",
"~/data/fastq/ds/d1_2.fq",
"-o",
"/dev/null"
],

View File

@ -1407,6 +1407,10 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
tmp_time = realtime_msec();
#endif
s2n += 1;
if (max_seed_len == len - start_N_num)
s1n += 1;
#ifdef FILTER_FULL_MATCH
if (max_seed_len == len - start_N_num) goto collect_intv_end;
#endif

View File

@ -458,6 +458,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, "time_seed_3: %f s\n", avg_seed_3 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_sa: %f s\n", avg_sa / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bsw: %f s\n", avg_bsw / 1000.0 / opt->n_threads);
fprintf(stderr, "full_match: %ld, all: %ld\n", s1n, s2n);
fprintf(stderr, "\n");
fclose(gfp1);

115
fmt_idx.c
View File

@ -669,18 +669,117 @@ static inline void fmt_reverse_intvs(bwtintv_v *p)
}
}
int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem)
{
int i, ret, kmer_len;
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
bwtintv_t mt = {0};
uint32_t qbit = 0;
mem->n = 0;
if (q[x] > 3) return x + 1;
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len-1); // 初始碱基位置
ik.info = x + kmer_len;
// check change of the interval size and whether the interval size is too small to be extended further
#define PUSH_VAL_AND_SKIP_FORWARD(iv) \
do \
{ \
kv_push(bwtintv_t, *mem, iv); \
goto fmt_smem_forward_end; \
} while (0)
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
PUSH_VAL_AND_SKIP_FORWARD(ik);
// 扩展kmer之后的碱基
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
for (i = (int)ik.info; i + 1 < len; i += 2)
{ // forward search
if (q[i] < 4 && q[i + 1] < 4)
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
#if 1
if (min_intv == 1 && ok2.x[2] == min_intv)
{
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
goto fmt_smem_forward_end;
}
#endif
if (ok2.x[2] < min_intv) {
if (ok1.x[2] < min_intv) {
PUSH_VAL_AND_SKIP_FORWARD(ik);
} else {
ok1.info = i + 1;
PUSH_VAL_AND_SKIP_FORWARD(ok1);
}
} else {
ik = ok2;
ik.info = i + 2;
}
}
else if (q[i] < 4) // q[i+1] >= 4
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
if (ok1.x[2] < min_intv) {
PUSH_VAL_AND_SKIP_FORWARD(ik);
} else {
ok1.info = i + 1;
PUSH_VAL_AND_SKIP_FORWARD(ok1);
}
}
else // q[i] >= 4
{
PUSH_VAL_AND_SKIP_FORWARD(ik);
}
}
for (; i == len - 1; ++i) // 扩展到了最后一个碱基
{
if (q[i] < 4)
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
if (ok1.x[2] < min_intv) {
PUSH_VAL_AND_SKIP_FORWARD(ik);
} else {
ok1.info = i + 1;
PUSH_VAL_AND_SKIP_FORWARD(ok1);
}
}
else
PUSH_VAL_AND_SKIP_FORWARD(ik);
}
fmt_smem_forward_end:
if (mem->n == 0)
kv_push(bwtintv_t, *mem, ik);
ret = mem->a[0].info;
mem->a[0].info |= (uint64_t)(x) << 32;
if (mt.num_match == 0)
ret = (uint32_t)mem->a[0].info; // this will be the returned value扩展到的最远的位置
return ret;
}
// 找smemseed
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec)
{
if (x == 0 || q[x - 1] > 3)
return fmt_smem_forward(fmt, len, q, x, min_intv, min_seed_len, mem);
int i, j, ret, kmer_len;
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
bwtintv_t mt = {0};
bwtintv_v *curr;
uint32_t qbit = 0;
mem->n = 0;
int only_forward = x == 0 || q[x - 1] > 3;
if (q[x] > 3)
return x + 1;
if (q[x] > 3) return x + 1;
if (min_intv < 1)
min_intv = 1; // the interval size should be at least 1
@ -708,11 +807,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
} while (0)
// 处理kmer对应的匹配信息
if (only_forward)
j = kmer_len - 1;
else
j = 1;
for (curr->n = 0; j < kmer_len; ++j)
for (curr->n = 0, j = 1; j < kmer_len; ++j)
{
bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j);
CHECK_INTV_CHANGE(ik, ok1, x + j + 1);
@ -739,7 +834,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end;
goto backward_search;
}
@ -795,7 +890,7 @@ backward_search:
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
if (!only_forward && p->info - x < HASH_KMER_LEN)
if (p->info - x < HASH_KMER_LEN)
{
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer

32
run.sh
View File

@ -1,10 +1,24 @@
thread=32
thread=1
#n_r1=~/data/fastq/zy/wgs/n1.fq
#n_r2=~/data/fastq/zy/wgs/n2.fq
#n_r1=~/data/fastq/ds/n1.fq
#n_r2=~/data/fastq/ds/n2.fq
#n_r1=~/data/fastq/platinum/n1.fq
#n_r2=~/data/fastq/platinum/n2.fq
#n_r1=~/data/fastq/zy/t1.fq
#n_r2=~/data/fastq/zy/t2.fq
n_r1=~/data/fastq/zy/n1.fq
n_r2=~/data/fastq/zy/n2.fq
#n_r1=~/data/fastq/ds/d1_1.fq
#n_r2=~/data/fastq/ds/d1_2.fq
#n_r1=~/data/fastq/ds/wes/n1.fq
#n_r2=~/data/fastq/ds/wes/n2.fq
#n_r1=~/fastq/ZY2105177532213010_L4_1.fq
#n_r2=~/fastq/ZY2105177532213010_L4_2.fq
#n_r1=~/fastq/na12878/nas_1.fq
#n_r2=~/fastq/na12878/nas_2.fq
n_r1=~/data/fastq/na12878/na_1.fq
n_r2=~/data/fastq/na12878/na_2.fq
#n_r1=~/data/fastq/na12878/nas_1.fq
#n_r2=~/data/fastq/na12878/nas_2.fq
#n_r1=~/data/fastq/na12878/na_1.fq
#n_r2=~/data/fastq/na12878/na_2.fq
#n_r1=~/fastq/na12878/na12878_r1.fq
#n_r2=~/fastq/na12878/na12878_r2.fq
#n_r1=~/fastq/n_r1.fq
@ -14,10 +28,10 @@ n_r2=~/data/fastq/na12878/na_2.fq
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
#reference=~/data/reference/human_g1k_v37_decoy.fasta
#n_r1=~/fastq/sn_r1.fq
#n_r2=~/fastq/sn_r2.fq
#n_r1=~/fastq/ssn_r1.fq
#n_r2=~/fastq/ssn_r2.fq
#n_r1=~/data/fastq/sn_r1.fq
#n_r2=~/data/fastq/sn_r2.fq
#n_r1=~/data/fastq/ssn_r1.fq
#n_r2=~/data/fastq/ssn_r2.fq
#n_r1=~/fastq/ERR194147_1_120w.fastq
#n_r2=~/fastq/ERR194147_2_120w.fastq
#n_r1=~/fastq/tiny_n_r1.fq