添加了直接扩展两个碱基的函数

This commit is contained in:
zzh 2024-03-14 15:32:34 +08:00
parent 856a0e0c01
commit 2eaeb26858
5 changed files with 137 additions and 23 deletions

4
.vscode/launch.json vendored
View File

@ -18,8 +18,8 @@
"-R", "-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/reference/human_g1k_v37_decoy.fasta", "~/reference/human_g1k_v37_decoy.fasta",
"~/fastq/ssn_r1.fq", "~/data/fastq/ds/d1_1.fq",
"~/fastq/ssn_r2.fq", "~/data/fastq/ds/d1_2.fq",
"-o", "-o",
"/dev/null" "/dev/null"
], ],

View File

@ -1407,6 +1407,10 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
tmp_time = realtime_msec(); tmp_time = realtime_msec();
#endif #endif
s2n += 1;
if (max_seed_len == len - start_N_num)
s1n += 1;
#ifdef FILTER_FULL_MATCH #ifdef FILTER_FULL_MATCH
if (max_seed_len == len - start_N_num) goto collect_intv_end; if (max_seed_len == len - start_N_num) goto collect_intv_end;
#endif #endif

View File

@ -458,6 +458,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, "time_seed_3: %f s\n", avg_seed_3 / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_3: %f s\n", avg_seed_3 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_sa: %f s\n", avg_sa / 1000.0 / opt->n_threads); fprintf(stderr, "time_bwt_sa: %f s\n", avg_sa / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bsw: %f s\n", avg_bsw / 1000.0 / opt->n_threads); fprintf(stderr, "time_bsw: %f s\n", avg_bsw / 1000.0 / opt->n_threads);
fprintf(stderr, "full_match: %ld, all: %ld\n", s1n, s2n);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fclose(gfp1); fclose(gfp1);

119
fmt_idx.c
View File

@ -669,18 +669,117 @@ static inline void fmt_reverse_intvs(bwtintv_v *p)
} }
} }
int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem)
{
int i, ret, kmer_len;
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
bwtintv_t mt = {0};
uint32_t qbit = 0;
mem->n = 0;
if (q[x] > 3) return x + 1;
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len-1); // 初始碱基位置
ik.info = x + kmer_len;
// check change of the interval size and whether the interval size is too small to be extended further
#define PUSH_VAL_AND_SKIP_FORWARD(iv) \
do \
{ \
kv_push(bwtintv_t, *mem, iv); \
goto fmt_smem_forward_end; \
} while (0)
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
PUSH_VAL_AND_SKIP_FORWARD(ik);
// 扩展kmer之后的碱基
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
for (i = (int)ik.info; i + 1 < len; i += 2)
{ // forward search
if (q[i] < 4 && q[i + 1] < 4)
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
#if 1
if (min_intv == 1 && ok2.x[2] == min_intv)
{
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
goto fmt_smem_forward_end;
}
#endif
if (ok2.x[2] < min_intv) {
if (ok1.x[2] < min_intv) {
PUSH_VAL_AND_SKIP_FORWARD(ik);
} else {
ok1.info = i + 1;
PUSH_VAL_AND_SKIP_FORWARD(ok1);
}
} else {
ik = ok2;
ik.info = i + 2;
}
}
else if (q[i] < 4) // q[i+1] >= 4
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
if (ok1.x[2] < min_intv) {
PUSH_VAL_AND_SKIP_FORWARD(ik);
} else {
ok1.info = i + 1;
PUSH_VAL_AND_SKIP_FORWARD(ok1);
}
}
else // q[i] >= 4
{
PUSH_VAL_AND_SKIP_FORWARD(ik);
}
}
for (; i == len - 1; ++i) // 扩展到了最后一个碱基
{
if (q[i] < 4)
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
if (ok1.x[2] < min_intv) {
PUSH_VAL_AND_SKIP_FORWARD(ik);
} else {
ok1.info = i + 1;
PUSH_VAL_AND_SKIP_FORWARD(ok1);
}
}
else
PUSH_VAL_AND_SKIP_FORWARD(ik);
}
fmt_smem_forward_end:
if (mem->n == 0)
kv_push(bwtintv_t, *mem, ik);
ret = mem->a[0].info;
mem->a[0].info |= (uint64_t)(x) << 32;
if (mt.num_match == 0)
ret = (uint32_t)mem->a[0].info; // this will be the returned value扩展到的最远的位置
return ret;
}
// 找smemseed // 找smemseed
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec) int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec)
{ {
if (x == 0 || q[x - 1] > 3)
return fmt_smem_forward(fmt, len, q, x, min_intv, min_seed_len, mem);
int i, j, ret, kmer_len; int i, j, ret, kmer_len;
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0}; bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
bwtintv_t mt = {0}; bwtintv_t mt = {0};
bwtintv_v *curr; bwtintv_v *curr;
uint32_t qbit = 0; uint32_t qbit = 0;
mem->n = 0; mem->n = 0;
int only_forward = x == 0 || q[x - 1] > 3; if (q[x] > 3) return x + 1;
if (q[x] > 3)
return x + 1;
if (min_intv < 1) if (min_intv < 1)
min_intv = 1; // the interval size should be at least 1 min_intv = 1; // the interval size should be at least 1
@ -708,11 +807,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
} while (0) } while (0)
// 处理kmer对应的匹配信息 // 处理kmer对应的匹配信息
if (only_forward) for (curr->n = 0, j = 1; j < kmer_len; ++j)
j = kmer_len - 1;
else
j = 1;
for (curr->n = 0; j < kmer_len; ++j)
{ {
bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j); bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j);
CHECK_INTV_CHANGE(ik, ok1, x + j + 1); CHECK_INTV_CHANGE(ik, ok1, x + j + 1);
@ -739,7 +834,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
kv_push(bwtintv_t, *mem, mt); kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info; ret = (uint32_t)mt.info;
if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end; goto fmt_smem_end;
goto backward_search; goto backward_search;
} }
@ -793,9 +888,9 @@ backward_search:
for (j = 0; j < curr->n; ++j) for (j = 0; j < curr->n; ++j)
{ {
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
if (!only_forward && p->info - x < HASH_KMER_LEN) if (p->info - x < HASH_KMER_LEN)
{ {
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4) if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer

32
run.sh
View File

@ -1,10 +1,24 @@
thread=32 thread=1
#n_r1=~/data/fastq/zy/wgs/n1.fq
#n_r2=~/data/fastq/zy/wgs/n2.fq
#n_r1=~/data/fastq/ds/n1.fq
#n_r2=~/data/fastq/ds/n2.fq
#n_r1=~/data/fastq/platinum/n1.fq
#n_r2=~/data/fastq/platinum/n2.fq
#n_r1=~/data/fastq/zy/t1.fq
#n_r2=~/data/fastq/zy/t2.fq
n_r1=~/data/fastq/zy/n1.fq
n_r2=~/data/fastq/zy/n2.fq
#n_r1=~/data/fastq/ds/d1_1.fq
#n_r2=~/data/fastq/ds/d1_2.fq
#n_r1=~/data/fastq/ds/wes/n1.fq
#n_r2=~/data/fastq/ds/wes/n2.fq
#n_r1=~/fastq/ZY2105177532213010_L4_1.fq #n_r1=~/fastq/ZY2105177532213010_L4_1.fq
#n_r2=~/fastq/ZY2105177532213010_L4_2.fq #n_r2=~/fastq/ZY2105177532213010_L4_2.fq
#n_r1=~/fastq/na12878/nas_1.fq #n_r1=~/data/fastq/na12878/nas_1.fq
#n_r2=~/fastq/na12878/nas_2.fq #n_r2=~/data/fastq/na12878/nas_2.fq
n_r1=~/data/fastq/na12878/na_1.fq #n_r1=~/data/fastq/na12878/na_1.fq
n_r2=~/data/fastq/na12878/na_2.fq #n_r2=~/data/fastq/na12878/na_2.fq
#n_r1=~/fastq/na12878/na12878_r1.fq #n_r1=~/fastq/na12878/na12878_r1.fq
#n_r2=~/fastq/na12878/na12878_r2.fq #n_r2=~/fastq/na12878/na12878_r2.fq
#n_r1=~/fastq/n_r1.fq #n_r1=~/fastq/n_r1.fq
@ -14,10 +28,10 @@ n_r2=~/data/fastq/na12878/na_2.fq
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq #n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq #n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
#reference=~/data/reference/human_g1k_v37_decoy.fasta #reference=~/data/reference/human_g1k_v37_decoy.fasta
#n_r1=~/fastq/sn_r1.fq #n_r1=~/data/fastq/sn_r1.fq
#n_r2=~/fastq/sn_r2.fq #n_r2=~/data/fastq/sn_r2.fq
#n_r1=~/fastq/ssn_r1.fq #n_r1=~/data/fastq/ssn_r1.fq
#n_r2=~/fastq/ssn_r2.fq #n_r2=~/data/fastq/ssn_r2.fq
#n_r1=~/fastq/ERR194147_1_120w.fastq #n_r1=~/fastq/ERR194147_1_120w.fastq
#n_r2=~/fastq/ERR194147_2_120w.fastq #n_r2=~/fastq/ERR194147_2_120w.fastq
#n_r1=~/fastq/tiny_n_r1.fq #n_r1=~/fastq/tiny_n_r1.fq