添加了直接扩展两个碱基的函数
This commit is contained in:
parent
856a0e0c01
commit
2eaeb26858
|
|
@ -18,8 +18,8 @@
|
|||
"-R",
|
||||
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||
"~/reference/human_g1k_v37_decoy.fasta",
|
||||
"~/fastq/ssn_r1.fq",
|
||||
"~/fastq/ssn_r2.fq",
|
||||
"~/data/fastq/ds/d1_1.fq",
|
||||
"~/data/fastq/ds/d1_2.fq",
|
||||
"-o",
|
||||
"/dev/null"
|
||||
],
|
||||
|
|
|
|||
4
bwamem.c
4
bwamem.c
|
|
@ -1407,6 +1407,10 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
tmp_time = realtime_msec();
|
||||
#endif
|
||||
|
||||
s2n += 1;
|
||||
if (max_seed_len == len - start_N_num)
|
||||
s1n += 1;
|
||||
|
||||
#ifdef FILTER_FULL_MATCH
|
||||
if (max_seed_len == len - start_N_num) goto collect_intv_end;
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -458,6 +458,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, "time_seed_3: %f s\n", avg_seed_3 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bwt_sa: %f s\n", avg_sa / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bsw: %f s\n", avg_bsw / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "full_match: %ld, all: %ld\n", s1n, s2n);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
fclose(gfp1);
|
||||
|
|
|
|||
115
fmt_idx.c
115
fmt_idx.c
|
|
@ -669,18 +669,117 @@ static inline void fmt_reverse_intvs(bwtintv_v *p)
|
|||
}
|
||||
}
|
||||
|
||||
int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem)
|
||||
{
|
||||
int i, ret, kmer_len;
|
||||
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
|
||||
bwtintv_t mt = {0};
|
||||
uint32_t qbit = 0;
|
||||
mem->n = 0;
|
||||
if (q[x] > 3) return x + 1;
|
||||
|
||||
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
|
||||
|
||||
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len-1); // 初始碱基位置
|
||||
ik.info = x + kmer_len;
|
||||
|
||||
// check change of the interval size and whether the interval size is too small to be extended further
|
||||
#define PUSH_VAL_AND_SKIP_FORWARD(iv) \
|
||||
do \
|
||||
{ \
|
||||
kv_push(bwtintv_t, *mem, iv); \
|
||||
goto fmt_smem_forward_end; \
|
||||
} while (0)
|
||||
|
||||
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
|
||||
PUSH_VAL_AND_SKIP_FORWARD(ik);
|
||||
|
||||
// 扩展kmer之后的碱基
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||
{ // forward search
|
||||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
{
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
#if 1
|
||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||
{
|
||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
ret = (uint32_t)mt.info;
|
||||
goto fmt_smem_forward_end;
|
||||
}
|
||||
#endif
|
||||
if (ok2.x[2] < min_intv) {
|
||||
if (ok1.x[2] < min_intv) {
|
||||
PUSH_VAL_AND_SKIP_FORWARD(ik);
|
||||
} else {
|
||||
ok1.info = i + 1;
|
||||
PUSH_VAL_AND_SKIP_FORWARD(ok1);
|
||||
}
|
||||
} else {
|
||||
ik = ok2;
|
||||
ik.info = i + 2;
|
||||
}
|
||||
}
|
||||
else if (q[i] < 4) // q[i+1] >= 4
|
||||
{
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
if (ok1.x[2] < min_intv) {
|
||||
PUSH_VAL_AND_SKIP_FORWARD(ik);
|
||||
} else {
|
||||
ok1.info = i + 1;
|
||||
PUSH_VAL_AND_SKIP_FORWARD(ok1);
|
||||
}
|
||||
}
|
||||
else // q[i] >= 4
|
||||
{
|
||||
PUSH_VAL_AND_SKIP_FORWARD(ik);
|
||||
}
|
||||
}
|
||||
for (; i == len - 1; ++i) // 扩展到了最后一个碱基
|
||||
{
|
||||
if (q[i] < 4)
|
||||
{
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
if (ok1.x[2] < min_intv) {
|
||||
PUSH_VAL_AND_SKIP_FORWARD(ik);
|
||||
} else {
|
||||
ok1.info = i + 1;
|
||||
PUSH_VAL_AND_SKIP_FORWARD(ok1);
|
||||
}
|
||||
}
|
||||
else
|
||||
PUSH_VAL_AND_SKIP_FORWARD(ik);
|
||||
}
|
||||
|
||||
fmt_smem_forward_end:
|
||||
if (mem->n == 0)
|
||||
kv_push(bwtintv_t, *mem, ik);
|
||||
ret = mem->a[0].info;
|
||||
mem->a[0].info |= (uint64_t)(x) << 32;
|
||||
if (mt.num_match == 0)
|
||||
ret = (uint32_t)mem->a[0].info; // this will be the returned value,扩展到的最远的位置
|
||||
return ret;
|
||||
}
|
||||
|
||||
// 找smem(seed)
|
||||
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec)
|
||||
{
|
||||
if (x == 0 || q[x - 1] > 3)
|
||||
return fmt_smem_forward(fmt, len, q, x, min_intv, min_seed_len, mem);
|
||||
|
||||
int i, j, ret, kmer_len;
|
||||
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
|
||||
bwtintv_t mt = {0};
|
||||
bwtintv_v *curr;
|
||||
uint32_t qbit = 0;
|
||||
mem->n = 0;
|
||||
int only_forward = x == 0 || q[x - 1] > 3;
|
||||
if (q[x] > 3)
|
||||
return x + 1;
|
||||
if (q[x] > 3) return x + 1;
|
||||
|
||||
if (min_intv < 1)
|
||||
min_intv = 1; // the interval size should be at least 1
|
||||
|
|
@ -708,11 +807,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
} while (0)
|
||||
|
||||
// 处理kmer对应的匹配信息
|
||||
if (only_forward)
|
||||
j = kmer_len - 1;
|
||||
else
|
||||
j = 1;
|
||||
for (curr->n = 0; j < kmer_len; ++j)
|
||||
for (curr->n = 0, j = 1; j < kmer_len; ++j)
|
||||
{
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j);
|
||||
CHECK_INTV_CHANGE(ik, ok1, x + j + 1);
|
||||
|
|
@ -739,7 +834,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
ret = (uint32_t)mt.info;
|
||||
if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto fmt_smem_end;
|
||||
goto backward_search;
|
||||
}
|
||||
|
|
@ -795,7 +890,7 @@ backward_search:
|
|||
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
|
||||
if (!only_forward && p->info - x < HASH_KMER_LEN)
|
||||
if (p->info - x < HASH_KMER_LEN)
|
||||
{
|
||||
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
|
||||
qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer
|
||||
|
|
|
|||
32
run.sh
32
run.sh
|
|
@ -1,10 +1,24 @@
|
|||
thread=32
|
||||
thread=1
|
||||
#n_r1=~/data/fastq/zy/wgs/n1.fq
|
||||
#n_r2=~/data/fastq/zy/wgs/n2.fq
|
||||
#n_r1=~/data/fastq/ds/n1.fq
|
||||
#n_r2=~/data/fastq/ds/n2.fq
|
||||
#n_r1=~/data/fastq/platinum/n1.fq
|
||||
#n_r2=~/data/fastq/platinum/n2.fq
|
||||
#n_r1=~/data/fastq/zy/t1.fq
|
||||
#n_r2=~/data/fastq/zy/t2.fq
|
||||
n_r1=~/data/fastq/zy/n1.fq
|
||||
n_r2=~/data/fastq/zy/n2.fq
|
||||
#n_r1=~/data/fastq/ds/d1_1.fq
|
||||
#n_r2=~/data/fastq/ds/d1_2.fq
|
||||
#n_r1=~/data/fastq/ds/wes/n1.fq
|
||||
#n_r2=~/data/fastq/ds/wes/n2.fq
|
||||
#n_r1=~/fastq/ZY2105177532213010_L4_1.fq
|
||||
#n_r2=~/fastq/ZY2105177532213010_L4_2.fq
|
||||
#n_r1=~/fastq/na12878/nas_1.fq
|
||||
#n_r2=~/fastq/na12878/nas_2.fq
|
||||
n_r1=~/data/fastq/na12878/na_1.fq
|
||||
n_r2=~/data/fastq/na12878/na_2.fq
|
||||
#n_r1=~/data/fastq/na12878/nas_1.fq
|
||||
#n_r2=~/data/fastq/na12878/nas_2.fq
|
||||
#n_r1=~/data/fastq/na12878/na_1.fq
|
||||
#n_r2=~/data/fastq/na12878/na_2.fq
|
||||
#n_r1=~/fastq/na12878/na12878_r1.fq
|
||||
#n_r2=~/fastq/na12878/na12878_r2.fq
|
||||
#n_r1=~/fastq/n_r1.fq
|
||||
|
|
@ -14,10 +28,10 @@ n_r2=~/data/fastq/na12878/na_2.fq
|
|||
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
|
||||
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
|
||||
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||
#n_r1=~/fastq/sn_r1.fq
|
||||
#n_r2=~/fastq/sn_r2.fq
|
||||
#n_r1=~/fastq/ssn_r1.fq
|
||||
#n_r2=~/fastq/ssn_r2.fq
|
||||
#n_r1=~/data/fastq/sn_r1.fq
|
||||
#n_r2=~/data/fastq/sn_r2.fq
|
||||
#n_r1=~/data/fastq/ssn_r1.fq
|
||||
#n_r2=~/data/fastq/ssn_r2.fq
|
||||
#n_r1=~/fastq/ERR194147_1_120w.fastq
|
||||
#n_r2=~/fastq/ERR194147_2_120w.fastq
|
||||
#n_r1=~/fastq/tiny_n_r1.fq
|
||||
|
|
|
|||
Loading…
Reference in New Issue