seed三个步骤,全部用fmt实现,而且结果一致
This commit is contained in:
parent
9d45fd02fb
commit
7dceae8c5d
60
bwamem.c
60
bwamem.c
|
|
@ -144,6 +144,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
||||||
int start_width = 1;
|
int start_width = 1;
|
||||||
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
|
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
|
||||||
int max_seed_len = 0;
|
int max_seed_len = 0;
|
||||||
|
int start_N_num = 0, start_flag = 1;
|
||||||
a->mem.n = 0;
|
a->mem.n = 0;
|
||||||
// char *BASE = "ACGT";
|
// char *BASE = "ACGT";
|
||||||
// for (i = 0; i < len; ++i)
|
// for (i = 0; i < len; ++i)
|
||||||
|
|
@ -157,8 +158,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
||||||
//fprintf(fp1, "seq: %ld\n", dn++);
|
//fprintf(fp1, "seq: %ld\n", dn++);
|
||||||
//fprintf(stderr, "seq: %ld\n", dn++);
|
//fprintf(stderr, "seq: %ld\n", dn++);
|
||||||
//dn ++;
|
//dn ++;
|
||||||
|
//goto third_seed;
|
||||||
|
|
||||||
while (x < len) {
|
while (x < len) {
|
||||||
if (seq[x] < 4) {
|
if (seq[x] < 4) {
|
||||||
|
start_flag = 0;
|
||||||
#ifdef SHOW_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_time = realtime_msec();
|
int64_t tmp_time = realtime_msec();
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -172,22 +176,27 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
||||||
bwtintv_t *p = &a->mem1.a[i];
|
bwtintv_t *p = &a->mem1.a[i];
|
||||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||||
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
//int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||||
if (slen < 16) ++n16;
|
//if (slen < 16) ++n16;
|
||||||
if (slen < 17) ++n17;
|
//if (slen < 17) ++n17;
|
||||||
if (slen < 18) ++n18;
|
//if (slen < 18) ++n18;
|
||||||
if (slen < 19) ++n19;
|
//if (slen < 19) ++n19;
|
||||||
++nall;
|
//++nall;
|
||||||
max_seed_len = fmax(max_seed_len, slen);
|
max_seed_len = fmax(max_seed_len, slen);
|
||||||
if (slen >= opt->min_seed_len)
|
if (slen >= opt->min_seed_len)
|
||||||
kv_push(bwtintv_t, a->mem, *p);
|
kv_push(bwtintv_t, a->mem, *p);
|
||||||
}
|
}
|
||||||
} else ++x;
|
} else {
|
||||||
|
++x;
|
||||||
|
if (start_flag)
|
||||||
|
++start_N_num;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// second pass: find MEMs inside a long SMEM
|
// second pass: find MEMs inside a long SMEM
|
||||||
//if (max_seed_len == len)
|
//if (max_seed_len == len - start_N_num)
|
||||||
//goto collect_intv_end;
|
// goto collect_intv_end;
|
||||||
|
// goto third_seed;
|
||||||
|
|
||||||
old_n = a->mem.n;
|
old_n = a->mem.n;
|
||||||
for (k = 0; k < old_n; ++k) {
|
for (k = 0; k < old_n; ++k) {
|
||||||
bwtintv_t *p = &a->mem.a[k];
|
bwtintv_t *p = &a->mem.a[k];
|
||||||
|
|
@ -206,19 +215,20 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
||||||
bwtintv_t *p = &a->mem1.a[i];
|
bwtintv_t *p = &a->mem1.a[i];
|
||||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||||
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||||
int slen = (uint32_t)p->info - (p->info >> 32);
|
//int slen = (uint32_t)p->info - (p->info >> 32);
|
||||||
if (slen < 16) ++n16;
|
//if (slen < 16) ++n16;
|
||||||
if (slen < 17) ++n17;
|
//if (slen < 17) ++n17;
|
||||||
if (slen < 18) ++n18;
|
//if (slen < 18) ++n18;
|
||||||
if (slen < 19) ++n19;
|
//if (slen < 19) ++n19;
|
||||||
++nall;
|
//++nall;
|
||||||
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info >> 32) >= opt->min_seed_len)
|
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info >> 32) >= opt->min_seed_len)
|
||||||
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
//if (max_seed_len == len - start_N_num)
|
||||||
goto collect_intv_end;
|
// goto collect_intv_end;
|
||||||
|
//goto collect_intv_end;
|
||||||
|
third_seed:
|
||||||
// third pass: LAST-like
|
// third pass: LAST-like
|
||||||
if (opt->max_mem_intv > 0) {
|
if (opt->max_mem_intv > 0) {
|
||||||
x = 0;
|
x = 0;
|
||||||
|
|
@ -229,12 +239,20 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
||||||
#ifdef SHOW_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_time = realtime_msec();
|
int64_t tmp_time = realtime_msec();
|
||||||
#endif
|
#endif
|
||||||
x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
//x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||||||
|
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||||||
#ifdef SHOW_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_time = realtime_msec() - tmp_time;
|
tmp_time = realtime_msec() - tmp_time;
|
||||||
__sync_fetch_and_add(&time_bwt_smem1a, tmp_time);
|
__sync_fetch_and_add(&time_bwt_smem1a, tmp_time);
|
||||||
#endif
|
#endif
|
||||||
if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
|
//bwtintv_t *p = &m;
|
||||||
|
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||||
|
|
||||||
|
if (m.x[2] > 0) {
|
||||||
|
kv_push(bwtintv_t, a->mem, m);
|
||||||
|
//bwtintv_t *p = &m;
|
||||||
|
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||||
|
}
|
||||||
} else { // for now, we never come to this block which is slower
|
} else { // for now, we never come to this block which is slower
|
||||||
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
|
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
|
||||||
for (i = 0; i < a->mem1.n; ++i)
|
for (i = 0; i < a->mem1.n; ++i)
|
||||||
|
|
|
||||||
51
fmt_idx.c
51
fmt_idx.c
|
|
@ -854,4 +854,55 @@ fmt_smem_end:
|
||||||
if (tmpvec == 0 || tmpvec[0] == 0)
|
if (tmpvec == 0 || tmpvec[0] == 0)
|
||||||
free(a[0].a);
|
free(a[0].a);
|
||||||
return ret;
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
|
||||||
|
{
|
||||||
|
int i, kmer_len;
|
||||||
|
bwtintv_t ik, ok1, ok2;
|
||||||
|
uint64_t qbit;
|
||||||
|
|
||||||
|
|
||||||
|
memset(mem, 0, sizeof(bwtintv_t));
|
||||||
|
if (q[x] > 3)
|
||||||
|
return x + 1;
|
||||||
|
|
||||||
|
qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||||
|
|
||||||
|
fmt_kmer_get(fmt, &ik, qbit, kmer_len-1); // 初始碱基位置
|
||||||
|
ik.info = x + kmer_len;
|
||||||
|
|
||||||
|
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
|
||||||
|
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
|
||||||
|
{ \
|
||||||
|
(ov) = (iv); \
|
||||||
|
(ov).info = (uint64_t)start_pos << 32 | (end_pos + 1); \
|
||||||
|
return end_pos + 1; \
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||||
|
{ // forward search
|
||||||
|
if (q[i] < 4 && q[i + 1] < 4)
|
||||||
|
{
|
||||||
|
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||||
|
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||||
|
COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
|
||||||
|
ik = ok2;
|
||||||
|
}
|
||||||
|
else if (q[i] < 4) // q[i+1] >= 4
|
||||||
|
{
|
||||||
|
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||||
|
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||||
|
return i + 2;
|
||||||
|
}
|
||||||
|
else // q[i] >= 4
|
||||||
|
{
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (i == len - 1) {
|
||||||
|
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||||
|
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||||
|
}
|
||||||
|
return len;
|
||||||
}
|
}
|
||||||
|
|
@ -131,4 +131,5 @@ void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos);
|
||||||
// 找smem(seed)
|
// 找smem(seed)
|
||||||
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||||
|
|
||||||
|
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
|
||||||
#endif
|
#endif
|
||||||
10
run.sh
10
run.sh
|
|
@ -4,16 +4,16 @@ thread=1
|
||||||
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
|
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
|
||||||
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
|
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
|
||||||
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||||
n_r1=~/fastq/sn_r1.fq
|
#n_r1=~/fastq/sn_r1.fq
|
||||||
n_r2=~/fastq/sn_r2.fq
|
#n_r2=~/fastq/sn_r2.fq
|
||||||
#n_r1=~/fastq/tiny_n_r1.fq
|
n_r1=~/fastq/tiny_n_r1.fq
|
||||||
#n_r2=~/fastq/tiny_n_r3.fq
|
n_r2=~/fastq/tiny_n_r2.fq
|
||||||
#n_r1=~/fastq/diff_r1.fq
|
#n_r1=~/fastq/diff_r1.fq
|
||||||
#n_r2=~/fastq/diff_r2.fq
|
#n_r2=~/fastq/diff_r2.fq
|
||||||
#n_r1=~/fastq/d_r1.fq
|
#n_r1=~/fastq/d_r1.fq
|
||||||
#n_r2=~/fastq/d_r2.fq
|
#n_r2=~/fastq/d_r2.fq
|
||||||
reference=~/reference/human_g1k_v37_decoy.fasta
|
reference=~/reference/human_g1k_v37_decoy.fasta
|
||||||
out=./fmt_all.sam
|
out=./out.sam
|
||||||
#out=/dev/null
|
#out=/dev/null
|
||||||
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||||
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue