添加了bit过滤,解决了一些bug,现在seed1和seed2都没问题了
This commit is contained in:
parent
5d8ace386f
commit
9d45fd02fb
|
|
@ -18,8 +18,8 @@
|
|||
"-R",
|
||||
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||
"~/reference/human_g1k_v37_decoy.fasta",
|
||||
"~/fastq/sn_r1.fq",
|
||||
"~/fastq/sn_r2.fq",
|
||||
"~/fastq/d_r1.fq",
|
||||
"~/fastq/d_r2.fq",
|
||||
"-o",
|
||||
"/dev/null"
|
||||
],
|
||||
|
|
|
|||
11
bwa.c
11
bwa.c
|
|
@ -293,12 +293,13 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
|
|||
|
||||
FMTIndex *bwa_idx_load_fmt(const char *hint)
|
||||
{
|
||||
char *fmt_idx_fn, *kmer_idx_fn, *sa_fn;
|
||||
char *fmt_idx_fn, *kmer_idx_fn, *kmer_bit_fn, *sa_fn;
|
||||
FMTIndex *fmt;
|
||||
char suffix[32];
|
||||
int l_hint = strlen(hint);
|
||||
fmt_idx_fn = malloc(l_hint + 32);
|
||||
kmer_idx_fn = malloc(l_hint + 32);
|
||||
kmer_bit_fn = malloc(l_hint + 32);
|
||||
sa_fn = malloc(l_hint + 32);
|
||||
sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL);
|
||||
strcpy(fmt_idx_fn, hint);
|
||||
|
|
@ -317,6 +318,14 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
|
|||
fprintf(stderr, "%s\n", kmer_idx_fn);
|
||||
fmt->kmer_hash = fmt_restore_kmer_idx(kmer_idx_fn);
|
||||
|
||||
// restore kmer bit
|
||||
fmt->bit_kmer_len = 18;
|
||||
sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len);
|
||||
strcpy(kmer_bit_fn, hint);
|
||||
strcpy(kmer_bit_fn + l_hint, suffix);
|
||||
fprintf(stderr, "%s\n", kmer_bit_fn);
|
||||
fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len);
|
||||
|
||||
strcpy(sa_fn, hint);
|
||||
sprintf(suffix, ".33.%d.sa", SA_INTV);
|
||||
strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA)
|
||||
|
|
|
|||
16
bwamem.c
16
bwamem.c
|
|
@ -173,11 +173,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||
// if (slen < 16) ++n16;
|
||||
// if (slen < 17) ++n17;
|
||||
// if (slen < 18) ++n18;
|
||||
// if (slen < 19) ++n19;
|
||||
// ++nall;
|
||||
if (slen < 16) ++n16;
|
||||
if (slen < 17) ++n17;
|
||||
if (slen < 18) ++n18;
|
||||
if (slen < 19) ++n19;
|
||||
++nall;
|
||||
max_seed_len = fmax(max_seed_len, slen);
|
||||
if (slen >= opt->min_seed_len)
|
||||
kv_push(bwtintv_t, a->mem, *p);
|
||||
|
|
@ -186,7 +186,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
}
|
||||
// second pass: find MEMs inside a long SMEM
|
||||
//if (max_seed_len == len)
|
||||
// goto collect_intv_end;
|
||||
//goto collect_intv_end;
|
||||
|
||||
old_n = a->mem.n;
|
||||
for (k = 0; k < old_n; ++k) {
|
||||
|
|
@ -196,7 +196,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
#ifdef SHOW_PERF
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
// bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
|
||||
//bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
|
||||
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
|
||||
#ifdef SHOW_PERF
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
|
|
@ -217,7 +217,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
}
|
||||
}
|
||||
|
||||
// goto collect_intv_end;
|
||||
goto collect_intv_end;
|
||||
|
||||
// third pass: LAST-like
|
||||
if (opt->max_mem_intv > 0) {
|
||||
|
|
|
|||
62
fmt_idx.c
62
fmt_idx.c
|
|
@ -155,6 +155,16 @@ KmerHash fmt_restore_kmer_idx(const char *fn)
|
|||
return khash;
|
||||
}
|
||||
|
||||
uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len)
|
||||
{
|
||||
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "rb");
|
||||
fread_fix(fp, (1L << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit);
|
||||
err_fclose(fp);
|
||||
return kbit;
|
||||
}
|
||||
|
||||
// 读取sa数据
|
||||
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
|
||||
{
|
||||
|
|
@ -587,36 +597,36 @@ static void fmt_reverse_intvs(bwtintv_v *p)
|
|||
}
|
||||
|
||||
// 创建正向的kmer
|
||||
inline static uint32_t build_forward_kmer(const uint8_t *q, int qlen, int *base_consumed)
|
||||
inline static uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
|
||||
{
|
||||
uint32_t qbit = 0;
|
||||
uint64_t qbit = 0;
|
||||
int i;
|
||||
qlen = qlen < HASH_KMER_LEN ? qlen : HASH_KMER_LEN;
|
||||
qlen = qlen < kmer_len ? qlen : kmer_len;
|
||||
for (i = 0; i < qlen; ++i)
|
||||
{
|
||||
if (q[i] > 3) // 要考虑碱基是N
|
||||
break;
|
||||
qbit |= q[i] << ((HASH_KMER_LEN - 1 - i) << 1);
|
||||
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
|
||||
}
|
||||
*base_consumed = i;
|
||||
return qbit;
|
||||
}
|
||||
|
||||
// 创建f反向的kmer
|
||||
inline static uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int *base_consumed)
|
||||
inline static uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
|
||||
{
|
||||
uint32_t qbit = 0;
|
||||
uint64_t qbit = 0;
|
||||
int i, j, end_pos;
|
||||
end_pos = start_pos - HASH_KMER_LEN;
|
||||
end_pos = start_pos - kmer_len;
|
||||
end_pos = end_pos < 0 ? -1 : end_pos;
|
||||
for (i = start_pos, j = 0; i > end_pos; --i, ++j)
|
||||
{
|
||||
if (q[i] > 3) // 要考虑碱基是N
|
||||
break;
|
||||
qbit |= q[i] << ((HASH_KMER_LEN - 1 - j) << 1);
|
||||
qbit |= ((uint64_t)q[i]) << ((kmer_len - 1 - j) << 1);
|
||||
}
|
||||
*base_consumed = start_pos - i;
|
||||
return (~qbit) & ((1 << (HASH_KMER_LEN << 1)) - 1);
|
||||
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
|
||||
}
|
||||
|
||||
// 当x为0,或者q[x-1]为N时,只需要前向搜索即可
|
||||
|
|
@ -630,7 +640,7 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti
|
|||
int i, j = 1, ret, kmer_len;
|
||||
const int min_intv = 1;
|
||||
bwtintv_t ik, ok1, ok2;
|
||||
uint32_t qbit = build_forward_kmer(&q[x], len - x, &kmer_len);
|
||||
uint32_t qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
mem->n = 0;
|
||||
|
||||
fmt_kmer_get(fmt, &ik, qbit, kmer_len - j++);
|
||||
|
|
@ -645,8 +655,8 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti
|
|||
|
||||
// 继续向前扩展
|
||||
for (i = x + kmer_len; i + 1 < len; i += 2) {
|
||||
//if (ik.x[2] < 2)
|
||||
// goto fmt_smem_forward_end;
|
||||
if (ik.x[2] < 2)
|
||||
goto fmt_smem_forward_end;
|
||||
if (q[i] < 4 && q[i + 1] < 4) // 两个都可以扩展
|
||||
{
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
|
|
@ -684,7 +694,8 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti
|
|||
fmt_smem_forward_end:
|
||||
ret = ik.info;
|
||||
ik.info |= (uint64_t)x << 32;
|
||||
kv_push(bwtintv_t, *mem, ik);
|
||||
if (fmt->bit_kmer_len <= (uint32_t)ik.info - (ik.info >> 32))
|
||||
kv_push(bwtintv_t, *mem, ik);
|
||||
#ifdef SHOW_PERF
|
||||
#if 1
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
|
|
@ -703,15 +714,16 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
bwtintv_v a[1], *curr;
|
||||
uint32_t qbit = 0;
|
||||
mem->n = 0;
|
||||
int only_forward = x == 0 || q[x - 1] > 3;
|
||||
|
||||
if (q[x] > 3) return x + 1;
|
||||
// if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展
|
||||
//if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展
|
||||
|
||||
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
|
||||
kv_init(a[0]);
|
||||
curr = tmpvec && tmpvec[0] ? tmpvec[0] : &a[0]; // use the temporary vector if provided
|
||||
|
||||
qbit = build_forward_kmer(&q[x], len - x, &kmer_len);
|
||||
qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
fmt_kmer_get(fmt, &ik, qbit, 0); // 初始碱基位置
|
||||
ik.info = x + 1;
|
||||
|
||||
|
|
@ -775,10 +787,22 @@ backward_search:
|
|||
for (j = 0; j < curr->n; ++j)
|
||||
{
|
||||
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
||||
if (p->info - x < HASH_KMER_LEN) {
|
||||
uint32_t qbit = build_backward_kmer(q, p->info - 1, &kmer_len); // 创建反向kmer
|
||||
uint64_t qbit = 0;
|
||||
// if (p->info - x < fmt->bit_kmer_len) {
|
||||
// qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer
|
||||
// if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf))))
|
||||
// continue;
|
||||
// }
|
||||
if (!only_forward && p->info - x < HASH_KMER_LEN) {
|
||||
//qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer
|
||||
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
|
||||
//if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf))))
|
||||
// continue;
|
||||
//qbit >>= (fmt->bit_kmer_len - HASH_KMER_LEN) << 1;
|
||||
//kmer_len = kmer_len > HASH_KMER_LEN ? HASH_KMER_LEN : kmer_len;
|
||||
i = 1;
|
||||
do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv);
|
||||
if (i > 2) continue;
|
||||
p->x[0] = ik.x[1]; p->x[1] = ik.x[0]; p->x[2] = ik.x[2];
|
||||
i = p->info - (kmer_len - i + 3);
|
||||
} else {
|
||||
|
|
@ -798,7 +822,9 @@ backward_search:
|
|||
{
|
||||
fmt_extend1(fmt, p, &ok1, 1, q[i]);
|
||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||
ok1.info = p->info; *p = ok1;
|
||||
ok1.info = p->info;
|
||||
CHECK_ADD_MEM(i, ok1, mem);
|
||||
goto fmt_smem_end;
|
||||
}
|
||||
else
|
||||
{ // 不能扩展
|
||||
|
|
|
|||
13
fmt_idx.h
13
fmt_idx.h
|
|
@ -77,14 +77,6 @@ typedef struct
|
|||
KmerEntry *ke14;
|
||||
} KmerHash;
|
||||
|
||||
// 用来检测15,16,17这些长度的序列是否在bwt索引里有匹配
|
||||
typedef struct
|
||||
{
|
||||
uint8_t *kb15;
|
||||
uint8_t *kb16;
|
||||
uint8_t *kb17;
|
||||
} KmerBit;
|
||||
|
||||
// fm-index, extend twice in one search step (one memory access)
|
||||
typedef struct
|
||||
{
|
||||
|
|
@ -101,7 +93,8 @@ typedef struct
|
|||
uint8_t last_base; // dollar转换成的base
|
||||
// 保存kmer对应的fmt位置信息
|
||||
KmerHash kmer_hash;
|
||||
KmerBit kmer_bit; // 用来
|
||||
uint16_t *kmer_bit; // 用来检测特定长度序列有没有fm-index匹配
|
||||
int bit_kmer_len;
|
||||
// suffix array
|
||||
int sa_intv;
|
||||
bwtint_t n_sa;
|
||||
|
|
@ -116,6 +109,8 @@ FMTIndex *fmt_restore_fmt(const char *fn);
|
|||
void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh);
|
||||
// 从文件中读取kmer hash信息
|
||||
KmerHash fmt_restore_kmer_idx(const char *fn);
|
||||
// 读取kmer bit数据
|
||||
uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len);
|
||||
// 读取sa数据
|
||||
void fmt_restore_sa(const char *fn, FMTIndex *fmt);
|
||||
// 根据interval-bwt创建fmt-index
|
||||
|
|
|
|||
10
run.sh
10
run.sh
|
|
@ -7,8 +7,14 @@ thread=1
|
|||
n_r1=~/fastq/sn_r1.fq
|
||||
n_r2=~/fastq/sn_r2.fq
|
||||
#n_r1=~/fastq/tiny_n_r1.fq
|
||||
#n_r2=~/fastq/tiny_n_r2.fq
|
||||
#n_r2=~/fastq/tiny_n_r3.fq
|
||||
#n_r1=~/fastq/diff_r1.fq
|
||||
#n_r2=~/fastq/diff_r2.fq
|
||||
#n_r1=~/fastq/d_r1.fq
|
||||
#n_r2=~/fastq/d_r2.fq
|
||||
reference=~/reference/human_g1k_v37_decoy.fasta
|
||||
out=./fmt_all.sam
|
||||
#out=/dev/null
|
||||
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||
# /home/zzh/data/fastq/nm1.fq \
|
||||
|
|
@ -24,6 +30,6 @@ time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:n
|
|||
$reference \
|
||||
$n_r1 \
|
||||
$n_r2 \
|
||||
-o /dev/null
|
||||
-o $out
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue