解决了sa的bug,现在结果和原版一模一样

This commit is contained in:
zzh 2024-02-21 15:21:56 +08:00
parent fc2e0d9b0b
commit 17618ee5f2
7 changed files with 69 additions and 32 deletions

View File

@ -15,7 +15,7 @@ AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
fmt_idx.o
PROG= bwa
INCLUDES=
LIBS= -lm -lz -lpthread
LIBS= -lm -lz -lpthread -ldl
SUBDIRS= .
ifeq ($(shell uname -s),Linux)
@ -32,6 +32,9 @@ all:$(PROG)
bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
#bwa:libbwa.a $(AOBJS) main.o
# $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) ../sbwa/jemalloc/lib/libjemalloc.a main.o -o $@ -L. -lbwa $(LIBS)
bwamem-lite:libbwa.a example.o
$(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS)

14
bwa.c
View File

@ -279,14 +279,12 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
return 0;
}
fprintf(stderr, "zzh-1\n");
tmp = calloc(strlen(prefix) + 5, 1);
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
bwt = bwt_restore_bwt(tmp);
fprintf(stderr, "zzh-1\n");
strcat(strcpy(tmp, prefix), ".33.2.sa"); // partial suffix array (SA)
//strcat(strcpy(tmp, prefix), ".33.4.sa"); // partial suffix array (SA)
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
bwt_restore_sa(tmp, bwt);
fprintf(stderr, "zzh-after-sa\n");
free(tmp); free(prefix);
return bwt;
}
@ -330,7 +328,7 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
strcpy(sa_fn, hint);
sprintf(suffix, ".33.%d.sa", SA_INTV);
strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA)
// fmt_restore_sa(sa_fn, fmt);
fmt_restore_sa(sa_fn, fmt);
free(fmt_idx_fn);
free(kmer_idx_fn);
@ -353,9 +351,9 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
if (which & BWA_IDX_BWT) {
idx->fmt = bwa_idx_load_fmt(hint);
// 先和bwt共用sa
idx->fmt->sa = idx->bwt->sa;
idx->fmt->n_sa = idx->bwt->n_sa;
idx->fmt->sa_intv = idx->bwt->sa_intv;
//idx->fmt->sa = idx->bwt->sa;
//idx->fmt->n_sa = idx->bwt->n_sa;
//idx->fmt->sa_intv = idx->bwt->sa_intv;
}
if (which & BWA_IDX_BNS)
{

View File

@ -138,6 +138,8 @@ static void smem_aux_destroy(smem_aux_t *a)
free(a);
}
#define USE_FMT 1
static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *a)
{
int i, k, x = 0, old_n;
@ -155,11 +157,9 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
// fprintf(stderr, "\n");
// first pass: find all SMEMs
//fprintf(fp1, "seq: %ld\n", dn++);
fprintf(fp1, "seq: %ld\n", dn++);
//fprintf(stderr, "seq: %ld\n", dn++);
//dn ++;
#define USE_FMT 1
// goto third_seed;
while (x < len) {
@ -186,7 +186,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
max_seed_len = fmax(max_seed_len, slen);
if (slen >= opt->min_seed_len) {
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
kv_push(bwtintv_t, a->mem, *p);
}
}
@ -220,11 +220,12 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
#endif
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
// fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
// fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
int slen = (uint32_t)p->info - (p->info >> 32);
if (slen >= opt->min_seed_len) {
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
}
@ -258,7 +259,8 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
if (m.x[2] > 0) {
kv_push(bwtintv_t, a->mem, m);
//bwtintv_t *p = &m;
bwtintv_t *p = &m;
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
}
} else { // for now, we never come to this block which is slower
@ -434,8 +436,15 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm
#ifdef SHOW_PERF
int64_t tmp_time = realtime_msec();
#endif
// s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
#if USE_FMT
s.rbeg = tmp.pos = fmt_sa(fmt, p->x[0] + k);
uint64_t tpos = bwt_sa(bwt, p->x[0] + k);
if (s.rbeg != tpos) {
fprintf(stderr, "diff: %ld, %ld %ld\n", p->x[0] + k, tmp.pos, tpos);
}
#else
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
#endif
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bwt_sa, tmp_time);
@ -1087,7 +1096,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str);
}
if (m && m->n_cigar) { kputsn("\tMC:Z:", 6, str); add_cigar(opt, m, str, which); }
if (m) { kputsn("\tMQ:i:", 6, str); kputw(m->mapq, str);}
// if (m) { kputsn("\tMQ:i:", 6, str); kputw(m->mapq, str);}
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }

18
bwt.c
View File

@ -77,7 +77,7 @@ void inline bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val)
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
bwtint_t *sa_addr = (bwtint_t *)(sa_arr + start_byte_idx);
// *sa_addr &= (1 << val_idx_in_block) - 1; // 如果开辟内存的时候清零了,这一步可以省略,会清除后面的数据,只适合按递增顺序赋值
*sa_addr |= val << val_idx_in_block;
*sa_addr |= (val & ((1L << 33) - 1)) << val_idx_in_block;
}
// 获取某一行的排序值(小端模式)
@ -129,7 +129,8 @@ void bwt_cal_sa(bwt_t *bwt, int intv)
}
if (isa % intv == 0)
bwt_set_sa(bwt->sa, isa / intv, sa);
bwt_set_sa(bwt->sa, 0, (bwtint_t)-1); // before this line, bwt->sa[0] = bwt->seq_len
// bwt_set_sa(bwt->sa, 0, (bwtint_t)-1); // 赋值成-1也没问题set_sa那里已经修正了
bwt_set_sa(bwt->sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
}
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
@ -139,6 +140,7 @@ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
++sa;
k = bwt_invPsi(bwt, k);
}
return sa + bwt->sa[k / bwt->sa_intv];
/* without setting bwt->sa[0] = -1, the following line should be
changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
//#ifdef SHOW_PERF
@ -527,9 +529,15 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt)
xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
bwt->sa = (uint8_t*)malloc(SA_BYTES(bwt->n_sa));
fprintf(stderr, "zzh-read-sa %ld\n", SA_BYTES(bwt->n_sa));
fread_fix(fp, SA_BYTES(bwt->n_sa), bwt->sa);
//bwt->sa = (uint8_t*)malloc(SA_BYTES(bwt->n_sa));
//fprintf(stderr, "zzh-read-sa %ld\n", SA_BYTES(bwt->n_sa));
//fread_fix(fp, SA_BYTES(bwt->n_sa), bwt->sa);
bwt->sa = (bwtint_t *)calloc(bwt->n_sa, sizeof(bwtint_t));
bwt->sa[0] = -1;
fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
err_fclose(fp);
}

3
bwt.h
View File

@ -56,7 +56,8 @@ typedef struct {
// suffix array
int sa_intv;
bwtint_t n_sa;
uint8_t *sa;
//uint8_t *sa;
bwtint_t *sa;
} bwt_t;
typedef struct {

View File

@ -657,7 +657,12 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
do { kv_push(bwtintv_t, *curr, iv); goto backward_search; } while(0)
// 处理kmer对应的匹配信息
for (j = 1, curr->n = 0; j < kmer_len; ++j) {
if (only_forward)
j = kmer_len - 1;
else
j = 1;
for (curr->n = 0; j < kmer_len; ++j)
{
fmt_kmer_get(fmt, &ok1, qbit, j);
CHECK_INTV_CHANGE(ik, ok1, x + j + 1);
}
@ -667,11 +672,15 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
// 扩展kmer之后的碱基
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
for (i = (int)ik.info; i + 1 < len; i += 2)
{ // forward search
if (q[i] < 4 && q[i + 1] < 4)
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_CHANGE(ik, ok1, i + 1);
CHECK_INTV_CHANGE(ik, ok2, i + 2);
// 在这里进行判断是否只有一个候选了
@ -806,6 +815,8 @@ backward_search:
for (j = 0; j < curr->n; ++j)
{
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
__builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
uint64_t qbit = 0;
if (!only_forward && p->info - x < HASH_KMER_LEN) {
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
@ -822,6 +833,8 @@ backward_search:
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
{
fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info;
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
@ -880,6 +893,8 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
fmt_kmer_get(fmt, &ik, qbit, kmer_len-1); // 初始碱基位置
ik.info = x + kmer_len;
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
@ -894,6 +909,8 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
if (q[i] < 4 && q[i + 1] < 4)
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
ik = ok2;

13
run.sh
View File

@ -4,19 +4,20 @@ thread=1
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
#reference=~/data/reference/human_g1k_v37_decoy.fasta
n_r1=~/fastq/sn_r1.fq
n_r2=~/fastq/sn_r2.fq
#n_r1=~/fastq/sn_r1.fq
#n_r2=~/fastq/sn_r2.fq
#n_r1=~/fastq/ssn_r1.fq
#n_r2=~/fastq/ssn_r2.fq
#n_r1=~/fastq/tiny_n_r1.fq
#n_r2=~/fastq/tiny_n_r2.fq
#n_r1=~/fastq/diff_r1.fq
#n_r2=~/fastq/diff_r2.fq
n_r1=~/fastq/diff_r1.fq
n_r2=~/fastq/diff_r2.fq
#n_r1=~/fastq/d_r1.fq
#n_r2=~/fastq/d_r2.fq
reference=~/reference/human_g1k_v37_decoy.fasta
#out=./out.sam
out=/dev/null
#out=./ssn.sam
out=./out.sam
#out=/dev/null
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
# /home/zzh/data/fastq/nm1.fq \