解决了sa的bug,现在结果和原版一模一样
This commit is contained in:
parent
fc2e0d9b0b
commit
17618ee5f2
5
Makefile
5
Makefile
|
|
@ -15,7 +15,7 @@ AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
|||
fmt_idx.o
|
||||
PROG= bwa
|
||||
INCLUDES=
|
||||
LIBS= -lm -lz -lpthread
|
||||
LIBS= -lm -lz -lpthread -ldl
|
||||
SUBDIRS= .
|
||||
|
||||
ifeq ($(shell uname -s),Linux)
|
||||
|
|
@ -32,6 +32,9 @@ all:$(PROG)
|
|||
bwa:libbwa.a $(AOBJS) main.o
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
#bwa:libbwa.a $(AOBJS) main.o
|
||||
# $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) ../sbwa/jemalloc/lib/libjemalloc.a main.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
bwamem-lite:libbwa.a example.o
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
|
|
|
|||
14
bwa.c
14
bwa.c
|
|
@ -279,14 +279,12 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
|
|||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
fprintf(stderr, "zzh-1\n");
|
||||
tmp = calloc(strlen(prefix) + 5, 1);
|
||||
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
|
||||
bwt = bwt_restore_bwt(tmp);
|
||||
fprintf(stderr, "zzh-1\n");
|
||||
strcat(strcpy(tmp, prefix), ".33.2.sa"); // partial suffix array (SA)
|
||||
//strcat(strcpy(tmp, prefix), ".33.4.sa"); // partial suffix array (SA)
|
||||
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
|
||||
bwt_restore_sa(tmp, bwt);
|
||||
fprintf(stderr, "zzh-after-sa\n");
|
||||
free(tmp); free(prefix);
|
||||
return bwt;
|
||||
}
|
||||
|
|
@ -330,7 +328,7 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
|
|||
strcpy(sa_fn, hint);
|
||||
sprintf(suffix, ".33.%d.sa", SA_INTV);
|
||||
strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA)
|
||||
// fmt_restore_sa(sa_fn, fmt);
|
||||
fmt_restore_sa(sa_fn, fmt);
|
||||
|
||||
free(fmt_idx_fn);
|
||||
free(kmer_idx_fn);
|
||||
|
|
@ -353,9 +351,9 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
|||
if (which & BWA_IDX_BWT) {
|
||||
idx->fmt = bwa_idx_load_fmt(hint);
|
||||
// 先和bwt共用sa
|
||||
idx->fmt->sa = idx->bwt->sa;
|
||||
idx->fmt->n_sa = idx->bwt->n_sa;
|
||||
idx->fmt->sa_intv = idx->bwt->sa_intv;
|
||||
//idx->fmt->sa = idx->bwt->sa;
|
||||
//idx->fmt->n_sa = idx->bwt->n_sa;
|
||||
//idx->fmt->sa_intv = idx->bwt->sa_intv;
|
||||
}
|
||||
if (which & BWA_IDX_BNS)
|
||||
{
|
||||
|
|
|
|||
29
bwamem.c
29
bwamem.c
|
|
@ -138,6 +138,8 @@ static void smem_aux_destroy(smem_aux_t *a)
|
|||
free(a);
|
||||
}
|
||||
|
||||
#define USE_FMT 1
|
||||
|
||||
static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *a)
|
||||
{
|
||||
int i, k, x = 0, old_n;
|
||||
|
|
@ -155,11 +157,9 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
// fprintf(stderr, "\n");
|
||||
|
||||
// first pass: find all SMEMs
|
||||
//fprintf(fp1, "seq: %ld\n", dn++);
|
||||
fprintf(fp1, "seq: %ld\n", dn++);
|
||||
//fprintf(stderr, "seq: %ld\n", dn++);
|
||||
//dn ++;
|
||||
|
||||
#define USE_FMT 1
|
||||
// goto third_seed;
|
||||
|
||||
while (x < len) {
|
||||
|
|
@ -186,7 +186,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||
max_seed_len = fmax(max_seed_len, slen);
|
||||
if (slen >= opt->min_seed_len) {
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
kv_push(bwtintv_t, a->mem, *p);
|
||||
}
|
||||
}
|
||||
|
|
@ -220,11 +220,12 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
#endif
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
// fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
// fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
int slen = (uint32_t)p->info - (p->info >> 32);
|
||||
if (slen >= opt->min_seed_len) {
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||||
}
|
||||
}
|
||||
|
|
@ -258,7 +259,8 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
|
||||
if (m.x[2] > 0) {
|
||||
kv_push(bwtintv_t, a->mem, m);
|
||||
//bwtintv_t *p = &m;
|
||||
bwtintv_t *p = &m;
|
||||
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
}
|
||||
} else { // for now, we never come to this block which is slower
|
||||
|
|
@ -434,8 +436,15 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm
|
|||
#ifdef SHOW_PERF
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
// s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||
#if USE_FMT
|
||||
s.rbeg = tmp.pos = fmt_sa(fmt, p->x[0] + k);
|
||||
uint64_t tpos = bwt_sa(bwt, p->x[0] + k);
|
||||
if (s.rbeg != tpos) {
|
||||
fprintf(stderr, "diff: %ld, %ld %ld\n", p->x[0] + k, tmp.pos, tpos);
|
||||
}
|
||||
#else
|
||||
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||
#endif
|
||||
#ifdef SHOW_PERF
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bwt_sa, tmp_time);
|
||||
|
|
@ -1087,7 +1096,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
|
|||
kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str);
|
||||
}
|
||||
if (m && m->n_cigar) { kputsn("\tMC:Z:", 6, str); add_cigar(opt, m, str, which); }
|
||||
if (m) { kputsn("\tMQ:i:", 6, str); kputw(m->mapq, str);}
|
||||
// if (m) { kputsn("\tMQ:i:", 6, str); kputw(m->mapq, str);}
|
||||
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
|
||||
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
|
||||
if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
|
||||
|
|
|
|||
18
bwt.c
18
bwt.c
|
|
@ -77,7 +77,7 @@ void inline bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val)
|
|||
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
|
||||
bwtint_t *sa_addr = (bwtint_t *)(sa_arr + start_byte_idx);
|
||||
// *sa_addr &= (1 << val_idx_in_block) - 1; // 如果开辟内存的时候清零了,这一步可以省略,会清除后面的数据,只适合按递增顺序赋值
|
||||
*sa_addr |= val << val_idx_in_block;
|
||||
*sa_addr |= (val & ((1L << 33) - 1)) << val_idx_in_block;
|
||||
}
|
||||
|
||||
// 获取某一行的排序值(小端模式)
|
||||
|
|
@ -129,7 +129,8 @@ void bwt_cal_sa(bwt_t *bwt, int intv)
|
|||
}
|
||||
if (isa % intv == 0)
|
||||
bwt_set_sa(bwt->sa, isa / intv, sa);
|
||||
bwt_set_sa(bwt->sa, 0, (bwtint_t)-1); // before this line, bwt->sa[0] = bwt->seq_len
|
||||
// bwt_set_sa(bwt->sa, 0, (bwtint_t)-1); // 赋值成-1也没问题,set_sa那里已经修正了
|
||||
bwt_set_sa(bwt->sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
|
||||
}
|
||||
|
||||
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
|
||||
|
|
@ -139,6 +140,7 @@ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
|
|||
++sa;
|
||||
k = bwt_invPsi(bwt, k);
|
||||
}
|
||||
return sa + bwt->sa[k / bwt->sa_intv];
|
||||
/* without setting bwt->sa[0] = -1, the following line should be
|
||||
changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
|
||||
//#ifdef SHOW_PERF
|
||||
|
|
@ -527,9 +529,15 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt)
|
|||
xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
|
||||
|
||||
bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
|
||||
bwt->sa = (uint8_t*)malloc(SA_BYTES(bwt->n_sa));
|
||||
fprintf(stderr, "zzh-read-sa %ld\n", SA_BYTES(bwt->n_sa));
|
||||
fread_fix(fp, SA_BYTES(bwt->n_sa), bwt->sa);
|
||||
|
||||
//bwt->sa = (uint8_t*)malloc(SA_BYTES(bwt->n_sa));
|
||||
//fprintf(stderr, "zzh-read-sa %ld\n", SA_BYTES(bwt->n_sa));
|
||||
//fread_fix(fp, SA_BYTES(bwt->n_sa), bwt->sa);
|
||||
|
||||
bwt->sa = (bwtint_t *)calloc(bwt->n_sa, sizeof(bwtint_t));
|
||||
bwt->sa[0] = -1;
|
||||
fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
|
||||
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
|
|
|
|||
3
bwt.h
3
bwt.h
|
|
@ -56,7 +56,8 @@ typedef struct {
|
|||
// suffix array
|
||||
int sa_intv;
|
||||
bwtint_t n_sa;
|
||||
uint8_t *sa;
|
||||
//uint8_t *sa;
|
||||
bwtint_t *sa;
|
||||
} bwt_t;
|
||||
|
||||
typedef struct {
|
||||
|
|
|
|||
19
fmt_idx.c
19
fmt_idx.c
|
|
@ -657,7 +657,12 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
do { kv_push(bwtintv_t, *curr, iv); goto backward_search; } while(0)
|
||||
|
||||
// 处理kmer对应的匹配信息
|
||||
for (j = 1, curr->n = 0; j < kmer_len; ++j) {
|
||||
if (only_forward)
|
||||
j = kmer_len - 1;
|
||||
else
|
||||
j = 1;
|
||||
for (curr->n = 0; j < kmer_len; ++j)
|
||||
{
|
||||
fmt_kmer_get(fmt, &ok1, qbit, j);
|
||||
CHECK_INTV_CHANGE(ik, ok1, x + j + 1);
|
||||
}
|
||||
|
|
@ -667,11 +672,15 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
|
||||
|
||||
// 扩展kmer之后的碱基
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||
{ // forward search
|
||||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
{
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
CHECK_INTV_CHANGE(ik, ok1, i + 1);
|
||||
CHECK_INTV_CHANGE(ik, ok2, i + 2);
|
||||
// 在这里进行判断是否只有一个候选了
|
||||
|
|
@ -806,6 +815,8 @@ backward_search:
|
|||
for (j = 0; j < curr->n; ++j)
|
||||
{
|
||||
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
|
||||
uint64_t qbit = 0;
|
||||
if (!only_forward && p->info - x < HASH_KMER_LEN) {
|
||||
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
|
||||
|
|
@ -822,6 +833,8 @@ backward_search:
|
|||
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
|
||||
{
|
||||
fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
|
||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||
ok1.info = p->info;
|
||||
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
|
||||
|
|
@ -880,6 +893,8 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
|
||||
fmt_kmer_get(fmt, &ik, qbit, kmer_len-1); // 初始碱基位置
|
||||
ik.info = x + kmer_len;
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
|
||||
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
|
||||
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
|
||||
|
|
@ -894,6 +909,8 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
{
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||
COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
|
||||
ik = ok2;
|
||||
|
|
|
|||
13
run.sh
13
run.sh
|
|
@ -4,19 +4,20 @@ thread=1
|
|||
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
|
||||
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
|
||||
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||
n_r1=~/fastq/sn_r1.fq
|
||||
n_r2=~/fastq/sn_r2.fq
|
||||
#n_r1=~/fastq/sn_r1.fq
|
||||
#n_r2=~/fastq/sn_r2.fq
|
||||
#n_r1=~/fastq/ssn_r1.fq
|
||||
#n_r2=~/fastq/ssn_r2.fq
|
||||
#n_r1=~/fastq/tiny_n_r1.fq
|
||||
#n_r2=~/fastq/tiny_n_r2.fq
|
||||
#n_r1=~/fastq/diff_r1.fq
|
||||
#n_r2=~/fastq/diff_r2.fq
|
||||
n_r1=~/fastq/diff_r1.fq
|
||||
n_r2=~/fastq/diff_r2.fq
|
||||
#n_r1=~/fastq/d_r1.fq
|
||||
#n_r2=~/fastq/d_r2.fq
|
||||
reference=~/reference/human_g1k_v37_decoy.fasta
|
||||
#out=./out.sam
|
||||
out=/dev/null
|
||||
#out=./ssn.sam
|
||||
out=./out.sam
|
||||
#out=/dev/null
|
||||
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||
# /home/zzh/data/fastq/nm1.fq \
|
||||
|
|
|
|||
Loading…
Reference in New Issue