diff --git a/.vscode/launch.json b/.vscode/launch.json index 54910f9..170c3c5 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -18,8 +18,8 @@ "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", "~/reference/human_g1k_v37_decoy.fasta", - "~/fastq/d_r1.fq", - "~/fastq/d_r2.fq", + "~/fastq/tiny_n_r1.fq", + "~/fastq/tiny_n_r2.fq", "-o", "/dev/null" ], diff --git a/bwa.c b/bwa.c index cc16c59..0b28afb 100644 --- a/bwa.c +++ b/bwa.c @@ -284,7 +284,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint) strcat(strcpy(tmp, prefix), ".bwt"); // FM-index bwt = bwt_restore_bwt(tmp); fprintf(stderr, "zzh-1\n"); - strcat(strcpy(tmp, prefix), ".33.4.sa"); // partial suffix array (SA) + strcat(strcpy(tmp, prefix), ".33.2.sa"); // partial suffix array (SA) bwt_restore_sa(tmp, bwt); fprintf(stderr, "zzh-after-sa\n"); free(tmp); free(prefix); @@ -293,13 +293,14 @@ bwt_t *bwa_idx_load_bwt(const char *hint) FMTIndex *bwa_idx_load_fmt(const char *hint) { - char *fmt_idx_fn, *kmer_idx_fn, *kmer_bit_fn, *sa_fn; + char *fmt_idx_fn, *kmer_idx_fn, *sa_fn; + // char *kmer_bit_fn; FMTIndex *fmt; char suffix[32]; int l_hint = strlen(hint); fmt_idx_fn = malloc(l_hint + 32); kmer_idx_fn = malloc(l_hint + 32); - kmer_bit_fn = malloc(l_hint + 32); + //kmer_bit_fn = malloc(l_hint + 32); sa_fn = malloc(l_hint + 32); sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL); strcpy(fmt_idx_fn, hint); @@ -319,12 +320,12 @@ FMTIndex *bwa_idx_load_fmt(const char *hint) fmt->kmer_hash = fmt_restore_kmer_idx(kmer_idx_fn); // restore kmer bit - fmt->bit_kmer_len = 18; - sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len); - strcpy(kmer_bit_fn, hint); - strcpy(kmer_bit_fn + l_hint, suffix); - fprintf(stderr, "%s\n", kmer_bit_fn); - fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len); +// fmt->bit_kmer_len = 18; +// sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len); +// strcpy(kmer_bit_fn, hint); +// strcpy(kmer_bit_fn + l_hint, suffix); +// fprintf(stderr, "%s\n", kmer_bit_fn); +// fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len); strcpy(sa_fn, hint); sprintf(suffix, ".33.%d.sa", SA_INTV); @@ -369,6 +370,9 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which) err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence err_fclose(idx->bns->fp_pac); idx->bns->fp_pac = 0; + // 赋值到fmt中对应的pac + idx->fmt->l_pac = idx->bns->l_pac; + idx->fmt->pac = idx->pac; } } free(prefix); diff --git a/bwamem.c b/bwamem.c index db8ac2d..597651a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -158,7 +158,9 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn //fprintf(fp1, "seq: %ld\n", dn++); //fprintf(stderr, "seq: %ld\n", dn++); //dn ++; - //goto third_seed; + +#define USE_FMT 1 + // goto third_seed; while (x < len) { if (seq[x] < 4) { @@ -166,25 +168,27 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - //x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); +#if USE_FMT x = fmt_smem(fmt, len, seq, x, start_width, &a->mem1, a->tmpv); + +#else + x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); +#endif #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bwt_smem1a, tmp_time); + __sync_fetch_and_add(&time_seed_1, tmp_time); #endif for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; + //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); int slen = (uint32_t)p->info - (p->info >> 32); // seed length - //if (slen < 16) ++n16; - //if (slen < 17) ++n17; - //if (slen < 18) ++n18; - //if (slen < 19) ++n19; - //++nall; max_seed_len = fmax(max_seed_len, slen); - if (slen >= opt->min_seed_len) + if (slen >= opt->min_seed_len) { + //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); kv_push(bwtintv_t, a->mem, *p); + } } } else { ++x; @@ -195,7 +199,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn // second pass: find MEMs inside a long SMEM //if (max_seed_len == len - start_N_num) // goto collect_intv_end; - // goto third_seed; + //goto third_seed; old_n = a->mem.n; for (k = 0; k < old_n; ++k) { @@ -205,24 +209,24 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - //bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); +#if USE_FMT fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv); +#else + bwt_smem1(bwt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv); +#endif #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bwt_smem1a, tmp_time); + __sync_fetch_and_add(&time_seed_2, tmp_time); #endif for (i = 0; i < a->mem1.n; ++i) { - //bwtintv_t *p = &a->mem1.a[i]; + bwtintv_t *p = &a->mem1.a[i]; //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); - //int slen = (uint32_t)p->info - (p->info >> 32); - //if (slen < 16) ++n16; - //if (slen < 17) ++n17; - //if (slen < 18) ++n18; - //if (slen < 19) ++n19; - //++nall; - if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info >> 32) >= opt->min_seed_len) + int slen = (uint32_t)p->info - (p->info >> 32); + if (slen >= opt->min_seed_len) { + //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); kv_push(bwtintv_t, a->mem, a->mem1.a[i]); + } } } //if (max_seed_len == len - start_N_num) @@ -239,11 +243,15 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - //x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); + +#if USE_FMT x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); +#else + x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); +#endif #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bwt_smem1a, tmp_time); + __sync_fetch_and_add(&time_seed_3, tmp_time); #endif //bwtintv_t *p = &m; //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); @@ -262,7 +270,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn } } -//collect_intv_end: +collect_intv_end: // sort ks_introsort(mem_intv, a->mem.n, a->mem.a); } @@ -381,6 +389,43 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive + //if (0) { + if (p->num_match > 0) { + //continue; + for (k = 0; k < p->num_match; ++k) { + mem_chain_t tmp, *lower, *upper; + mem_seed_t s; + int rid, to_add = 0; + s.rbeg = p->rm[k].rs; + if (p->rm[k].reverse) { + s.rbeg = (fmt->l_pac << 1) - 1 - s.rbeg; + } + tmp.pos = s.rbeg; + s.qbeg = p->info >> 32; + s.score = s.len = slen; + rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); + if (rid < 0) continue; + if (kb_size(tree)) + { + kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain + if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) + to_add = 1; + } + else + to_add = 1; + if (to_add) + { // add the seed as a new chain + tmp.n = 1; + tmp.m = 4; + tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); + tmp.seeds[0] = s; + tmp.rid = rid; + tmp.is_alt = !!bns->anns[rid].is_alt; + kb_putp(chn, tree, &tmp); + } + } + continue; + } step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1; for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { mem_chain_t tmp, *lower, *upper; @@ -389,14 +434,23 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference +// s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference + s.rbeg = tmp.pos = fmt_sa(fmt, p->x[0] + k); #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_bwt_sa, tmp_time); + __sync_fetch_and_add(&num_sa, 1); #endif s.qbeg = p->info >> 32; s.score= s.len = slen; +#ifdef SHOW_PERF + tmp_time = realtime_msec(); +#endif rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); +#ifdef SHOW_PERF + tmp_time = realtime_msec() - tmp_time; + __sync_fetch_and_add(&time_bns, tmp_time); +#endif if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary; TODO: split the seed; don't discard it!!! if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain diff --git a/bwt.c b/bwt.c index e53fe03..3255ae4 100644 --- a/bwt.c +++ b/bwt.c @@ -61,6 +61,10 @@ static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inver x = t2; +// bwtint_t cnt[4], cnt1; +// bwt_occ4(bwt, k, cnt); +// cnt1 = bwt_occ(bwt, k, x); + x = bwt->L2[x] + bwt_occ(bwt, k, x); // 获取x字符在k位置(后缀索引), return k == bwt->primary? 0 : x; } @@ -137,14 +141,14 @@ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) } /* without setting bwt->sa[0] = -1, the following line should be changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */ -#ifdef SHOW_PERF - int64_t tmp_time = realtime_msec(); -#endif +//#ifdef SHOW_PERF +// int64_t tmp_time = realtime_msec(); +//#endif sa += bwt_get_sa(bwt->sa, k / bwt->sa_intv); -#ifdef SHOW_PERF - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bwt_sa_read, tmp_time); -#endif +//#ifdef SHOW_PERF +// tmp_time = realtime_msec() - tmp_time; +// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time); +//#endif return sa; } @@ -354,7 +358,7 @@ static void bwt_reverse_intvs(bwtintv_v *p) int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret; - bwtintv_t ik, ok[4]; + bwtintv_t ik = {0}, ok[4] = {0}; bwtintv_v a[2], *prev, *curr, *swap; mem->n = 0; @@ -447,7 +451,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem) { int i, c; - bwtintv_t ik, ok[4]; + bwtintv_t ik = {0}, ok[4] = {0}; memset(mem, 0, sizeof(bwtintv_t)); if (q[x] > 3) return x + 1; diff --git a/bwt.h b/bwt.h index 158288e..341a1c8 100644 --- a/bwt.h +++ b/bwt.h @@ -59,8 +59,17 @@ typedef struct { uint8_t *sa; } bwt_t; +typedef struct { + int qs; // query start + int qe; // query end [) + int reverse; // 反向链 + uint32_t rs; // 参考基因的起始点(包含) +} ref_match_t; + typedef struct { bwtint_t x[3], info; + int num_match; + ref_match_t rm[2]; } bwtintv_t; typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; diff --git a/fastmap.c b/fastmap.c index 503ca15..888b0bf 100644 --- a/fastmap.c +++ b/fastmap.c @@ -48,13 +48,17 @@ int64_t time_ksw_extend2 = 0, time_ksw_global2 = 0, time_ksw_align2 = 0, time_bwt_smem1a = 0, + time_seed_1 = 0, + time_seed_2 = 0, + time_seed_3 = 0, time_fmt_smem_0 = 0, time_bwt_extend = 0, time_bwt_occ4 = 0, time_bwt_sa = 0, - time_bwt_sa_read = 0; + time_bwt_sa_read = 0, + time_bns = 0; -int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0; +int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0, num_sa = 0; FILE *fp1; @@ -381,7 +385,6 @@ int main_mem(int argc, char *argv[]) } } else update_a(opt, &opt0); bwa_fill_scmat(opt->a, opt->b, opt->mat); - fprintf(stderr, "zzh-1\n"); aux.idx = bwa_idx_load_from_shm(argv[optind]); if (aux.idx == 0) { if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak @@ -396,7 +399,6 @@ int main_mem(int argc, char *argv[]) if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]); return 1; } - fprintf(stderr, "zzh-2\n"); fp = gzdopen(fd, "r"); aux.ks = kseq_init(fp); if (optind + 2 < argc) { @@ -432,16 +434,21 @@ int main_mem(int argc, char *argv[]) #ifdef SHOW_PERF fprintf(stderr, "\n"); fprintf(stderr, "time_bwt_smem1a: %f s\n", time_bwt_smem1a / 1000.0 / opt->n_threads); + fprintf(stderr, "time_seed_1: %f s\n", time_seed_1 / 1000.0 / opt->n_threads); + fprintf(stderr, "time_seed_2: %f s\n", time_seed_2 / 1000.0 / opt->n_threads); + fprintf(stderr, "time_seed_3: %f s\n", time_seed_3 / 1000.0 / opt->n_threads); fprintf(stderr, "time_fmt_smem_0: %f s\n", time_fmt_smem_0 / 1000.0 / opt->n_threads); fprintf(stderr, "time_bwt_extend: %f s\n", time_bwt_extend / 1000.0 / opt->n_threads); fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads); fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads); fprintf(stderr, "time_bwt_sa_read: %f s\n", time_bwt_sa_read / 1000.0 / opt->n_threads); + fprintf(stderr, "time_bns: %f s\n", time_bns / 1000.0 / opt->n_threads); fprintf(stderr, "all seed num: %ld\n", nall); fprintf(stderr, "all seed num litter than 19: %ld\n", n19); fprintf(stderr, "all seed num litter than 18: %ld\n", n18); fprintf(stderr, "all seed num litter than 17: %ld\n", n17); fprintf(stderr, "all seed num litter than 16: %ld\n", n16); + fprintf(stderr, "get sa num: %ld\n", num_sa); fprintf(stderr, "\n"); fclose(fp1); diff --git a/fmt_idx.c b/fmt_idx.c index 8a3f834..70e05ad 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -357,7 +357,7 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t { uint32_t x = 0; uint32_t *p, *q, tmp; - bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK); + bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK); // cp = check point int i, ti = b1 << 2 | b2; cnt[0] = 0; cnt[1] = 0; @@ -462,7 +462,7 @@ inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwti #endif #endif bwtint_t tk[4], tl[4]; - bwtintv_t intv; + bwtintv_t intv = {0}; // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 fmt_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk); fmt_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl); @@ -629,95 +629,18 @@ inline static uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int return (~qbit) & ((1L << (kmer_len << 1)) - 1); } -// 当x为0,或者q[x-1]为N时,只需要前向搜索即可 -int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwtintv_v *mem) -{ -#ifdef SHOW_PERF -#if 1 - int64_t tmp_time = realtime_msec(); -#endif -#endif - int i, j = 1, ret, kmer_len; - const int min_intv = 1; - bwtintv_t ik, ok1, ok2; - uint32_t qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); - mem->n = 0; - - fmt_kmer_get(fmt, &ik, qbit, kmer_len - j++); - while (ik.x[2] == 0) { - fmt_kmer_get(fmt, &ik, qbit, kmer_len - j++); - } - if (j != 2) { // kmer hash没有找到对应的匹配 - ik.info = x + kmer_len - j + 2; - goto fmt_smem_forward_end; - } - ik.info = x + kmer_len; - - // 继续向前扩展 - for (i = x + kmer_len; i + 1 < len; i += 2) { - if (ik.x[2] < 2) - goto fmt_smem_forward_end; - if (q[i] < 4 && q[i + 1] < 4) // 两个都可以扩展 - { - fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); - if (ok2.x[2] >= min_intv) { // 可以继续扩展 - ik = ok2; ik.info = i + 2; - } else if (ok1.x[2] >= min_intv) { // 第二个间隔不够 - ik = ok1; ik.info = i + 1; - goto fmt_smem_forward_end; - } else { // 两个间隔都不够 - goto fmt_smem_forward_end; - } - } - else if (q[i] < 4) // q[i+1] >= 4,只能扩展一个 - { - fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); - if (ok1.x[2] >= min_intv) { - ik = ok1; ik.info = i + 1; - } - goto fmt_smem_forward_end; - } - else { // q[i] >= 4 - goto fmt_smem_forward_end; - } - } - if (i == len - 1) // 扩展到了最后一个碱基 - { - if (q[i] < 4) { - fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); - if (ok1.x[2] >= min_intv) { - ik = ok1; ik.info = i + 1; - } - } - } - -fmt_smem_forward_end: - ret = ik.info; - ik.info |= (uint64_t)x << 32; - if (fmt->bit_kmer_len <= (uint32_t)ik.info - (ik.info >> 32)) - kv_push(bwtintv_t, *mem, ik); -#ifdef SHOW_PERF -#if 1 - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_fmt_smem_0, tmp_time); -#endif -#endif - return ret; - //return len; -} - // 找smem(seed) int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, ret, kmer_len; - bwtintv_t ik, ok1, ok2; + bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0}; + bwtintv_t mt = {0}; bwtintv_v a[1], *curr; uint32_t qbit = 0; mem->n = 0; int only_forward = x == 0 || q[x - 1] > 3; if (q[x] > 3) return x + 1; - //if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展 if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 kv_init(a[0]); @@ -741,6 +664,8 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后 PUSH_VAL_AND_SKIP(ik); +#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3) + // 扩展kmer之后的碱基 for (i = (int)ik.info; i + 1 < len; i += 2) { // forward search @@ -749,6 +674,99 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); CHECK_INTV_CHANGE(ik, ok1, i + 1); CHECK_INTV_CHANGE(ik, ok2, i + 2); + // 在这里进行判断是否只有一个候选了 + //if (0) + if (min_intv < 2 && ok2.x[2] == min_intv) + { // 最多处理两个 + mt.num_match = min_intv; + int64_t r, rp; + int k; + int max_qs = 0, min_qe = len; + for (j = 0; j < ok2.x[2]; ++j) + { + rp = fmt_sa(fmt, ok2.x[0] + j); + r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp; + k = i + 2; + if (rp < fmt->l_pac) // 匹配到了正向链 + { + // 向前继续扩展 + r += i + 2 - x; + while (k < len && r < fmt->l_pac) { + int base = PAC_BASE(fmt->pac, r); + if (q[k] != base) + break; + ++k; + ++r; + } + mt.rm[j].qe = k; + mt.rm[j].reverse = 0; + // 向后扩展,x位置之前的碱基 + r -= k - x + 1; + k = x - 1; + while (k > -1 && r > -1) { + int base = PAC_BASE(fmt->pac, r); + if (q[k] != base) + break; + --k; + --r; + } + mt.rm[j].qs = k + 1; + mt.rm[j].rs = r + 1; + } + else // 匹配到了互补链 + { + r -= i + 2 - x; + while (k < len && r > -1) + { + const int base = 3 - PAC_BASE(fmt->pac, r); + if (q[k] != base) + break; + ++k; + --r; + } + mt.rm[j].qe = k; + mt.rm[j].reverse = 1; + // 扩展x之前的碱基 + r += k - x + 1; + k = x - 1; + while (k > -1 && r < fmt->l_pac) + { + const int base = 3 - PAC_BASE(fmt->pac, r); + if (q[k] != base) + break; + --k; + ++r; + } + mt.rm[j].qs = k + 1; + mt.rm[j].rs = r - 1; + } + max_qs = max_qs < mt.rm[j].qs ? mt.rm[j].qs : max_qs; + min_qe = min_qe > mt.rm[j].qe ? mt.rm[j].qe : min_qe; + } + if (min_intv > 1) + { + // 修正两个匹配的位置,使比对的碱基个数和位置一致 + for (j = 0; j < min_intv; ++j) + { + if (mt.rm[j].reverse) + mt.rm[j].rs -= max_qs - mt.rm[j].qs; + else + mt.rm[j].rs += max_qs - mt.rm[j].qs; + mt.rm[j].qs = max_qs; + mt.rm[j].qe = min_qe; + } + } + mt.info = mt.rm[0].qs; + mt.info = mt.info << 32 | mt.rm[0].qe; + mt.x[2] = min_intv; + kv_push(bwtintv_t, *mem, mt); + ret = (uint32_t)mt.info; + if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + { + goto fmt_smem_end; + } + goto backward_search; + } } else if (q[i] < 4) // q[i+1] >= 4 { fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); @@ -775,7 +793,8 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv backward_search: fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first - ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置 + if (mt.num_match == 0) + ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置 // 按照种子进行遍历,反向扩展 #define CHECK_ADD_MEM(pos, intv, mem) \ @@ -788,18 +807,8 @@ backward_search: { bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 uint64_t qbit = 0; - // if (p->info - x < fmt->bit_kmer_len) { - // qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer - // if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf)))) - // continue; - // } if (!only_forward && p->info - x < HASH_KMER_LEN) { - //qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer - //if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf)))) - // continue; - //qbit >>= (fmt->bit_kmer_len - HASH_KMER_LEN) << 1; - //kmer_len = kmer_len > HASH_KMER_LEN ? HASH_KMER_LEN : kmer_len; i = 1; do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv); if (i > 2) continue; @@ -859,7 +868,7 @@ fmt_smem_end: int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem) { int i, kmer_len; - bwtintv_t ik, ok1, ok2; + bwtintv_t ik = {0}, ok1={0}, ok2={0}; uint64_t qbit; @@ -905,4 +914,64 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); } return len; +} + +// 这里的k是bwt str的行 +inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_t *b1, uint8_t *b2) +{ + uint32_t *p; + uint8_t base2; + + // 第一步,找到check point位置 + p = fmt_occ_intv(fmt, k); // check point起始位置 + p += 20; // bwt碱基起始位置 + // 第二步,找到mid check point位置 + int mk = k & FMT_OCC_INTV_MASK; + int n_mintv = mk >> FMT_MID_INTV_SHIFT; + p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // 跳过mid间隔的bwt碱基位置 + // 第三步,找到具体的uint32_t + p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基(和8个倒数第二bwt碱基) + // 第四步,获取碱基 + + base2 = *p >> ((~(k) & 0x7) << 2) & 0xf; + *b2 = base2 >> 2 & 3; + *b1 = base2 & 3; +} + +// k, k1, k2都是bwt矩阵对应的行 +inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2) +{ + uint8_t b1, b2; + bwtint_t tk[4], kk; + kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1) + fmt_get_previous_base(fmt, kk, &b1, &b2); + fmt_e2_occ(fmt, k, b1, b2, tk); + *k1 = fmt->L2[b1] + tk[1]; + *k2 = fmt->L2[b2] + tk[3]; +} + +bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k) +{ + bwtint_t sa = 0, mask = fmt->sa_intv - 1; + bwtint_t k1, k2; + while (k & mask) + { + ++sa; + fmt_previous_line(fmt, k, &k1, &k2); + if (!(k1 & mask)) { + k = k1; + break; + } + ++sa; + k = k2; + } +//#ifdef SHOW_PERF +// int64_t tmp_time = realtime_msec(); +//#endif + sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv); +//#ifdef SHOW_PERF +// tmp_time = realtime_msec() - tmp_time; +// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time); +//#endif + return sa; } \ No newline at end of file diff --git a/fmt_idx.h b/fmt_idx.h index 07efff6..0252c8e 100644 --- a/fmt_idx.h +++ b/fmt_idx.h @@ -17,7 +17,7 @@ Date : 2023/12/24 #define FMT_OCC_INTERVAL (1 << FMT_OCC_INTV_SHIFT) #define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1) -#define FMT_MID_INTV_SHIFT 5 +#define FMT_MID_INTV_SHIFT 6 #define FMT_MID_INTERVAL (1 << FMT_MID_INTV_SHIFT) #define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1) @@ -91,8 +91,12 @@ typedef struct uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15 uint8_t first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3 uint8_t last_base; // dollar转换成的base + // ref pac相关 + bwtint_t l_pac; // 参考序列长度 + uint8_t *pac; // 保存2bit编码的参考序列 // 保存kmer对应的fmt位置信息 KmerHash kmer_hash; + // kmer_bit好像效果一般 uint16_t *kmer_bit; // 用来检测特定长度序列有没有fm-index匹配 int bit_kmer_len; // suffix array @@ -132,4 +136,7 @@ void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos); int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem); + +bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k); + #endif \ No newline at end of file diff --git a/run.sh b/run.sh index 610b9b0..241f1a6 100755 --- a/run.sh +++ b/run.sh @@ -4,10 +4,10 @@ thread=1 #n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq #n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq #reference=~/data/reference/human_g1k_v37_decoy.fasta -#n_r1=~/fastq/sn_r1.fq -#n_r2=~/fastq/sn_r2.fq n_r1=~/fastq/sn_r1.fq n_r2=~/fastq/sn_r2.fq +#n_r1=~/fastq/ssn_r1.fq +#n_r2=~/fastq/ssn_r2.fq #n_r1=~/fastq/tiny_n_r1.fq #n_r2=~/fastq/tiny_n_r2.fq #n_r1=~/fastq/diff_r1.fq diff --git a/utils.h b/utils.h index 882fa6b..b2c8b81 100644 --- a/utils.h +++ b/utils.h @@ -37,13 +37,17 @@ extern int64_t time_ksw_extend2, time_ksw_global2, time_ksw_align2, time_bwt_smem1a, + time_seed_1, + time_seed_2, + time_seed_3, time_fmt_smem_0, time_bwt_extend, time_bwt_occ4, time_bwt_sa, - time_bwt_sa_read; + time_bwt_sa_read, + time_bns; -extern int64_t dn, n16, n17, n18, n19, nall; +extern int64_t dn, n16, n17, n18, n19, nall, num_sa; extern FILE *fp1;