diff --git a/.gitignore b/.gitignore index 47e7738..fe9a09d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +output/ *.[oa] *.txt *.sam diff --git a/.vscode/launch.json b/.vscode/launch.json index 110a3b5..7c5134d 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -17,9 +17,9 @@ "-M", "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", - "~/reference/bwa/human_g1k_v37_decoy.fasta", - "~/data/fastq/dataset/na12878_wes_144/s_1.fq", - "~/data/fastq/dataset/na12878_wes_144/s_2.fq", + "~/data1/fmt_ref/human_g1k_v37_decoy.fasta", + "~/data/fastq/dataset/na12878_wes_144/1w_1.fq", + "~/data/fastq/dataset/na12878_wes_144/1w_2.fq", "-o", "/dev/null" ], diff --git a/.vscode/settings.json b/.vscode/settings.json index 691d63c..f62b57e 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -29,6 +29,12 @@ "scalar_sse.h": "c", "immintrin.h": "c", "ksw.h": "c", - "debug.h": "c" + "debug.h": "c", + "type_traits": "c", + "cstdint": "c", + "bitset": "c", + "iterator": "c", + "memory": "c", + "__locale": "c" } } \ No newline at end of file diff --git a/Makefile b/Makefile index da11aa1..fb6e41b 100644 --- a/Makefile +++ b/Makefile @@ -5,8 +5,8 @@ CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF -SHOW_DATA_PERF= #-DSHOW_DATA_PERF -FILTER_FULL_MATCH= -DFILTER_FULL_MATCH +SHOW_DATA_PERF= -DSHOW_DATA_PERF +FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH USE_MT_READ= -DUSE_MT_READ AR= ar diff --git a/bwamem.c b/bwamem.c index c1609ec..29237b6 100644 --- a/bwamem.c +++ b/bwamem.c @@ -142,7 +142,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len, kv_push(bwtintv_t, a->mem, *p); } } - //break; // for test full match time } else { ++x; if (start_flag) ++start_N_num; @@ -202,7 +201,6 @@ KBTREE_INIT(chn, mem_chain_t, chain_cmp) // return 1 if the seed is merged into the chain static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid) { - //return 0; int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; @@ -340,30 +338,10 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { mem_seed_t s; s.rbeg = fmt_sa(fmt, p->x[0] + k); - - //kv_push(uint64_t, v1, s.rbeg); - //fprintf(stderr, "%ld\t", s.rbeg); - //if (p->x[0] == 0) - // s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + p->x[2] - 1 - k); - //else - // s.rbeg = fmt_sa(fmt, p->x[0] + k); - //kv_push(uint64_t, v2, s.rbeg); - //fprintf(stderr, "%ld\t", s.rbeg); - //s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + k); - //fprintf(stderr, "%ld\n", s.rbeg); - //if (s.rbeg < fmt->l_pac) { // 在正链 - // s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg; - //} else { // 在反向互补链 - // s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg; - //} - //s.rbeg = fmt_sa(fmt, p->x[0] + k); - CHECK_ADD_CHAIN(tmp, lower, upper); } - //fprintf(stderr, "\n"); } } - //fprintf(stderr, "split\n"); kv_resize(mem_chain_t, chain, kb_size(tree)); #define traverse_func(p_) (chain.a[chain.n++] = *(p_)) @@ -374,28 +352,6 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); kb_destroy(chn, tree); -#if 0 - for (i = 0; i < chain.n; ++i) - { - fprintf(gf[0], "chain-begin: %d\n", i); - int j; - fprintf(gf[0], "chain-%ld %d\n", chain.a[i].pos, chain.a[i].rid); - for (j = 0; j < chain.a[i].n; ++j) { - fprintf(gf[0], "chain-%d %ld %d\n", chain.a[i].seeds[j].qbeg, chain.a[i].seeds[j].rbeg, chain.a[i].seeds[j].len); - } - fprintf(gf[0], "chain-end\n"); - } - fprintf(gf[0], "fun-end\n"); -#endif - // ks_introsort(uint64_sort, v1.n, v1.a); - // ks_introsort(uint64_sort, v2.n, v2.a); - // for (i = 0; i < v1.n; ++i) { - // //fprintf(stderr, "%ld\t%ld\n", v1.a[i], v2.a[i]); - // if (v1.a[i] != v2.a[i]) { - // fprintf(stderr, "diff: %ld\t%ld\n", v1.a[i], v2.a[i]); - // } - // } - return chain; } @@ -1270,21 +1226,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * return a; } -static void worker1(void *data, int i, int tid) -{ - mem_worker_t *w = (mem_worker_t*)data; - if (!(w->opt->flag&MEM_F_PE)) { - if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); - w->regs[i] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]); - } else { - if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); - w->regs[i<<1|0] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]); - if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); - w->regs[i<<1|1] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]); - } -} - -static void worker2(void *data, int i, int tid) +static void worker_sam(void *data, int i, int tid) { extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid); extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); @@ -1346,8 +1288,6 @@ mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const b } return w; } -#include "khash.h" -KHASH_MAP_INIT_INT64(seed, uint64_t); static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a, int tid) { @@ -1357,24 +1297,13 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in int max_seed_len = 0; int start_N_num = 0, start_flag = 1; smem->mem.n = 0; - //int s1_num = 0; // 1. first pass: find all SMEMs + // goto third_seed; PROF_START(seed_1); - // fprintf(gf[0], "read\n"); while (x < len) { if (seq[x] < 4) { start_flag = 0; -#ifdef DEBUG_FILE_OUTPUT -#ifdef COUNT_SEED_LENGTH - int last_x = x; -#endif -#endif x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]); -#ifdef DEBUG_FILE_OUTPUT -#ifdef COUNT_SEED_LENGTH - fprintf(gf[0], "%d\n", x - last_x); -#endif -#endif for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info >> 32); // seed length @@ -1383,82 +1312,38 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in kv_push(bwtintv_t, smem->mem, *p); } } -#ifdef COUNT_SEED_LENGTH - break; // for test full match time -#endif + // break; // for test } else { ++x; if (start_flag) ++start_N_num; } } - -#ifdef SHOW_DATA_PERF - gdat[0] += 1; // read num ++ - if (max_seed_len == len - start_N_num) - gdat[1] += 1; // seed-1 full match num ++ -#endif - PROF_END(tprof[T_SEED_1][tid], seed_1); - + //goto third_seed; #ifdef FILTER_FULL_MATCH if (max_seed_len == len - start_N_num) { - -#ifdef DEBUG_FILE_OUTPUT - if (start_N_num == 0) { -#ifdef GET_FULL_MATCH_READ - for (i = 0; i < len; ++i) - fprintf(gf[0], "%c", "ACGT"[seq[i]]); - fprintf(gf[0], "\n"); -#endif -#ifdef SHOW_DATA_PERF - gdat[2]++; - if (gdat[2] % 100 == 0) { - fprintf(stderr, "reads num: %ld\n", gdat[2]); - } -#endif - } -#endif - //goto third_seed; goto collect_intv_end; } #endif - bwtintv_v mem2 = {0}; // 2. second pass: find MEMs inside a long SMEM - uint32_v qev; PROF_START(seed_2); old_n = smem->mem.n; - //fprintf(gf[2], "seed-1\n"); for (k = 0; k < old_n; ++k) { bwtintv_t *p = &smem->mem.a[k]; int start = p->info>>32, end = (int32_t)p->info; if (end - start < split_len || p->x[2] > opt->split_width) continue; - //++ s1_num; - //fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", p->info >> 32, (int)p->info, p->x[0], p->x[1], p->x[2]); - // fprintf(gf[0], "start\n"); - fmt_smem_2(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0], qev); + fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]); for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info >> 32); -// fprintf(gf[0], "%d\n", slen); if (slen >= opt->min_seed_len) { kv_push(bwtintv_t, smem->mem, a->mem1.a[i]); - kv_push(bwtintv_t, mem2, a->mem1.a[i]); } } -// fprintf(gf[0], "end\n"); } - //fprintf(gf[2], "seed-2\n"); PROF_END(tprof[T_SEED_2][tid], seed_2); - //if (s1_num > 1) { - //ks_introsort(mem_intv, mem2.n, mem2.a); - //for (i = 0; i < mem2.n; ++i) { - // fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", mem2.a[i].info >> 32, (int)mem2.a[i].info, mem2.a[i].x[0], mem2.a[i].x[1], mem2.a[i].x[2]); - //} - //fprintf(gf[2], "\n"); - //} -// third_seed: -#if 1 +//third_seed: // 3. third pass: LAST-like PROF_START(seed_3); if (opt->max_mem_intv > 0) { @@ -1468,32 +1353,23 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in bwtintv_t m; x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); if (m.x[2] > 0) { - // fprintf(stderr, "3: %ld %ld %ld\n", m.info >> 32, m.info & ((1L << 32) - 1), m.x[2]); kv_push(bwtintv_t, smem->mem, m); } } else ++x; } } PROF_END(tprof[T_SEED_3][tid], seed_3); -#endif #ifdef FILTER_FULL_MATCH collect_intv_end: #endif // sort ks_introsort(mem_intv, smem->mem.n, smem->mem.a); - - - khash_t(str) * h; - //for (i = 0; i < smem->mem.n; ++i) { - // fprintf(stderr, "seed %d: %ld %ld %ld\n", i, smem->mem.a[i].x[0], smem->mem.a[i].x[1], smem->mem.a[i].x[2]); - //} - //fprintf(stderr, "split\n"); } void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv, int tid) { - int i; + int i, j; if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match mem_collect_intv_batch(opt, fmt, len, seq, smemv, aux, tid); @@ -1503,12 +1379,10 @@ void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t int step, count; // seed length int64_t k; if (p->num_match > 0) { - uint64_t pos = p->rm[0].rs; - if (p->rm[0].reverse) pos = (fmt->l_pac << 1) - 1 - pos; - kv_push(uint64_t, smemv->pos_arr, pos); - if (p->num_match > 1) { - uint64_t pos = p->rm[1].rs; - if (p->rm[1].reverse) pos = (fmt->l_pac << 1) - 1 - pos; + for (j = 0; j < p->num_match; ++j) + { + uint64_t pos = p->rm[j].rs; + if (p->rm[j].reverse) pos = (fmt->l_pac << 1) - 1 - pos; kv_push(uint64_t, smemv->pos_arr, pos); } } else { @@ -1575,14 +1449,11 @@ void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *b int step, count, slen = (uint32_t)p->info - (p->info >> 32); // seed length int64_t k; if (p->num_match > 0) { - mem_seed_t s; - s.rbeg = smemv->pos_arr.a[j++]; - //fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info>>32, (uint32_t)p->info); - CHECK_ADD_CHAIN(tmp, lower, upper); - if (p->num_match > 1) { + + for (k = 0; k < p->num_match; ++k) + { mem_seed_t s; s.rbeg = smemv->pos_arr.a[j++]; - //fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info >> 32, (uint32_t)p->info); CHECK_ADD_CHAIN(tmp, lower, upper); } } else { @@ -1740,36 +1611,6 @@ static void worker_smem_align(void *data, int i, int tid) mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid); } -static void worker_sam(void *data, int i, int tid) -{ - extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid); - extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); - extern void mem_matesw_batch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t *s, mem_alnreg_v *a, int data_size); - mem_worker_t *w = (mem_worker_t*)data; - int start = i*w->opt->batch_size; - int end = MIN(start + w->opt->batch_size, w->n_reads); - - if (!(w->opt->flag&MEM_F_PE)) { - for (i=start; i= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); - mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); - if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]); - mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, &w->sams[i]); - free(w->regs[i].a); - } - } else { - if (!(w->opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment - mem_matesw_batch(w->opt, w->bns, w->pac, w->pes, &w->seqs[start], &w->regs[start], end-start); - } - - for (i=start; i= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i].name); - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed+i)>>1, &w->seqs[i], &w->regs[i], &w->sams[i], tid); - free(w->regs[i].a); free(w->regs[i+1].a); - } - } -} - void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, seq_sam_t *sams) { extern void kt_for(int n_threads, void (*func)(void *, int, int), void *data, int n); @@ -1801,7 +1642,6 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed PROF_END(gprof[G_MEM_PREPARE], mem_prepare); PROF_START(kernel); - // kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions PROF_END(gprof[G_MEM_KERNEL], kernel); @@ -1814,8 +1654,7 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed PROF_END(gprof[G_MEM_PESTAT], pestat); PROF_START(mem_sam); - kt_for(opt->n_threads, worker2, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment - //kt_for(opt->n_threads, worker_sam, w, n_batch); + kt_for(opt->n_threads, worker_sam, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment PROF_END(gprof[G_MEM_SAM], mem_sam); //free(w.regs); diff --git a/bwt.c b/bwt.c index 7020456..4b44bb1 100644 --- a/bwt.c +++ b/bwt.c @@ -402,28 +402,28 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b } // 创建正向的kmer -inline uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed) +inline uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed) { - uint32_t qbit = 0, i; + uint64_t qbit = 0, i; qlen = qlen < kmer_len ? qlen : kmer_len; for (i = 0; i < qlen; ++i) { if (q[i] > 3) break; // 要考虑碱基是N - qbit |= q[i] << ((kmer_len - 1 - i) << 1); + qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1); } *base_consumed = i; return qbit; } // 创建f反向的kmer -inline uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed) +inline uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed) { - uint32_t qbit = 0; + uint64_t qbit = 0; int i, j, end_pos; end_pos = start_pos - kmer_len; end_pos = end_pos < 0 ? -1 : end_pos; for (i = start_pos, j = 0; i > end_pos; --i, ++j) { if (q[i] > 3) break; // 要考虑碱基是N - qbit |= q[i] << ((kmer_len - 1 - j) << 1); + qbit |= (uint64_t)q[i] << ((kmer_len - 1 - j) << 1); } *base_consumed = start_pos - i; return (~qbit) & ((1L << (kmer_len << 1)) - 1); diff --git a/bwt.h b/bwt.h index cb76989..036a71a 100644 --- a/bwt.h +++ b/bwt.h @@ -152,8 +152,8 @@ extern "C" { void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos); void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos); - uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed); - uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed); + uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed); + uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed); // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values void bwt_gen_cnt_table(bwt_t *bwt); diff --git a/fastmap.c b/fastmap.c index 0529960..dc32f68 100644 --- a/fastmap.c +++ b/fastmap.c @@ -656,7 +656,7 @@ int main_mem(int argc, char *argv[]) #ifdef SHOW_DATA_PERF fprintf(stderr, "\n"); - fprintf(stderr, "full_match: %ld, all: %ld\n", gdat[1], gdat[0]); + // fprintf(stderr, "full_match: %ld, all: %ld\n", gdat[1], gdat[0]); fprintf(stderr, "\n"); #endif diff --git a/fmt_idx.c b/fmt_idx.c index 3290853..55efc8a 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -837,8 +837,6 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int PUSH_VAL_AND_SKIP_FORWARD(ik); // 扩展kmer之后的碱基 - // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); - // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); for (i = (int)ik.info; i + 1 < len; i += 2) { // forward search if (q[i] < 4 && q[i + 1] < 4) @@ -846,20 +844,11 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); + #if 1 if (min_intv == 1 && ok2.x[2] == min_intv) { -#ifdef DEBUG_FILE_OUTPUT -#ifdef COUNT_SEED_LENGTH - fprintf(gf[0], "%d\t", i + 2 - x); -#endif -#endif direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0); -#ifdef DEBUG_FILE_OUTPUT -#if 0 - fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); -#endif -#endif kv_push(bwtintv_t, *mem, mt); ret = (uint32_t)mt.info; goto fmt_smem_forward_end; @@ -939,7 +928,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置 ik.info = x + 1; - //int print_flag = 0; // check change of the interval size and whether the interval size is too small to be extended further #define CHECK_INTV_CHANGE(iv, ov, end_pos) \ if (ov.x[2] != iv.x[2]) \ @@ -958,22 +946,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv goto backward_search; \ } while (0) -#define DIRECT_EXTEND_2_SEED(idx0, idx1, qspos) \ - mt.rm[idx1].qs = MAX(mt.rm[idx0].qs, lm->rm[0].qs); \ - mt.rm[idx1].qe = MIN(mt.rm[idx0].qe, lm->rm[0].qe); \ - mt.rm[idx0].qs = mt.rm[idx1].qs; \ - mt.rm[idx0].qe = mt.rm[idx1].qe; \ - mt.rm[idx1].reverse = lm->rm[0].reverse; \ - left_ext = qspos - mt.rm[0].qs; \ - if (mt.rm[0].reverse) \ - mt.rm[0].rs = (fmt->l_pac << 1) - 1 - (s1 - left_ext); \ - else \ - mt.rm[0].rs = s1 - left_ext; \ - if (mt.rm[1].reverse) \ - mt.rm[1].rs = (fmt->l_pac << 1) - 1 - (s2 - left_ext); \ - else \ - mt.rm[1].rs = s2 - left_ext; - // 处理kmer对应的匹配信息 for (curr->n = 0, j = 1; j < kmer_len; ++j) { @@ -984,8 +956,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv PUSH_VAL_AND_SKIP(ik); // 扩展kmer之后的碱基 - // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); - // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); for (i = (int)ik.info; i + 1 < len; i += 2) { // forward search if (q[i] < 4 && q[i + 1] < 4) @@ -1000,49 +970,12 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv if (min_intv == 1 && ok2.x[2] == min_intv) { direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0); - //fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len ret = (uint32_t)mt.info; if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end; goto backward_search; } -#if 0 - // 判断只有两个候选的情况 - else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) { - bwtint_t s1 = fmt_sa(fmt, ok2.x[0]); - bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1); - int left_ext; - bwtint_t lmrp = lm->rm[0].rs; - if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp; - - if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集 - direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1); - DIRECT_EXTEND_2_SEED(1, 0, x); - } - else - { - direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0); - DIRECT_EXTEND_2_SEED(0, 1, x); - } - mt.info = mt.rm[0].qs; - mt.info = mt.info << 32 | mt.rm[0].qe; - mt.num_match = 2; - mt.x[2] = 2; - kv_push(bwtintv_t, *mem, mt); - ret = (uint32_t)mt.info; - - //fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse); - //fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2); - //s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1; - //s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2; - //fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2); - - if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) - goto fmt_smem_end; - goto backward_search; - } -#endif #endif } else if (q[i] < 4) // q[i+1] >= 4 @@ -1082,11 +1015,7 @@ backward_search: (intv).info |= (uint64_t)(pos) << 32; \ kv_push(bwtintv_t, *mem, (intv)); \ } - // if (flag) - //{ - // fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1)); - // flag = 0; - // } + #define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \ if (ok.x[2] < min_intv) \ { \ @@ -1098,8 +1027,6 @@ backward_search: for (j = 0; j < curr->n; ++j) { bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 - // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); - // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); if (p->info - x < HASH_KMER_LEN) { if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4) @@ -1127,7 +1054,6 @@ backward_search: { if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展 { - // fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2); @@ -1136,77 +1062,9 @@ backward_search: CHECK_INTV_ADD_MEM(ok2, i, ok1, mem); ok2.info = p->info; *p = ok2; -#if 0 - // 在这里进行判断是否只有一个候选了 - if (min_intv == 1 && ok2.x[2] == min_intv) - { - int qs = i - 1; - int qe = (int)ok2.info; - if (mem->n > 0) - { - int qsl = (int)(mem->a[mem->n - 1].info >> 32); - int qel = (int)(mem->a[mem->n - 1].info); - if (qsl <= qs && qel >= qe) - break; - } - direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0); - // fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); - kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len - if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) - goto fmt_smem_end; - break; - } -#endif -#if 0 - if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len) - { - int qs = i - 1; - int qe = (int)ok2.info; - if (mem->n > 0) - { - int qsl = (int)(mem->a[mem->n - 1].info >> 32); - int qel = (int)(mem->a[mem->n - 1].info); - if (qsl <= qs && qel >= qe) - break; - } -#if 1 - int left_ext; - bwtint_t s1 = fmt_sa(fmt, ok2.x[0]); - bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1); - - bwtint_t lmrp = lm->rm[0].rs; - if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp; - - if (lmrp + qs == s1 + lm->rm[0].qs) - { // s1是lm的子集 - direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1); - DIRECT_EXTEND_2_SEED(1, 0, qs); - } - else - { - direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0); - DIRECT_EXTEND_2_SEED(0, 1, qs); - } - mt.info = mt.rm[0].qs; - mt.info = mt.info << 32 | mt.rm[0].qe; - mt.num_match = 2; - mt.x[2] = 2; - kv_push(bwtintv_t, *mem, mt); - // fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse); - // fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2); - // s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1; - // s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2; - // fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2); - if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) - goto fmt_smem_end; - break; -#endif - } -#endif } else if (q[i] < 4) // 只能扩展一个 { - // fmt_extend1(fmt, p, &ok1, 1, q[i]); fmt_direct_extend1(fmt, p, &ok1, 1, q[i]); CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); ok1.info = p->info; @@ -1223,7 +1081,6 @@ backward_search: { // 扩展到了第一个碱基 if (q[i] < 4) { - // fmt_extend1(fmt, p, &ok1, 1, q[i]); fmt_direct_extend1(fmt, p, &ok1, 1, q[i]); CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); ok1.info = p->info; @@ -1243,316 +1100,6 @@ backward_search: } fmt_smem_end: - //if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n"); - fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate - return ret; -} - -// 找smem(seed)(lm: long_smem) -int fmt_smem_2(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec, uint32_v qev) -{ - // int flag = 0; - int i, j, ret, kmer_len; - bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0}; - bwtintv_t mt = {0}; - bwtintv_v *curr; - uint32_t qbit = 0; - mem->n = 0; - curr = tmpvec; // use the temporary vector if provided - - qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); - bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置 - ik.info = x + 1; - //int print_flag = 0; -// check change of the interval size and whether the interval size is too small to be extended further -#define CHECK_INTV_CHANGE_2(iv, ov, end_pos) \ - if (ov.x[2] != iv.x[2]) \ - { \ - \ - kv_push(bwtintv_t, *curr, iv); \ - if (ov.x[2] < min_intv) \ - break; \ - } \ - iv = ov; \ - iv.info = end_pos - -#define PUSH_VAL_AND_SKIP_2(iv) \ - do \ - { \ - kv_push(bwtintv_t, *curr, iv); \ - goto backward_search; \ - } while (0) - - // 处理kmer对应的匹配信息 - for (curr->n = 0, j = 1; j < kmer_len; ++j) - { - bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j); - CHECK_INTV_CHANGE_2(ik, ok1, x + j + 1); - } - if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后 - PUSH_VAL_AND_SKIP_2(ik); - - // 扩展kmer之后的碱基 - // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); - // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); - for (i = (int)ik.info; i + 1 < len; i += 2) - { // forward search - if (q[i] < 4 && q[i + 1] < 4) - { - fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); - CHECK_INTV_CHANGE_2(ik, ok1, i + 1); - CHECK_INTV_CHANGE_2(ik, ok2, i + 2); -#if 1 - // 在这里进行判断是否只有一个候选了 - if (min_intv == 1 && ok2.x[2] == min_intv) - { - direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0); - //fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); - kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len - ret = (uint32_t)mt.info; - if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) - goto fmt_smem_end; - goto backward_search; - } -#if 1 - // 判断只有两个候选的情况 - else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) { - bwtint_t s1 = fmt_sa(fmt, ok2.x[0]); - bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1); - int left_ext; - bwtint_t lmrp = lm->rm[0].rs; - if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp; - - if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集 - direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1); - DIRECT_EXTEND_2_SEED(1, 0, x); - } - else - { - direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0); - DIRECT_EXTEND_2_SEED(0, 1, x); - } - mt.info = mt.rm[0].qs; - mt.info = mt.info << 32 | mt.rm[0].qe; - mt.num_match = 2; - mt.x[2] = 2; - kv_push(bwtintv_t, *mem, mt); - ret = (uint32_t)mt.info; - - //fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse); - //fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2); - //s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1; - //s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2; - //fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2); - - if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) - goto fmt_smem_end; - goto backward_search; - } -#endif -#endif - } - else if (q[i] < 4) // q[i+1] >= 4 - { - fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); - CHECK_INTV_CHANGE_2(ik, ok1, i + 1); - PUSH_VAL_AND_SKIP_2(ik); - } - else // q[i] >= 4 - { - PUSH_VAL_AND_SKIP_2(ik); - } - } - for (; i == len - 1; ++i) // 扩展到了最后一个碱基 - { - if (q[i] < 4) - { - fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); - CHECK_INTV_CHANGE_2(ik, ok1, i + 1); - } - else - PUSH_VAL_AND_SKIP_2(ik); - } - if (i == len) - kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end - -backward_search: - fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first - if (mt.num_match == 0) - ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置 - else - ret = (uint32_t)mt.info; - // 按照种子进行遍历,反向扩展 -#define CHECK_ADD_MEM(pos, intv, mem) \ - if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) \ - { \ - (intv).info |= (uint64_t)(pos) << 32; \ - kv_push(bwtintv_t, *mem, (intv)); \ - } - // if (flag) - //{ - // fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1)); - // flag = 0; - // } -#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \ - if (ok.x[2] < min_intv) \ - { \ - CHECK_ADD_MEM(pos, intv, mem); \ - break; \ - } - - int last_kmer_start = 0; - for (j = 0; j < curr->n; ++j) - { - bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 - // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); - // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); - if (p->info - x < HASH_KMER_LEN) - { - if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4) - qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer - else - qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer - last_kmer_start = p->info - 1; - i = 1; - do - { - bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++); - } while (ik.x[2] < min_intv); - if (i > 2) - continue; - p->x[0] = ik.x[1]; - p->x[1] = ik.x[0]; - p->x[2] = ik.x[2]; - i = p->info - (kmer_len - i + 3); - } - else - { - i = x - 1; - } - for (; i > 0; i -= 2) - { - if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展 - { - // fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); - fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2); - CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); - ok1.info = p->info; - CHECK_INTV_ADD_MEM(ok2, i, ok1, mem); - ok2.info = p->info; - *p = ok2; -#if 0 - // 在这里进行判断是否只有一个候选了 - if (min_intv == 1 && ok2.x[2] == min_intv) - { - int qs = i - 1; - int qe = (int)ok2.info; - if (mem->n > 0) - { - int qsl = (int)(mem->a[mem->n - 1].info >> 32); - int qel = (int)(mem->a[mem->n - 1].info); - if (qsl <= qs && qel >= qe) - break; - } - direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0); - // fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); - kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len - if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) - goto fmt_smem_end; - break; - } -#endif -#if 0 - if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len) - { - int qs = i - 1; - int qe = (int)ok2.info; - if (mem->n > 0) - { - int qsl = (int)(mem->a[mem->n - 1].info >> 32); - int qel = (int)(mem->a[mem->n - 1].info); - if (qsl <= qs && qel >= qe) - break; - } -#if 1 - int left_ext; - bwtint_t s1 = fmt_sa(fmt, ok2.x[0]); - bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1); - - bwtint_t lmrp = lm->rm[0].rs; - if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp; - - if (lmrp + qs == s1 + lm->rm[0].qs) - { // s1是lm的子集 - direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1); - DIRECT_EXTEND_2_SEED(1, 0, qs); - } - else - { - direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0); - DIRECT_EXTEND_2_SEED(0, 1, qs); - } - mt.info = mt.rm[0].qs; - mt.info = mt.info << 32 | mt.rm[0].qe; - mt.num_match = 2; - mt.x[2] = 2; - kv_push(bwtintv_t, *mem, mt); - // fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse); - // fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2); - // s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1; - // s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2; - // fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2); - if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) - goto fmt_smem_end; - break; -#endif - } -#endif - } - else if (q[i] < 4) // 只能扩展一个 - { - // fmt_extend1(fmt, p, &ok1, 1, q[i]); - fmt_direct_extend1(fmt, p, &ok1, 1, q[i]); - CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); - ok1.info = p->info; - CHECK_ADD_MEM(i, ok1, mem); - goto fmt_smem_end; - } - else - { // 不能扩展 - CHECK_ADD_MEM(i + 1, *p, mem); - goto fmt_smem_end; - } - } - for (; i == 0; --i) - { // 扩展到了第一个碱基 - if (q[i] < 4) - { - // fmt_extend1(fmt, p, &ok1, 1, q[i]); - fmt_direct_extend1(fmt, p, &ok1, 1, q[i]); - CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); - ok1.info = p->info; - *p = ok1; - } - else - { - CHECK_ADD_MEM(i + 1, *p, mem); - goto fmt_smem_end; - } - } - if (i == -1) - { - CHECK_ADD_MEM(i + 1, *p, mem); - goto fmt_smem_end; - } - } - -fmt_smem_end: - //if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n"); fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate return ret; } @@ -1572,9 +1119,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - 1); // 初始碱基位置 ik.info = x + kmer_len; - // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); - // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); - #define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \ if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \ { \ @@ -1590,7 +1134,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in { if (q[i] < 4 && q[i + 1] < 4) { - // fmt_direct_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); @@ -1610,18 +1153,15 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in if (q[i] < 4 && q[i + 1] < 4) { fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); - // fmt_direct2_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len); ik = ok2; - //fprintf(stderr, "d: %d %ld\n", i, ok2.x[2]); } else if (q[i] < 4) // q[i+1] >= 4 { fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); - //fprintf(stderr, "d: %d %ld\n", i, ok1.x[2]); COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); return i + 2; } @@ -1633,7 +1173,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in if (i == len - 1 && q[i] < 4) { fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); - //fprintf(stderr, "d: %d %ld\n", i, ok1.x[2]); COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); } return len; @@ -1659,18 +1198,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_ *b1 = base2 & 3; } -// k, k1, k2都是bwt矩阵对应的行 -inline static void fmt_previous_line_old(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2) -{ - uint8_t b1, b2; - bwtint_t tk[4], kk; - kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1) - fmt_get_previous_base(fmt, kk, &b1, &b2); - fmt_e2_occ(fmt, k, b1, b2, tk); - *k1 = fmt->L2[b1] + tk[1]; - *k2 = fmt->L2[b2] + tk[3]; -} - inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2) { uint8_t b1, b2; @@ -1741,7 +1268,6 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k) { ++sa; fmt_previous_line(fmt, k, &k1, &k2); - //fmt_previous_line_old(fmt, k, &k1, &k2); __builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2); if (!(k1 & mask)) { diff --git a/profiling.c b/profiling.c index 223502e..0689c79 100644 --- a/profiling.c +++ b/profiling.c @@ -106,5 +106,6 @@ int display_stats(int nthreads) fprintf(stderr, "\n"); #endif + return 1; } \ No newline at end of file diff --git a/profiling.h b/profiling.h index 4ad868d..1ab4fcc 100644 --- a/profiling.h +++ b/profiling.h @@ -26,8 +26,8 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE]; #endif #ifdef SHOW_DATA_PERF -extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0}; -extern int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0}; +extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD]; +extern int64_t gdat[LIM_GLOBAL_DATA_TYPE]; #endif #ifdef SHOW_PERF @@ -81,7 +81,10 @@ enum T_BSW, T_BSW_ALL, T_SAM_MATESW, - T_SAM_REG2ALN + T_SAM_REG2ALN, + T_SEED_3_1, + T_SEED_3_2, + T_SEED_3_3 }; int display_stats(int); diff --git a/run.sh b/run.sh index dd64e64..594530e 100755 --- a/run.sh +++ b/run.sh @@ -1,6 +1,8 @@ thread=1 -## d1 +## d1 k18<=4 89% +#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq +#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq #n_r1=~/data/fastq/dataset/na12878_wes_144/1w_1.fq #n_r2=~/data/fastq/dataset/na12878_wes_144/1w_2.fq n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq @@ -9,22 +11,22 @@ n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq #n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq #n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz #n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz -## d2 +## d2 <= 4 87% #n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq #n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq #n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq #n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq -# d3 +# d3 <= 4 77% #n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq #n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq #n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz #n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz -## d4 +## d4 <= 4 93% #n_r1=~/data/fastq/dataset/zy_wes/s_1.fq #n_r2=~/data/fastq/dataset/zy_wes/s_2.fq #n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz #n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz -## d5 +## d5 <= 4 80% #n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz #n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz #n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq @@ -38,7 +40,7 @@ reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta #out=~/data1/out-u8-1.sam #out=~/data1/out-i16.sam -out=~/data1/out.sam +out=~/data1/fmt-out.sam #out=/dev/null time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ diff --git a/seed2.py b/seed2.py new file mode 100755 index 0000000..b990c89 --- /dev/null +++ b/seed2.py @@ -0,0 +1,15 @@ +#!/bin/env python3 +import sys +import os + +back_num = 0 +end_num = 0 +seed2_file = sys.argv[1] +with open(seed2_file, 'r') as f: + for line in f: + if line.startswith("back"): + back_num += int(line.split(' ')[1]) + elif line.startswith("end"): + end_num += int(line.split(' ')[1]) + +print(back_num, end_num, end_num / back_num) \ No newline at end of file