clean代码,结果正确

This commit is contained in:
zzh 2024-07-26 04:15:55 +08:00
parent 8c2a90e3e2
commit 9c9502d584
13 changed files with 69 additions and 676 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
output/
*.[oa]
*.txt
*.sam

6
.vscode/launch.json vendored
View File

@ -17,9 +17,9 @@
"-M",
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/reference/bwa/human_g1k_v37_decoy.fasta",
"~/data/fastq/dataset/na12878_wes_144/s_1.fq",
"~/data/fastq/dataset/na12878_wes_144/s_2.fq",
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta",
"~/data/fastq/dataset/na12878_wes_144/1w_1.fq",
"~/data/fastq/dataset/na12878_wes_144/1w_2.fq",
"-o",
"/dev/null"
],

View File

@ -29,6 +29,12 @@
"scalar_sse.h": "c",
"immintrin.h": "c",
"ksw.h": "c",
"debug.h": "c"
"debug.h": "c",
"type_traits": "c",
"cstdint": "c",
"bitset": "c",
"iterator": "c",
"memory": "c",
"__locale": "c"
}
}

View File

@ -5,8 +5,8 @@ CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= -DSHOW_PERF
SHOW_DATA_PERF= #-DSHOW_DATA_PERF
FILTER_FULL_MATCH= -DFILTER_FULL_MATCH
SHOW_DATA_PERF= -DSHOW_DATA_PERF
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
USE_MT_READ= -DUSE_MT_READ
AR= ar

191
bwamem.c
View File

@ -142,7 +142,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len,
kv_push(bwtintv_t, a->mem, *p);
}
}
//break; // for test full match time
} else {
++x;
if (start_flag) ++start_N_num;
@ -202,7 +201,6 @@ KBTREE_INIT(chn, mem_chain_t, chain_cmp)
// return 1 if the seed is merged into the chain
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid)
{
//return 0;
int64_t qend, rend, x, y;
const mem_seed_t *last = &c->seeds[c->n-1];
qend = last->qbeg + last->len;
@ -340,30 +338,10 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
mem_seed_t s;
s.rbeg = fmt_sa(fmt, p->x[0] + k);
//kv_push(uint64_t, v1, s.rbeg);
//fprintf(stderr, "%ld\t", s.rbeg);
//if (p->x[0] == 0)
// s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + p->x[2] - 1 - k);
//else
// s.rbeg = fmt_sa(fmt, p->x[0] + k);
//kv_push(uint64_t, v2, s.rbeg);
//fprintf(stderr, "%ld\t", s.rbeg);
//s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + k);
//fprintf(stderr, "%ld\n", s.rbeg);
//if (s.rbeg < fmt->l_pac) { // 在正链
// s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg;
//} else { // 在反向互补链
// s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg;
//}
//s.rbeg = fmt_sa(fmt, p->x[0] + k);
CHECK_ADD_CHAIN(tmp, lower, upper);
}
//fprintf(stderr, "\n");
}
}
//fprintf(stderr, "split\n");
kv_resize(mem_chain_t, chain, kb_size(tree));
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
@ -374,28 +352,6 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
kb_destroy(chn, tree);
#if 0
for (i = 0; i < chain.n; ++i)
{
fprintf(gf[0], "chain-begin: %d\n", i);
int j;
fprintf(gf[0], "chain-%ld %d\n", chain.a[i].pos, chain.a[i].rid);
for (j = 0; j < chain.a[i].n; ++j) {
fprintf(gf[0], "chain-%d %ld %d\n", chain.a[i].seeds[j].qbeg, chain.a[i].seeds[j].rbeg, chain.a[i].seeds[j].len);
}
fprintf(gf[0], "chain-end\n");
}
fprintf(gf[0], "fun-end\n");
#endif
// ks_introsort(uint64_sort, v1.n, v1.a);
// ks_introsort(uint64_sort, v2.n, v2.a);
// for (i = 0; i < v1.n; ++i) {
// //fprintf(stderr, "%ld\t%ld\n", v1.a[i], v2.a[i]);
// if (v1.a[i] != v2.a[i]) {
// fprintf(stderr, "diff: %ld\t%ld\n", v1.a[i], v2.a[i]);
// }
// }
return chain;
}
@ -1270,21 +1226,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
return a;
}
static void worker1(void *data, int i, int tid)
{
mem_worker_t *w = (mem_worker_t*)data;
if (!(w->opt->flag&MEM_F_PE)) {
if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name);
w->regs[i] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]);
} else {
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name);
w->regs[i<<1|0] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]);
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name);
w->regs[i<<1|1] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]);
}
}
static void worker2(void *data, int i, int tid)
static void worker_sam(void *data, int i, int tid)
{
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
@ -1346,8 +1288,6 @@ mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const b
}
return w;
}
#include "khash.h"
KHASH_MAP_INIT_INT64(seed, uint64_t);
static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a, int tid)
{
@ -1357,24 +1297,13 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
int max_seed_len = 0;
int start_N_num = 0, start_flag = 1;
smem->mem.n = 0;
//int s1_num = 0;
// 1. first pass: find all SMEMs
// goto third_seed;
PROF_START(seed_1);
// fprintf(gf[0], "read\n");
while (x < len) {
if (seq[x] < 4) {
start_flag = 0;
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_SEED_LENGTH
int last_x = x;
#endif
#endif
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_SEED_LENGTH
fprintf(gf[0], "%d\n", x - last_x);
#endif
#endif
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
@ -1383,82 +1312,38 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
kv_push(bwtintv_t, smem->mem, *p);
}
}
#ifdef COUNT_SEED_LENGTH
break; // for test full match time
#endif
// break; // for test
} else {
++x;
if (start_flag) ++start_N_num;
}
}
#ifdef SHOW_DATA_PERF
gdat[0] += 1; // read num ++
if (max_seed_len == len - start_N_num)
gdat[1] += 1; // seed-1 full match num ++
#endif
PROF_END(tprof[T_SEED_1][tid], seed_1);
//goto third_seed;
#ifdef FILTER_FULL_MATCH
if (max_seed_len == len - start_N_num) {
#ifdef DEBUG_FILE_OUTPUT
if (start_N_num == 0) {
#ifdef GET_FULL_MATCH_READ
for (i = 0; i < len; ++i)
fprintf(gf[0], "%c", "ACGT"[seq[i]]);
fprintf(gf[0], "\n");
#endif
#ifdef SHOW_DATA_PERF
gdat[2]++;
if (gdat[2] % 100 == 0) {
fprintf(stderr, "reads num: %ld\n", gdat[2]);
}
#endif
}
#endif
//goto third_seed;
goto collect_intv_end;
}
#endif
bwtintv_v mem2 = {0};
// 2. second pass: find MEMs inside a long SMEM
uint32_v qev;
PROF_START(seed_2);
old_n = smem->mem.n;
//fprintf(gf[2], "seed-1\n");
for (k = 0; k < old_n; ++k) {
bwtintv_t *p = &smem->mem.a[k];
int start = p->info>>32, end = (int32_t)p->info;
if (end - start < split_len || p->x[2] > opt->split_width) continue;
//++ s1_num;
//fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", p->info >> 32, (int)p->info, p->x[0], p->x[1], p->x[2]);
// fprintf(gf[0], "start\n");
fmt_smem_2(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0], qev);
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info >> 32);
// fprintf(gf[0], "%d\n", slen);
if (slen >= opt->min_seed_len) {
kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
kv_push(bwtintv_t, mem2, a->mem1.a[i]);
}
}
// fprintf(gf[0], "end\n");
}
//fprintf(gf[2], "seed-2\n");
PROF_END(tprof[T_SEED_2][tid], seed_2);
//if (s1_num > 1) {
//ks_introsort(mem_intv, mem2.n, mem2.a);
//for (i = 0; i < mem2.n; ++i) {
// fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", mem2.a[i].info >> 32, (int)mem2.a[i].info, mem2.a[i].x[0], mem2.a[i].x[1], mem2.a[i].x[2]);
//}
//fprintf(gf[2], "\n");
//}
// third_seed:
#if 1
//third_seed:
// 3. third pass: LAST-like
PROF_START(seed_3);
if (opt->max_mem_intv > 0) {
@ -1468,32 +1353,23 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
bwtintv_t m;
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
if (m.x[2] > 0) {
// fprintf(stderr, "3: %ld %ld %ld\n", m.info >> 32, m.info & ((1L << 32) - 1), m.x[2]);
kv_push(bwtintv_t, smem->mem, m);
}
} else ++x;
}
}
PROF_END(tprof[T_SEED_3][tid], seed_3);
#endif
#ifdef FILTER_FULL_MATCH
collect_intv_end:
#endif
// sort
ks_introsort(mem_intv, smem->mem.n, smem->mem.a);
khash_t(str) * h;
//for (i = 0; i < smem->mem.n; ++i) {
// fprintf(stderr, "seed %d: %ld %ld %ld\n", i, smem->mem.a[i].x[0], smem->mem.a[i].x[1], smem->mem.a[i].x[2]);
//}
//fprintf(stderr, "split\n");
}
void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv, int tid)
{
int i;
int i, j;
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
mem_collect_intv_batch(opt, fmt, len, seq, smemv, aux, tid);
@ -1503,12 +1379,10 @@ void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t
int step, count; // seed length
int64_t k;
if (p->num_match > 0) {
uint64_t pos = p->rm[0].rs;
if (p->rm[0].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
kv_push(uint64_t, smemv->pos_arr, pos);
if (p->num_match > 1) {
uint64_t pos = p->rm[1].rs;
if (p->rm[1].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
for (j = 0; j < p->num_match; ++j)
{
uint64_t pos = p->rm[j].rs;
if (p->rm[j].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
kv_push(uint64_t, smemv->pos_arr, pos);
}
} else {
@ -1575,14 +1449,11 @@ void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *b
int step, count, slen = (uint32_t)p->info - (p->info >> 32); // seed length
int64_t k;
if (p->num_match > 0) {
mem_seed_t s;
s.rbeg = smemv->pos_arr.a[j++];
//fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info>>32, (uint32_t)p->info);
CHECK_ADD_CHAIN(tmp, lower, upper);
if (p->num_match > 1) {
for (k = 0; k < p->num_match; ++k)
{
mem_seed_t s;
s.rbeg = smemv->pos_arr.a[j++];
//fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info >> 32, (uint32_t)p->info);
CHECK_ADD_CHAIN(tmp, lower, upper);
}
} else {
@ -1740,36 +1611,6 @@ static void worker_smem_align(void *data, int i, int tid)
mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid);
}
static void worker_sam(void *data, int i, int tid)
{
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
extern void mem_matesw_batch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t *s, mem_alnreg_v *a, int data_size);
mem_worker_t *w = (mem_worker_t*)data;
int start = i*w->opt->batch_size;
int end = MIN(start + w->opt->batch_size, w->n_reads);
if (!(w->opt->flag&MEM_F_PE)) {
for (i=start; i<end; ++i) {
if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]);
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, &w->sams[i]);
free(w->regs[i].a);
}
} else {
if (!(w->opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
mem_matesw_batch(w->opt, w->bns, w->pac, w->pes, &w->seqs[start], &w->regs[start], end-start);
}
for (i=start; i<end; i+=2) {
if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i].name);
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed+i)>>1, &w->seqs[i], &w->regs[i], &w->sams[i], tid);
free(w->regs[i].a); free(w->regs[i+1].a);
}
}
}
void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, seq_sam_t *sams)
{
extern void kt_for(int n_threads, void (*func)(void *, int, int), void *data, int n);
@ -1801,7 +1642,6 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed
PROF_END(gprof[G_MEM_PREPARE], mem_prepare);
PROF_START(kernel);
// kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions
PROF_END(gprof[G_MEM_KERNEL], kernel);
@ -1814,8 +1654,7 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed
PROF_END(gprof[G_MEM_PESTAT], pestat);
PROF_START(mem_sam);
kt_for(opt->n_threads, worker2, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
//kt_for(opt->n_threads, worker_sam, w, n_batch);
kt_for(opt->n_threads, worker_sam, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
PROF_END(gprof[G_MEM_SAM], mem_sam);
//free(w.regs);

12
bwt.c
View File

@ -402,28 +402,28 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
}
// 创建正向的kmer
inline uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
inline uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
{
uint32_t qbit = 0, i;
uint64_t qbit = 0, i;
qlen = qlen < kmer_len ? qlen : kmer_len;
for (i = 0; i < qlen; ++i) {
if (q[i] > 3) break; // 要考虑碱基是N
qbit |= q[i] << ((kmer_len - 1 - i) << 1);
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
}
*base_consumed = i;
return qbit;
}
// 创建f反向的kmer
inline uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
inline uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
{
uint32_t qbit = 0;
uint64_t qbit = 0;
int i, j, end_pos;
end_pos = start_pos - kmer_len;
end_pos = end_pos < 0 ? -1 : end_pos;
for (i = start_pos, j = 0; i > end_pos; --i, ++j) {
if (q[i] > 3) break; // 要考虑碱基是N
qbit |= q[i] << ((kmer_len - 1 - j) << 1);
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - j) << 1);
}
*base_consumed = start_pos - i;
return (~qbit) & ((1L << (kmer_len << 1)) - 1);

4
bwt.h
View File

@ -152,8 +152,8 @@ extern "C" {
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos);
uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed);
uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed);
uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed);
uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed);
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
void bwt_gen_cnt_table(bwt_t *bwt);

View File

@ -656,7 +656,7 @@ int main_mem(int argc, char *argv[])
#ifdef SHOW_DATA_PERF
fprintf(stderr, "\n");
fprintf(stderr, "full_match: %ld, all: %ld\n", gdat[1], gdat[0]);
// fprintf(stderr, "full_match: %ld, all: %ld\n", gdat[1], gdat[0]);
fprintf(stderr, "\n");
#endif

478
fmt_idx.c
View File

@ -837,8 +837,6 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int
PUSH_VAL_AND_SKIP_FORWARD(ik);
// 扩展kmer之后的碱基
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
for (i = (int)ik.info; i + 1 < len; i += 2)
{ // forward search
if (q[i] < 4 && q[i + 1] < 4)
@ -846,20 +844,11 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
#if 1
if (min_intv == 1 && ok2.x[2] == min_intv)
{
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_SEED_LENGTH
fprintf(gf[0], "%d\t", i + 2 - x);
#endif
#endif
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
#ifdef DEBUG_FILE_OUTPUT
#if 0
fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
#endif
#endif
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
goto fmt_smem_forward_end;
@ -939,7 +928,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置
ik.info = x + 1;
//int print_flag = 0;
// check change of the interval size and whether the interval size is too small to be extended further
#define CHECK_INTV_CHANGE(iv, ov, end_pos) \
if (ov.x[2] != iv.x[2]) \
@ -958,22 +946,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
goto backward_search; \
} while (0)
#define DIRECT_EXTEND_2_SEED(idx0, idx1, qspos) \
mt.rm[idx1].qs = MAX(mt.rm[idx0].qs, lm->rm[0].qs); \
mt.rm[idx1].qe = MIN(mt.rm[idx0].qe, lm->rm[0].qe); \
mt.rm[idx0].qs = mt.rm[idx1].qs; \
mt.rm[idx0].qe = mt.rm[idx1].qe; \
mt.rm[idx1].reverse = lm->rm[0].reverse; \
left_ext = qspos - mt.rm[0].qs; \
if (mt.rm[0].reverse) \
mt.rm[0].rs = (fmt->l_pac << 1) - 1 - (s1 - left_ext); \
else \
mt.rm[0].rs = s1 - left_ext; \
if (mt.rm[1].reverse) \
mt.rm[1].rs = (fmt->l_pac << 1) - 1 - (s2 - left_ext); \
else \
mt.rm[1].rs = s2 - left_ext;
// 处理kmer对应的匹配信息
for (curr->n = 0, j = 1; j < kmer_len; ++j)
{
@ -984,8 +956,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
PUSH_VAL_AND_SKIP(ik);
// 扩展kmer之后的碱基
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
for (i = (int)ik.info; i + 1 < len; i += 2)
{ // forward search
if (q[i] < 4 && q[i + 1] < 4)
@ -1000,49 +970,12 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
if (min_intv == 1 && ok2.x[2] == min_intv)
{
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0);
//fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
ret = (uint32_t)mt.info;
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end;
goto backward_search;
}
#if 0
// 判断只有两个候选的情况
else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) {
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1);
int left_ext;
bwtint_t lmrp = lm->rm[0].rs;
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集
direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1);
DIRECT_EXTEND_2_SEED(1, 0, x);
}
else
{
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0);
DIRECT_EXTEND_2_SEED(0, 1, x);
}
mt.info = mt.rm[0].qs;
mt.info = mt.info << 32 | mt.rm[0].qe;
mt.num_match = 2;
mt.x[2] = 2;
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
//fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
//fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2);
//s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
//s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
//fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end;
goto backward_search;
}
#endif
#endif
}
else if (q[i] < 4) // q[i+1] >= 4
@ -1082,11 +1015,7 @@ backward_search:
(intv).info |= (uint64_t)(pos) << 32; \
kv_push(bwtintv_t, *mem, (intv)); \
}
// if (flag)
//{
// fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1));
// flag = 0;
// }
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
if (ok.x[2] < min_intv) \
{ \
@ -1098,8 +1027,6 @@ backward_search:
for (j = 0; j < curr->n; ++j)
{
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
if (p->info - x < HASH_KMER_LEN)
{
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
@ -1127,7 +1054,6 @@ backward_search:
{
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
{
// fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
@ -1136,77 +1062,9 @@ backward_search:
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
ok2.info = p->info;
*p = ok2;
#if 0
// 在这里进行判断是否只有一个候选了
if (min_intv == 1 && ok2.x[2] == min_intv)
{
int qs = i - 1;
int qe = (int)ok2.info;
if (mem->n > 0)
{
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
int qel = (int)(mem->a[mem->n - 1].info);
if (qsl <= qs && qel >= qe)
break;
}
direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
// fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end;
break;
}
#endif
#if 0
if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len)
{
int qs = i - 1;
int qe = (int)ok2.info;
if (mem->n > 0)
{
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
int qel = (int)(mem->a[mem->n - 1].info);
if (qsl <= qs && qel >= qe)
break;
}
#if 1
int left_ext;
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1);
bwtint_t lmrp = lm->rm[0].rs;
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
if (lmrp + qs == s1 + lm->rm[0].qs)
{ // s1是lm的子集
direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1);
DIRECT_EXTEND_2_SEED(1, 0, qs);
}
else
{
direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0);
DIRECT_EXTEND_2_SEED(0, 1, qs);
}
mt.info = mt.rm[0].qs;
mt.info = mt.info << 32 | mt.rm[0].qe;
mt.num_match = 2;
mt.x[2] = 2;
kv_push(bwtintv_t, *mem, mt);
// fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
// fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2);
// s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
// s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
// fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end;
break;
#endif
}
#endif
}
else if (q[i] < 4) // 只能扩展一个
{
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info;
@ -1223,7 +1081,6 @@ backward_search:
{ // 扩展到了第一个碱基
if (q[i] < 4)
{
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info;
@ -1243,316 +1100,6 @@ backward_search:
}
fmt_smem_end:
//if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n");
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
return ret;
}
// 找smemseed(lm: long_smem)
int fmt_smem_2(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec, uint32_v qev)
{
// int flag = 0;
int i, j, ret, kmer_len;
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
bwtintv_t mt = {0};
bwtintv_v *curr;
uint32_t qbit = 0;
mem->n = 0;
curr = tmpvec; // use the temporary vector if provided
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置
ik.info = x + 1;
//int print_flag = 0;
// check change of the interval size and whether the interval size is too small to be extended further
#define CHECK_INTV_CHANGE_2(iv, ov, end_pos) \
if (ov.x[2] != iv.x[2]) \
{ \
\
kv_push(bwtintv_t, *curr, iv); \
if (ov.x[2] < min_intv) \
break; \
} \
iv = ov; \
iv.info = end_pos
#define PUSH_VAL_AND_SKIP_2(iv) \
do \
{ \
kv_push(bwtintv_t, *curr, iv); \
goto backward_search; \
} while (0)
// 处理kmer对应的匹配信息
for (curr->n = 0, j = 1; j < kmer_len; ++j)
{
bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j);
CHECK_INTV_CHANGE_2(ik, ok1, x + j + 1);
}
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
PUSH_VAL_AND_SKIP_2(ik);
// 扩展kmer之后的碱基
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
for (i = (int)ik.info; i + 1 < len; i += 2)
{ // forward search
if (q[i] < 4 && q[i + 1] < 4)
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_CHANGE_2(ik, ok1, i + 1);
CHECK_INTV_CHANGE_2(ik, ok2, i + 2);
#if 1
// 在这里进行判断是否只有一个候选了
if (min_intv == 1 && ok2.x[2] == min_intv)
{
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0);
//fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
ret = (uint32_t)mt.info;
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end;
goto backward_search;
}
#if 1
// 判断只有两个候选的情况
else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) {
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1);
int left_ext;
bwtint_t lmrp = lm->rm[0].rs;
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集
direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1);
DIRECT_EXTEND_2_SEED(1, 0, x);
}
else
{
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0);
DIRECT_EXTEND_2_SEED(0, 1, x);
}
mt.info = mt.rm[0].qs;
mt.info = mt.info << 32 | mt.rm[0].qe;
mt.num_match = 2;
mt.x[2] = 2;
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
//fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
//fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2);
//s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
//s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
//fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end;
goto backward_search;
}
#endif
#endif
}
else if (q[i] < 4) // q[i+1] >= 4
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
CHECK_INTV_CHANGE_2(ik, ok1, i + 1);
PUSH_VAL_AND_SKIP_2(ik);
}
else // q[i] >= 4
{
PUSH_VAL_AND_SKIP_2(ik);
}
}
for (; i == len - 1; ++i) // 扩展到了最后一个碱基
{
if (q[i] < 4)
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
CHECK_INTV_CHANGE_2(ik, ok1, i + 1);
}
else
PUSH_VAL_AND_SKIP_2(ik);
}
if (i == len)
kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
backward_search:
fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
if (mt.num_match == 0)
ret = curr->a[0].info; // this will be the returned value扩展到的最远的位置
else
ret = (uint32_t)mt.info;
// 按照种子进行遍历,反向扩展
#define CHECK_ADD_MEM(pos, intv, mem) \
if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) \
{ \
(intv).info |= (uint64_t)(pos) << 32; \
kv_push(bwtintv_t, *mem, (intv)); \
}
// if (flag)
//{
// fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1));
// flag = 0;
// }
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
if (ok.x[2] < min_intv) \
{ \
CHECK_ADD_MEM(pos, intv, mem); \
break; \
}
int last_kmer_start = 0;
for (j = 0; j < curr->n; ++j)
{
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
if (p->info - x < HASH_KMER_LEN)
{
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer
else
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
last_kmer_start = p->info - 1;
i = 1;
do
{
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++);
} while (ik.x[2] < min_intv);
if (i > 2)
continue;
p->x[0] = ik.x[1];
p->x[1] = ik.x[0];
p->x[2] = ik.x[2];
i = p->info - (kmer_len - i + 3);
}
else
{
i = x - 1;
}
for (; i > 0; i -= 2)
{
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
{
// fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info;
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
ok2.info = p->info;
*p = ok2;
#if 0
// 在这里进行判断是否只有一个候选了
if (min_intv == 1 && ok2.x[2] == min_intv)
{
int qs = i - 1;
int qe = (int)ok2.info;
if (mem->n > 0)
{
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
int qel = (int)(mem->a[mem->n - 1].info);
if (qsl <= qs && qel >= qe)
break;
}
direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
// fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end;
break;
}
#endif
#if 0
if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len)
{
int qs = i - 1;
int qe = (int)ok2.info;
if (mem->n > 0)
{
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
int qel = (int)(mem->a[mem->n - 1].info);
if (qsl <= qs && qel >= qe)
break;
}
#if 1
int left_ext;
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1);
bwtint_t lmrp = lm->rm[0].rs;
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
if (lmrp + qs == s1 + lm->rm[0].qs)
{ // s1是lm的子集
direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1);
DIRECT_EXTEND_2_SEED(1, 0, qs);
}
else
{
direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0);
DIRECT_EXTEND_2_SEED(0, 1, qs);
}
mt.info = mt.rm[0].qs;
mt.info = mt.info << 32 | mt.rm[0].qe;
mt.num_match = 2;
mt.x[2] = 2;
kv_push(bwtintv_t, *mem, mt);
// fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
// fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2);
// s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
// s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
// fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
goto fmt_smem_end;
break;
#endif
}
#endif
}
else if (q[i] < 4) // 只能扩展一个
{
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info;
CHECK_ADD_MEM(i, ok1, mem);
goto fmt_smem_end;
}
else
{ // 不能扩展
CHECK_ADD_MEM(i + 1, *p, mem);
goto fmt_smem_end;
}
}
for (; i == 0; --i)
{ // 扩展到了第一个碱基
if (q[i] < 4)
{
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info;
*p = ok1;
}
else
{
CHECK_ADD_MEM(i + 1, *p, mem);
goto fmt_smem_end;
}
}
if (i == -1)
{
CHECK_ADD_MEM(i + 1, *p, mem);
goto fmt_smem_end;
}
}
fmt_smem_end:
//if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n");
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
return ret;
}
@ -1572,9 +1119,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - 1); // 初始碱基位置
ik.info = x + kmer_len;
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
{ \
@ -1590,7 +1134,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
{
if (q[i] < 4 && q[i + 1] < 4)
{
// fmt_direct_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
@ -1610,18 +1153,15 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
if (q[i] < 4 && q[i + 1] < 4)
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
// fmt_direct2_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
ik = ok2;
//fprintf(stderr, "d: %d %ld\n", i, ok2.x[2]);
}
else if (q[i] < 4) // q[i+1] >= 4
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
//fprintf(stderr, "d: %d %ld\n", i, ok1.x[2]);
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
return i + 2;
}
@ -1633,7 +1173,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
if (i == len - 1 && q[i] < 4)
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
//fprintf(stderr, "d: %d %ld\n", i, ok1.x[2]);
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
}
return len;
@ -1659,18 +1198,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_
*b1 = base2 & 3;
}
// k, k1, k2都是bwt矩阵对应的行
inline static void fmt_previous_line_old(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
{
uint8_t b1, b2;
bwtint_t tk[4], kk;
kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行去掉了$,所以大于$的行都减掉1
fmt_get_previous_base(fmt, kk, &b1, &b2);
fmt_e2_occ(fmt, k, b1, b2, tk);
*k1 = fmt->L2[b1] + tk[1];
*k2 = fmt->L2[b2] + tk[3];
}
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
{
uint8_t b1, b2;
@ -1741,7 +1268,6 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
{
++sa;
fmt_previous_line(fmt, k, &k1, &k2);
//fmt_previous_line_old(fmt, k, &k1, &k2);
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
if (!(k1 & mask))
{

View File

@ -106,5 +106,6 @@ int display_stats(int nthreads)
fprintf(stderr, "\n");
#endif
return 1;
}

View File

@ -26,8 +26,8 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
#endif
#ifdef SHOW_DATA_PERF
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0};
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD];
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE];
#endif
#ifdef SHOW_PERF
@ -81,7 +81,10 @@ enum
T_BSW,
T_BSW_ALL,
T_SAM_MATESW,
T_SAM_REG2ALN
T_SAM_REG2ALN,
T_SEED_3_1,
T_SEED_3_2,
T_SEED_3_3
};
int display_stats(int);

14
run.sh
View File

@ -1,6 +1,8 @@
thread=1
## d1
## d1 k18<=4 89%
#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq
#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq
#n_r1=~/data/fastq/dataset/na12878_wes_144/1w_1.fq
#n_r2=~/data/fastq/dataset/na12878_wes_144/1w_2.fq
n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq
@ -9,22 +11,22 @@ n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq
#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq
#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz
## d2
## d2 <= 4 87%
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
#n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq
# d3
# d3 <= 4 77%
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
#n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz
## d4
## d4 <= 4 93%
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
#n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz
## d5
## d5 <= 4 80%
#n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
@ -38,7 +40,7 @@ reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta
#out=~/data1/out-u8-1.sam
#out=~/data1/out-i16.sam
out=~/data1/out.sam
out=~/data1/fmt-out.sam
#out=/dev/null
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \

15
seed2.py 100755
View File

@ -0,0 +1,15 @@
#!/bin/env python3
import sys
import os
back_num = 0
end_num = 0
seed2_file = sys.argv[1]
with open(seed2_file, 'r') as f:
for line in f:
if line.startswith("back"):
back_num += int(line.split(' ')[1])
elif line.startswith("end"):
end_num += int(line.split(' ')[1])
print(back_num, end_num, end_num / back_num)