clean代码,结果正确
This commit is contained in:
parent
8c2a90e3e2
commit
9c9502d584
|
|
@ -1,3 +1,4 @@
|
|||
output/
|
||||
*.[oa]
|
||||
*.txt
|
||||
*.sam
|
||||
|
|
|
|||
|
|
@ -17,9 +17,9 @@
|
|||
"-M",
|
||||
"-R",
|
||||
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||
"~/reference/bwa/human_g1k_v37_decoy.fasta",
|
||||
"~/data/fastq/dataset/na12878_wes_144/s_1.fq",
|
||||
"~/data/fastq/dataset/na12878_wes_144/s_2.fq",
|
||||
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta",
|
||||
"~/data/fastq/dataset/na12878_wes_144/1w_1.fq",
|
||||
"~/data/fastq/dataset/na12878_wes_144/1w_2.fq",
|
||||
"-o",
|
||||
"/dev/null"
|
||||
],
|
||||
|
|
|
|||
|
|
@ -29,6 +29,12 @@
|
|||
"scalar_sse.h": "c",
|
||||
"immintrin.h": "c",
|
||||
"ksw.h": "c",
|
||||
"debug.h": "c"
|
||||
"debug.h": "c",
|
||||
"type_traits": "c",
|
||||
"cstdint": "c",
|
||||
"bitset": "c",
|
||||
"iterator": "c",
|
||||
"memory": "c",
|
||||
"__locale": "c"
|
||||
}
|
||||
}
|
||||
4
Makefile
4
Makefile
|
|
@ -5,8 +5,8 @@ CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2
|
|||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||
|
||||
SHOW_PERF= -DSHOW_PERF
|
||||
SHOW_DATA_PERF= #-DSHOW_DATA_PERF
|
||||
FILTER_FULL_MATCH= -DFILTER_FULL_MATCH
|
||||
SHOW_DATA_PERF= -DSHOW_DATA_PERF
|
||||
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
|
||||
USE_MT_READ= -DUSE_MT_READ
|
||||
|
||||
AR= ar
|
||||
|
|
|
|||
191
bwamem.c
191
bwamem.c
|
|
@ -142,7 +142,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len,
|
|||
kv_push(bwtintv_t, a->mem, *p);
|
||||
}
|
||||
}
|
||||
//break; // for test full match time
|
||||
} else {
|
||||
++x;
|
||||
if (start_flag) ++start_N_num;
|
||||
|
|
@ -202,7 +201,6 @@ KBTREE_INIT(chn, mem_chain_t, chain_cmp)
|
|||
// return 1 if the seed is merged into the chain
|
||||
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid)
|
||||
{
|
||||
//return 0;
|
||||
int64_t qend, rend, x, y;
|
||||
const mem_seed_t *last = &c->seeds[c->n-1];
|
||||
qend = last->qbeg + last->len;
|
||||
|
|
@ -340,30 +338,10 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t
|
|||
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
|
||||
mem_seed_t s;
|
||||
s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
||||
|
||||
//kv_push(uint64_t, v1, s.rbeg);
|
||||
//fprintf(stderr, "%ld\t", s.rbeg);
|
||||
//if (p->x[0] == 0)
|
||||
// s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + p->x[2] - 1 - k);
|
||||
//else
|
||||
// s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
||||
//kv_push(uint64_t, v2, s.rbeg);
|
||||
//fprintf(stderr, "%ld\t", s.rbeg);
|
||||
//s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + k);
|
||||
//fprintf(stderr, "%ld\n", s.rbeg);
|
||||
//if (s.rbeg < fmt->l_pac) { // 在正链
|
||||
// s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg;
|
||||
//} else { // 在反向互补链
|
||||
// s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg;
|
||||
//}
|
||||
//s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
||||
|
||||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||||
}
|
||||
//fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
//fprintf(stderr, "split\n");
|
||||
kv_resize(mem_chain_t, chain, kb_size(tree));
|
||||
|
||||
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
|
||||
|
|
@ -374,28 +352,6 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t
|
|||
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
|
||||
kb_destroy(chn, tree);
|
||||
|
||||
#if 0
|
||||
for (i = 0; i < chain.n; ++i)
|
||||
{
|
||||
fprintf(gf[0], "chain-begin: %d\n", i);
|
||||
int j;
|
||||
fprintf(gf[0], "chain-%ld %d\n", chain.a[i].pos, chain.a[i].rid);
|
||||
for (j = 0; j < chain.a[i].n; ++j) {
|
||||
fprintf(gf[0], "chain-%d %ld %d\n", chain.a[i].seeds[j].qbeg, chain.a[i].seeds[j].rbeg, chain.a[i].seeds[j].len);
|
||||
}
|
||||
fprintf(gf[0], "chain-end\n");
|
||||
}
|
||||
fprintf(gf[0], "fun-end\n");
|
||||
#endif
|
||||
// ks_introsort(uint64_sort, v1.n, v1.a);
|
||||
// ks_introsort(uint64_sort, v2.n, v2.a);
|
||||
// for (i = 0; i < v1.n; ++i) {
|
||||
// //fprintf(stderr, "%ld\t%ld\n", v1.a[i], v2.a[i]);
|
||||
// if (v1.a[i] != v2.a[i]) {
|
||||
// fprintf(stderr, "diff: %ld\t%ld\n", v1.a[i], v2.a[i]);
|
||||
// }
|
||||
// }
|
||||
|
||||
return chain;
|
||||
}
|
||||
|
||||
|
|
@ -1270,21 +1226,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
|||
return a;
|
||||
}
|
||||
|
||||
static void worker1(void *data, int i, int tid)
|
||||
{
|
||||
mem_worker_t *w = (mem_worker_t*)data;
|
||||
if (!(w->opt->flag&MEM_F_PE)) {
|
||||
if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name);
|
||||
w->regs[i] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]);
|
||||
} else {
|
||||
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name);
|
||||
w->regs[i<<1|0] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]);
|
||||
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name);
|
||||
w->regs[i<<1|1] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]);
|
||||
}
|
||||
}
|
||||
|
||||
static void worker2(void *data, int i, int tid)
|
||||
static void worker_sam(void *data, int i, int tid)
|
||||
{
|
||||
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
|
||||
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
|
||||
|
|
@ -1346,8 +1288,6 @@ mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const b
|
|||
}
|
||||
return w;
|
||||
}
|
||||
#include "khash.h"
|
||||
KHASH_MAP_INIT_INT64(seed, uint64_t);
|
||||
|
||||
static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a, int tid)
|
||||
{
|
||||
|
|
@ -1357,24 +1297,13 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
int max_seed_len = 0;
|
||||
int start_N_num = 0, start_flag = 1;
|
||||
smem->mem.n = 0;
|
||||
//int s1_num = 0;
|
||||
// 1. first pass: find all SMEMs
|
||||
// goto third_seed;
|
||||
PROF_START(seed_1);
|
||||
// fprintf(gf[0], "read\n");
|
||||
while (x < len) {
|
||||
if (seq[x] < 4) {
|
||||
start_flag = 0;
|
||||
#ifdef DEBUG_FILE_OUTPUT
|
||||
#ifdef COUNT_SEED_LENGTH
|
||||
int last_x = x;
|
||||
#endif
|
||||
#endif
|
||||
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
|
||||
#ifdef DEBUG_FILE_OUTPUT
|
||||
#ifdef COUNT_SEED_LENGTH
|
||||
fprintf(gf[0], "%d\n", x - last_x);
|
||||
#endif
|
||||
#endif
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||
|
|
@ -1383,82 +1312,38 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
kv_push(bwtintv_t, smem->mem, *p);
|
||||
}
|
||||
}
|
||||
#ifdef COUNT_SEED_LENGTH
|
||||
break; // for test full match time
|
||||
#endif
|
||||
// break; // for test
|
||||
} else {
|
||||
++x;
|
||||
if (start_flag) ++start_N_num;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SHOW_DATA_PERF
|
||||
gdat[0] += 1; // read num ++
|
||||
if (max_seed_len == len - start_N_num)
|
||||
gdat[1] += 1; // seed-1 full match num ++
|
||||
#endif
|
||||
|
||||
PROF_END(tprof[T_SEED_1][tid], seed_1);
|
||||
|
||||
//goto third_seed;
|
||||
#ifdef FILTER_FULL_MATCH
|
||||
if (max_seed_len == len - start_N_num) {
|
||||
|
||||
#ifdef DEBUG_FILE_OUTPUT
|
||||
if (start_N_num == 0) {
|
||||
#ifdef GET_FULL_MATCH_READ
|
||||
for (i = 0; i < len; ++i)
|
||||
fprintf(gf[0], "%c", "ACGT"[seq[i]]);
|
||||
fprintf(gf[0], "\n");
|
||||
#endif
|
||||
#ifdef SHOW_DATA_PERF
|
||||
gdat[2]++;
|
||||
if (gdat[2] % 100 == 0) {
|
||||
fprintf(stderr, "reads num: %ld\n", gdat[2]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
//goto third_seed;
|
||||
goto collect_intv_end;
|
||||
}
|
||||
#endif
|
||||
bwtintv_v mem2 = {0};
|
||||
// 2. second pass: find MEMs inside a long SMEM
|
||||
uint32_v qev;
|
||||
PROF_START(seed_2);
|
||||
old_n = smem->mem.n;
|
||||
//fprintf(gf[2], "seed-1\n");
|
||||
for (k = 0; k < old_n; ++k) {
|
||||
bwtintv_t *p = &smem->mem.a[k];
|
||||
int start = p->info>>32, end = (int32_t)p->info;
|
||||
if (end - start < split_len || p->x[2] > opt->split_width) continue;
|
||||
//++ s1_num;
|
||||
//fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", p->info >> 32, (int)p->info, p->x[0], p->x[1], p->x[2]);
|
||||
// fprintf(gf[0], "start\n");
|
||||
fmt_smem_2(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0], qev);
|
||||
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
int slen = (uint32_t)p->info - (p->info >> 32);
|
||||
// fprintf(gf[0], "%d\n", slen);
|
||||
if (slen >= opt->min_seed_len) {
|
||||
kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
|
||||
kv_push(bwtintv_t, mem2, a->mem1.a[i]);
|
||||
}
|
||||
}
|
||||
// fprintf(gf[0], "end\n");
|
||||
}
|
||||
//fprintf(gf[2], "seed-2\n");
|
||||
PROF_END(tprof[T_SEED_2][tid], seed_2);
|
||||
//if (s1_num > 1) {
|
||||
//ks_introsort(mem_intv, mem2.n, mem2.a);
|
||||
//for (i = 0; i < mem2.n; ++i) {
|
||||
// fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", mem2.a[i].info >> 32, (int)mem2.a[i].info, mem2.a[i].x[0], mem2.a[i].x[1], mem2.a[i].x[2]);
|
||||
//}
|
||||
//fprintf(gf[2], "\n");
|
||||
//}
|
||||
|
||||
// third_seed:
|
||||
#if 1
|
||||
//third_seed:
|
||||
// 3. third pass: LAST-like
|
||||
PROF_START(seed_3);
|
||||
if (opt->max_mem_intv > 0) {
|
||||
|
|
@ -1468,32 +1353,23 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
bwtintv_t m;
|
||||
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||||
if (m.x[2] > 0) {
|
||||
// fprintf(stderr, "3: %ld %ld %ld\n", m.info >> 32, m.info & ((1L << 32) - 1), m.x[2]);
|
||||
kv_push(bwtintv_t, smem->mem, m);
|
||||
}
|
||||
} else ++x;
|
||||
}
|
||||
}
|
||||
PROF_END(tprof[T_SEED_3][tid], seed_3);
|
||||
#endif
|
||||
|
||||
#ifdef FILTER_FULL_MATCH
|
||||
collect_intv_end:
|
||||
#endif
|
||||
// sort
|
||||
ks_introsort(mem_intv, smem->mem.n, smem->mem.a);
|
||||
|
||||
|
||||
khash_t(str) * h;
|
||||
//for (i = 0; i < smem->mem.n; ++i) {
|
||||
// fprintf(stderr, "seed %d: %ld %ld %ld\n", i, smem->mem.a[i].x[0], smem->mem.a[i].x[1], smem->mem.a[i].x[2]);
|
||||
//}
|
||||
//fprintf(stderr, "split\n");
|
||||
}
|
||||
|
||||
void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv, int tid)
|
||||
{
|
||||
int i;
|
||||
int i, j;
|
||||
|
||||
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
|
||||
mem_collect_intv_batch(opt, fmt, len, seq, smemv, aux, tid);
|
||||
|
|
@ -1503,12 +1379,10 @@ void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t
|
|||
int step, count; // seed length
|
||||
int64_t k;
|
||||
if (p->num_match > 0) {
|
||||
uint64_t pos = p->rm[0].rs;
|
||||
if (p->rm[0].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
|
||||
kv_push(uint64_t, smemv->pos_arr, pos);
|
||||
if (p->num_match > 1) {
|
||||
uint64_t pos = p->rm[1].rs;
|
||||
if (p->rm[1].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
|
||||
for (j = 0; j < p->num_match; ++j)
|
||||
{
|
||||
uint64_t pos = p->rm[j].rs;
|
||||
if (p->rm[j].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
|
||||
kv_push(uint64_t, smemv->pos_arr, pos);
|
||||
}
|
||||
} else {
|
||||
|
|
@ -1575,14 +1449,11 @@ void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *b
|
|||
int step, count, slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||
int64_t k;
|
||||
if (p->num_match > 0) {
|
||||
mem_seed_t s;
|
||||
s.rbeg = smemv->pos_arr.a[j++];
|
||||
//fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info>>32, (uint32_t)p->info);
|
||||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||||
if (p->num_match > 1) {
|
||||
|
||||
for (k = 0; k < p->num_match; ++k)
|
||||
{
|
||||
mem_seed_t s;
|
||||
s.rbeg = smemv->pos_arr.a[j++];
|
||||
//fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info >> 32, (uint32_t)p->info);
|
||||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||||
}
|
||||
} else {
|
||||
|
|
@ -1740,36 +1611,6 @@ static void worker_smem_align(void *data, int i, int tid)
|
|||
mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid);
|
||||
}
|
||||
|
||||
static void worker_sam(void *data, int i, int tid)
|
||||
{
|
||||
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
|
||||
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
|
||||
extern void mem_matesw_batch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t *s, mem_alnreg_v *a, int data_size);
|
||||
mem_worker_t *w = (mem_worker_t*)data;
|
||||
int start = i*w->opt->batch_size;
|
||||
int end = MIN(start + w->opt->batch_size, w->n_reads);
|
||||
|
||||
if (!(w->opt->flag&MEM_F_PE)) {
|
||||
for (i=start; i<end; ++i) {
|
||||
if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
|
||||
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
|
||||
if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]);
|
||||
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, &w->sams[i]);
|
||||
free(w->regs[i].a);
|
||||
}
|
||||
} else {
|
||||
if (!(w->opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
|
||||
mem_matesw_batch(w->opt, w->bns, w->pac, w->pes, &w->seqs[start], &w->regs[start], end-start);
|
||||
}
|
||||
|
||||
for (i=start; i<end; i+=2) {
|
||||
if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i].name);
|
||||
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed+i)>>1, &w->seqs[i], &w->regs[i], &w->sams[i], tid);
|
||||
free(w->regs[i].a); free(w->regs[i+1].a);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, seq_sam_t *sams)
|
||||
{
|
||||
extern void kt_for(int n_threads, void (*func)(void *, int, int), void *data, int n);
|
||||
|
|
@ -1801,7 +1642,6 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed
|
|||
PROF_END(gprof[G_MEM_PREPARE], mem_prepare);
|
||||
|
||||
PROF_START(kernel);
|
||||
// kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
|
||||
kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions
|
||||
PROF_END(gprof[G_MEM_KERNEL], kernel);
|
||||
|
||||
|
|
@ -1814,8 +1654,7 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed
|
|||
PROF_END(gprof[G_MEM_PESTAT], pestat);
|
||||
|
||||
PROF_START(mem_sam);
|
||||
kt_for(opt->n_threads, worker2, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
|
||||
//kt_for(opt->n_threads, worker_sam, w, n_batch);
|
||||
kt_for(opt->n_threads, worker_sam, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
|
||||
PROF_END(gprof[G_MEM_SAM], mem_sam);
|
||||
|
||||
//free(w.regs);
|
||||
|
|
|
|||
12
bwt.c
12
bwt.c
|
|
@ -402,28 +402,28 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
|
|||
}
|
||||
|
||||
// 创建正向的kmer
|
||||
inline uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
|
||||
inline uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
|
||||
{
|
||||
uint32_t qbit = 0, i;
|
||||
uint64_t qbit = 0, i;
|
||||
qlen = qlen < kmer_len ? qlen : kmer_len;
|
||||
for (i = 0; i < qlen; ++i) {
|
||||
if (q[i] > 3) break; // 要考虑碱基是N
|
||||
qbit |= q[i] << ((kmer_len - 1 - i) << 1);
|
||||
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
|
||||
}
|
||||
*base_consumed = i;
|
||||
return qbit;
|
||||
}
|
||||
|
||||
// 创建f反向的kmer
|
||||
inline uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
|
||||
inline uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
|
||||
{
|
||||
uint32_t qbit = 0;
|
||||
uint64_t qbit = 0;
|
||||
int i, j, end_pos;
|
||||
end_pos = start_pos - kmer_len;
|
||||
end_pos = end_pos < 0 ? -1 : end_pos;
|
||||
for (i = start_pos, j = 0; i > end_pos; --i, ++j) {
|
||||
if (q[i] > 3) break; // 要考虑碱基是N
|
||||
qbit |= q[i] << ((kmer_len - 1 - j) << 1);
|
||||
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - j) << 1);
|
||||
}
|
||||
*base_consumed = start_pos - i;
|
||||
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
|
||||
|
|
|
|||
4
bwt.h
4
bwt.h
|
|
@ -152,8 +152,8 @@ extern "C" {
|
|||
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
|
||||
void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos);
|
||||
|
||||
uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed);
|
||||
uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed);
|
||||
uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed);
|
||||
uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed);
|
||||
|
||||
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
|
||||
void bwt_gen_cnt_table(bwt_t *bwt);
|
||||
|
|
|
|||
|
|
@ -656,7 +656,7 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
#ifdef SHOW_DATA_PERF
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "full_match: %ld, all: %ld\n", gdat[1], gdat[0]);
|
||||
// fprintf(stderr, "full_match: %ld, all: %ld\n", gdat[1], gdat[0]);
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
|
||||
|
|
|
|||
478
fmt_idx.c
478
fmt_idx.c
|
|
@ -837,8 +837,6 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int
|
|||
PUSH_VAL_AND_SKIP_FORWARD(ik);
|
||||
|
||||
// 扩展kmer之后的碱基
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||
{ // forward search
|
||||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
|
|
@ -846,20 +844,11 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int
|
|||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
|
||||
#if 1
|
||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||
{
|
||||
#ifdef DEBUG_FILE_OUTPUT
|
||||
#ifdef COUNT_SEED_LENGTH
|
||||
fprintf(gf[0], "%d\t", i + 2 - x);
|
||||
#endif
|
||||
#endif
|
||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
|
||||
#ifdef DEBUG_FILE_OUTPUT
|
||||
#if 0
|
||||
fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
||||
#endif
|
||||
#endif
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
ret = (uint32_t)mt.info;
|
||||
goto fmt_smem_forward_end;
|
||||
|
|
@ -939,7 +928,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置
|
||||
ik.info = x + 1;
|
||||
//int print_flag = 0;
|
||||
// check change of the interval size and whether the interval size is too small to be extended further
|
||||
#define CHECK_INTV_CHANGE(iv, ov, end_pos) \
|
||||
if (ov.x[2] != iv.x[2]) \
|
||||
|
|
@ -958,22 +946,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
goto backward_search; \
|
||||
} while (0)
|
||||
|
||||
#define DIRECT_EXTEND_2_SEED(idx0, idx1, qspos) \
|
||||
mt.rm[idx1].qs = MAX(mt.rm[idx0].qs, lm->rm[0].qs); \
|
||||
mt.rm[idx1].qe = MIN(mt.rm[idx0].qe, lm->rm[0].qe); \
|
||||
mt.rm[idx0].qs = mt.rm[idx1].qs; \
|
||||
mt.rm[idx0].qe = mt.rm[idx1].qe; \
|
||||
mt.rm[idx1].reverse = lm->rm[0].reverse; \
|
||||
left_ext = qspos - mt.rm[0].qs; \
|
||||
if (mt.rm[0].reverse) \
|
||||
mt.rm[0].rs = (fmt->l_pac << 1) - 1 - (s1 - left_ext); \
|
||||
else \
|
||||
mt.rm[0].rs = s1 - left_ext; \
|
||||
if (mt.rm[1].reverse) \
|
||||
mt.rm[1].rs = (fmt->l_pac << 1) - 1 - (s2 - left_ext); \
|
||||
else \
|
||||
mt.rm[1].rs = s2 - left_ext;
|
||||
|
||||
// 处理kmer对应的匹配信息
|
||||
for (curr->n = 0, j = 1; j < kmer_len; ++j)
|
||||
{
|
||||
|
|
@ -984,8 +956,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
PUSH_VAL_AND_SKIP(ik);
|
||||
|
||||
// 扩展kmer之后的碱基
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||
{ // forward search
|
||||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
|
|
@ -1000,49 +970,12 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||
{
|
||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0);
|
||||
//fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
||||
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
||||
ret = (uint32_t)mt.info;
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto fmt_smem_end;
|
||||
goto backward_search;
|
||||
}
|
||||
#if 0
|
||||
// 判断只有两个候选的情况
|
||||
else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) {
|
||||
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
|
||||
bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1);
|
||||
int left_ext;
|
||||
bwtint_t lmrp = lm->rm[0].rs;
|
||||
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
|
||||
|
||||
if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集
|
||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1);
|
||||
DIRECT_EXTEND_2_SEED(1, 0, x);
|
||||
}
|
||||
else
|
||||
{
|
||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0);
|
||||
DIRECT_EXTEND_2_SEED(0, 1, x);
|
||||
}
|
||||
mt.info = mt.rm[0].qs;
|
||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
||||
mt.num_match = 2;
|
||||
mt.x[2] = 2;
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
ret = (uint32_t)mt.info;
|
||||
|
||||
//fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
|
||||
//fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2);
|
||||
//s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
|
||||
//s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
|
||||
//fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
|
||||
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto fmt_smem_end;
|
||||
goto backward_search;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
else if (q[i] < 4) // q[i+1] >= 4
|
||||
|
|
@ -1082,11 +1015,7 @@ backward_search:
|
|||
(intv).info |= (uint64_t)(pos) << 32; \
|
||||
kv_push(bwtintv_t, *mem, (intv)); \
|
||||
}
|
||||
// if (flag)
|
||||
//{
|
||||
// fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1));
|
||||
// flag = 0;
|
||||
// }
|
||||
|
||||
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
|
||||
if (ok.x[2] < min_intv) \
|
||||
{ \
|
||||
|
|
@ -1098,8 +1027,6 @@ backward_search:
|
|||
for (j = 0; j < curr->n; ++j)
|
||||
{
|
||||
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
|
||||
if (p->info - x < HASH_KMER_LEN)
|
||||
{
|
||||
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
|
||||
|
|
@ -1127,7 +1054,6 @@ backward_search:
|
|||
{
|
||||
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
|
||||
{
|
||||
// fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
||||
fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
|
||||
|
|
@ -1136,77 +1062,9 @@ backward_search:
|
|||
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
|
||||
ok2.info = p->info;
|
||||
*p = ok2;
|
||||
#if 0
|
||||
// 在这里进行判断是否只有一个候选了
|
||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||
{
|
||||
int qs = i - 1;
|
||||
int qe = (int)ok2.info;
|
||||
if (mem->n > 0)
|
||||
{
|
||||
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
|
||||
int qel = (int)(mem->a[mem->n - 1].info);
|
||||
if (qsl <= qs && qel >= qe)
|
||||
break;
|
||||
}
|
||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
|
||||
// fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
||||
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto fmt_smem_end;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len)
|
||||
{
|
||||
int qs = i - 1;
|
||||
int qe = (int)ok2.info;
|
||||
if (mem->n > 0)
|
||||
{
|
||||
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
|
||||
int qel = (int)(mem->a[mem->n - 1].info);
|
||||
if (qsl <= qs && qel >= qe)
|
||||
break;
|
||||
}
|
||||
#if 1
|
||||
int left_ext;
|
||||
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
|
||||
bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1);
|
||||
|
||||
bwtint_t lmrp = lm->rm[0].rs;
|
||||
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
|
||||
|
||||
if (lmrp + qs == s1 + lm->rm[0].qs)
|
||||
{ // s1是lm的子集
|
||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1);
|
||||
DIRECT_EXTEND_2_SEED(1, 0, qs);
|
||||
}
|
||||
else
|
||||
{
|
||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0);
|
||||
DIRECT_EXTEND_2_SEED(0, 1, qs);
|
||||
}
|
||||
mt.info = mt.rm[0].qs;
|
||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
||||
mt.num_match = 2;
|
||||
mt.x[2] = 2;
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
// fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
|
||||
// fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2);
|
||||
// s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
|
||||
// s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
|
||||
// fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto fmt_smem_end;
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if (q[i] < 4) // 只能扩展一个
|
||||
{
|
||||
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
|
||||
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||
ok1.info = p->info;
|
||||
|
|
@ -1223,7 +1081,6 @@ backward_search:
|
|||
{ // 扩展到了第一个碱基
|
||||
if (q[i] < 4)
|
||||
{
|
||||
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
|
||||
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||
ok1.info = p->info;
|
||||
|
|
@ -1243,316 +1100,6 @@ backward_search:
|
|||
}
|
||||
|
||||
fmt_smem_end:
|
||||
//if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n");
|
||||
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
||||
return ret;
|
||||
}
|
||||
|
||||
// 找smem(seed)(lm: long_smem)
|
||||
int fmt_smem_2(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec, uint32_v qev)
|
||||
{
|
||||
// int flag = 0;
|
||||
int i, j, ret, kmer_len;
|
||||
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
|
||||
bwtintv_t mt = {0};
|
||||
bwtintv_v *curr;
|
||||
uint32_t qbit = 0;
|
||||
mem->n = 0;
|
||||
curr = tmpvec; // use the temporary vector if provided
|
||||
|
||||
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置
|
||||
ik.info = x + 1;
|
||||
//int print_flag = 0;
|
||||
// check change of the interval size and whether the interval size is too small to be extended further
|
||||
#define CHECK_INTV_CHANGE_2(iv, ov, end_pos) \
|
||||
if (ov.x[2] != iv.x[2]) \
|
||||
{ \
|
||||
\
|
||||
kv_push(bwtintv_t, *curr, iv); \
|
||||
if (ov.x[2] < min_intv) \
|
||||
break; \
|
||||
} \
|
||||
iv = ov; \
|
||||
iv.info = end_pos
|
||||
|
||||
#define PUSH_VAL_AND_SKIP_2(iv) \
|
||||
do \
|
||||
{ \
|
||||
kv_push(bwtintv_t, *curr, iv); \
|
||||
goto backward_search; \
|
||||
} while (0)
|
||||
|
||||
// 处理kmer对应的匹配信息
|
||||
for (curr->n = 0, j = 1; j < kmer_len; ++j)
|
||||
{
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j);
|
||||
CHECK_INTV_CHANGE_2(ik, ok1, x + j + 1);
|
||||
}
|
||||
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
|
||||
PUSH_VAL_AND_SKIP_2(ik);
|
||||
|
||||
// 扩展kmer之后的碱基
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||
{ // forward search
|
||||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
{
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
CHECK_INTV_CHANGE_2(ik, ok1, i + 1);
|
||||
CHECK_INTV_CHANGE_2(ik, ok2, i + 2);
|
||||
#if 1
|
||||
// 在这里进行判断是否只有一个候选了
|
||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||
{
|
||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0);
|
||||
//fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
||||
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
||||
ret = (uint32_t)mt.info;
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto fmt_smem_end;
|
||||
goto backward_search;
|
||||
}
|
||||
#if 1
|
||||
// 判断只有两个候选的情况
|
||||
else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) {
|
||||
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
|
||||
bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1);
|
||||
int left_ext;
|
||||
bwtint_t lmrp = lm->rm[0].rs;
|
||||
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
|
||||
|
||||
if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集
|
||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1);
|
||||
DIRECT_EXTEND_2_SEED(1, 0, x);
|
||||
}
|
||||
else
|
||||
{
|
||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0);
|
||||
DIRECT_EXTEND_2_SEED(0, 1, x);
|
||||
}
|
||||
mt.info = mt.rm[0].qs;
|
||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
||||
mt.num_match = 2;
|
||||
mt.x[2] = 2;
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
ret = (uint32_t)mt.info;
|
||||
|
||||
//fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
|
||||
//fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2);
|
||||
//s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
|
||||
//s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
|
||||
//fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
|
||||
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto fmt_smem_end;
|
||||
goto backward_search;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
else if (q[i] < 4) // q[i+1] >= 4
|
||||
{
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
CHECK_INTV_CHANGE_2(ik, ok1, i + 1);
|
||||
PUSH_VAL_AND_SKIP_2(ik);
|
||||
}
|
||||
else // q[i] >= 4
|
||||
{
|
||||
PUSH_VAL_AND_SKIP_2(ik);
|
||||
}
|
||||
}
|
||||
for (; i == len - 1; ++i) // 扩展到了最后一个碱基
|
||||
{
|
||||
if (q[i] < 4)
|
||||
{
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
CHECK_INTV_CHANGE_2(ik, ok1, i + 1);
|
||||
}
|
||||
else
|
||||
PUSH_VAL_AND_SKIP_2(ik);
|
||||
}
|
||||
if (i == len)
|
||||
kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
|
||||
|
||||
backward_search:
|
||||
fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
|
||||
if (mt.num_match == 0)
|
||||
ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置
|
||||
else
|
||||
ret = (uint32_t)mt.info;
|
||||
// 按照种子进行遍历,反向扩展
|
||||
#define CHECK_ADD_MEM(pos, intv, mem) \
|
||||
if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) \
|
||||
{ \
|
||||
(intv).info |= (uint64_t)(pos) << 32; \
|
||||
kv_push(bwtintv_t, *mem, (intv)); \
|
||||
}
|
||||
// if (flag)
|
||||
//{
|
||||
// fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1));
|
||||
// flag = 0;
|
||||
// }
|
||||
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
|
||||
if (ok.x[2] < min_intv) \
|
||||
{ \
|
||||
CHECK_ADD_MEM(pos, intv, mem); \
|
||||
break; \
|
||||
}
|
||||
|
||||
int last_kmer_start = 0;
|
||||
for (j = 0; j < curr->n; ++j)
|
||||
{
|
||||
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
|
||||
if (p->info - x < HASH_KMER_LEN)
|
||||
{
|
||||
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
|
||||
qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer
|
||||
else
|
||||
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
|
||||
last_kmer_start = p->info - 1;
|
||||
i = 1;
|
||||
do
|
||||
{
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++);
|
||||
} while (ik.x[2] < min_intv);
|
||||
if (i > 2)
|
||||
continue;
|
||||
p->x[0] = ik.x[1];
|
||||
p->x[1] = ik.x[0];
|
||||
p->x[2] = ik.x[2];
|
||||
i = p->info - (kmer_len - i + 3);
|
||||
}
|
||||
else
|
||||
{
|
||||
i = x - 1;
|
||||
}
|
||||
for (; i > 0; i -= 2)
|
||||
{
|
||||
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
|
||||
{
|
||||
// fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
||||
fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
|
||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||
ok1.info = p->info;
|
||||
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
|
||||
ok2.info = p->info;
|
||||
*p = ok2;
|
||||
#if 0
|
||||
// 在这里进行判断是否只有一个候选了
|
||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||
{
|
||||
int qs = i - 1;
|
||||
int qe = (int)ok2.info;
|
||||
if (mem->n > 0)
|
||||
{
|
||||
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
|
||||
int qel = (int)(mem->a[mem->n - 1].info);
|
||||
if (qsl <= qs && qel >= qe)
|
||||
break;
|
||||
}
|
||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
|
||||
// fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
||||
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto fmt_smem_end;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len)
|
||||
{
|
||||
int qs = i - 1;
|
||||
int qe = (int)ok2.info;
|
||||
if (mem->n > 0)
|
||||
{
|
||||
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
|
||||
int qel = (int)(mem->a[mem->n - 1].info);
|
||||
if (qsl <= qs && qel >= qe)
|
||||
break;
|
||||
}
|
||||
#if 1
|
||||
int left_ext;
|
||||
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
|
||||
bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1);
|
||||
|
||||
bwtint_t lmrp = lm->rm[0].rs;
|
||||
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
|
||||
|
||||
if (lmrp + qs == s1 + lm->rm[0].qs)
|
||||
{ // s1是lm的子集
|
||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1);
|
||||
DIRECT_EXTEND_2_SEED(1, 0, qs);
|
||||
}
|
||||
else
|
||||
{
|
||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0);
|
||||
DIRECT_EXTEND_2_SEED(0, 1, qs);
|
||||
}
|
||||
mt.info = mt.rm[0].qs;
|
||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
||||
mt.num_match = 2;
|
||||
mt.x[2] = 2;
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
// fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
|
||||
// fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2);
|
||||
// s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
|
||||
// s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
|
||||
// fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
|
||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
goto fmt_smem_end;
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if (q[i] < 4) // 只能扩展一个
|
||||
{
|
||||
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
|
||||
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||
ok1.info = p->info;
|
||||
CHECK_ADD_MEM(i, ok1, mem);
|
||||
goto fmt_smem_end;
|
||||
}
|
||||
else
|
||||
{ // 不能扩展
|
||||
CHECK_ADD_MEM(i + 1, *p, mem);
|
||||
goto fmt_smem_end;
|
||||
}
|
||||
}
|
||||
for (; i == 0; --i)
|
||||
{ // 扩展到了第一个碱基
|
||||
if (q[i] < 4)
|
||||
{
|
||||
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
|
||||
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||
ok1.info = p->info;
|
||||
*p = ok1;
|
||||
}
|
||||
else
|
||||
{
|
||||
CHECK_ADD_MEM(i + 1, *p, mem);
|
||||
goto fmt_smem_end;
|
||||
}
|
||||
}
|
||||
if (i == -1)
|
||||
{
|
||||
CHECK_ADD_MEM(i + 1, *p, mem);
|
||||
goto fmt_smem_end;
|
||||
}
|
||||
}
|
||||
|
||||
fmt_smem_end:
|
||||
//if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n");
|
||||
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -1572,9 +1119,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - 1); // 初始碱基位置
|
||||
ik.info = x + kmer_len;
|
||||
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
|
||||
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
|
||||
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
|
||||
{ \
|
||||
|
|
@ -1590,7 +1134,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
{
|
||||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
{
|
||||
// fmt_direct_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
|
|
@ -1610,18 +1153,15 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
{
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
// fmt_direct2_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||
COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
|
||||
ik = ok2;
|
||||
//fprintf(stderr, "d: %d %ld\n", i, ok2.x[2]);
|
||||
}
|
||||
else if (q[i] < 4) // q[i+1] >= 4
|
||||
{
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
//fprintf(stderr, "d: %d %ld\n", i, ok1.x[2]);
|
||||
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||
return i + 2;
|
||||
}
|
||||
|
|
@ -1633,7 +1173,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
if (i == len - 1 && q[i] < 4)
|
||||
{
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
//fprintf(stderr, "d: %d %ld\n", i, ok1.x[2]);
|
||||
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||
}
|
||||
return len;
|
||||
|
|
@ -1659,18 +1198,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_
|
|||
*b1 = base2 & 3;
|
||||
}
|
||||
|
||||
// k, k1, k2都是bwt矩阵对应的行
|
||||
inline static void fmt_previous_line_old(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
||||
{
|
||||
uint8_t b1, b2;
|
||||
bwtint_t tk[4], kk;
|
||||
kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1)
|
||||
fmt_get_previous_base(fmt, kk, &b1, &b2);
|
||||
fmt_e2_occ(fmt, k, b1, b2, tk);
|
||||
*k1 = fmt->L2[b1] + tk[1];
|
||||
*k2 = fmt->L2[b2] + tk[3];
|
||||
}
|
||||
|
||||
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
||||
{
|
||||
uint8_t b1, b2;
|
||||
|
|
@ -1741,7 +1268,6 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
|
|||
{
|
||||
++sa;
|
||||
fmt_previous_line(fmt, k, &k1, &k2);
|
||||
//fmt_previous_line_old(fmt, k, &k1, &k2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
|
||||
if (!(k1 & mask))
|
||||
{
|
||||
|
|
|
|||
|
|
@ -106,5 +106,6 @@ int display_stats(int nthreads)
|
|||
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -26,8 +26,8 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
|
|||
#endif
|
||||
|
||||
#ifdef SHOW_DATA_PERF
|
||||
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
|
||||
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0};
|
||||
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD];
|
||||
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE];
|
||||
#endif
|
||||
|
||||
#ifdef SHOW_PERF
|
||||
|
|
@ -81,7 +81,10 @@ enum
|
|||
T_BSW,
|
||||
T_BSW_ALL,
|
||||
T_SAM_MATESW,
|
||||
T_SAM_REG2ALN
|
||||
T_SAM_REG2ALN,
|
||||
T_SEED_3_1,
|
||||
T_SEED_3_2,
|
||||
T_SEED_3_3
|
||||
};
|
||||
|
||||
int display_stats(int);
|
||||
|
|
|
|||
14
run.sh
14
run.sh
|
|
@ -1,6 +1,8 @@
|
|||
thread=1
|
||||
|
||||
## d1
|
||||
## d1 k18<=4 89%
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/1w_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/1w_2.fq
|
||||
n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq
|
||||
|
|
@ -9,22 +11,22 @@ n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq
|
|||
#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz
|
||||
## d2
|
||||
## d2 <= 4 87%
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq
|
||||
# d3
|
||||
# d3 <= 4 77%
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz
|
||||
## d4
|
||||
## d4 <= 4 93%
|
||||
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
|
||||
#n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz
|
||||
#n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz
|
||||
## d5
|
||||
## d5 <= 4 80%
|
||||
#n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz
|
||||
#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz
|
||||
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
|
||||
|
|
@ -38,7 +40,7 @@ reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta
|
|||
|
||||
#out=~/data1/out-u8-1.sam
|
||||
#out=~/data1/out-i16.sam
|
||||
out=~/data1/out.sam
|
||||
out=~/data1/fmt-out.sam
|
||||
#out=/dev/null
|
||||
|
||||
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
|
|
|
|||
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
|
||||
back_num = 0
|
||||
end_num = 0
|
||||
seed2_file = sys.argv[1]
|
||||
with open(seed2_file, 'r') as f:
|
||||
for line in f:
|
||||
if line.startswith("back"):
|
||||
back_num += int(line.split(' ')[1])
|
||||
elif line.startswith("end"):
|
||||
end_num += int(line.split(' ')[1])
|
||||
|
||||
print(back_num, end_num, end_num / back_num)
|
||||
Loading…
Reference in New Issue