clean代码,结果正确
This commit is contained in:
parent
8c2a90e3e2
commit
9c9502d584
|
|
@ -1,3 +1,4 @@
|
||||||
|
output/
|
||||||
*.[oa]
|
*.[oa]
|
||||||
*.txt
|
*.txt
|
||||||
*.sam
|
*.sam
|
||||||
|
|
|
||||||
|
|
@ -17,9 +17,9 @@
|
||||||
"-M",
|
"-M",
|
||||||
"-R",
|
"-R",
|
||||||
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||||
"~/reference/bwa/human_g1k_v37_decoy.fasta",
|
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta",
|
||||||
"~/data/fastq/dataset/na12878_wes_144/s_1.fq",
|
"~/data/fastq/dataset/na12878_wes_144/1w_1.fq",
|
||||||
"~/data/fastq/dataset/na12878_wes_144/s_2.fq",
|
"~/data/fastq/dataset/na12878_wes_144/1w_2.fq",
|
||||||
"-o",
|
"-o",
|
||||||
"/dev/null"
|
"/dev/null"
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,12 @@
|
||||||
"scalar_sse.h": "c",
|
"scalar_sse.h": "c",
|
||||||
"immintrin.h": "c",
|
"immintrin.h": "c",
|
||||||
"ksw.h": "c",
|
"ksw.h": "c",
|
||||||
"debug.h": "c"
|
"debug.h": "c",
|
||||||
|
"type_traits": "c",
|
||||||
|
"cstdint": "c",
|
||||||
|
"bitset": "c",
|
||||||
|
"iterator": "c",
|
||||||
|
"memory": "c",
|
||||||
|
"__locale": "c"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
4
Makefile
4
Makefile
|
|
@ -5,8 +5,8 @@ CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2
|
||||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||||
|
|
||||||
SHOW_PERF= -DSHOW_PERF
|
SHOW_PERF= -DSHOW_PERF
|
||||||
SHOW_DATA_PERF= #-DSHOW_DATA_PERF
|
SHOW_DATA_PERF= -DSHOW_DATA_PERF
|
||||||
FILTER_FULL_MATCH= -DFILTER_FULL_MATCH
|
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
|
||||||
USE_MT_READ= -DUSE_MT_READ
|
USE_MT_READ= -DUSE_MT_READ
|
||||||
|
|
||||||
AR= ar
|
AR= ar
|
||||||
|
|
|
||||||
191
bwamem.c
191
bwamem.c
|
|
@ -142,7 +142,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len,
|
||||||
kv_push(bwtintv_t, a->mem, *p);
|
kv_push(bwtintv_t, a->mem, *p);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//break; // for test full match time
|
|
||||||
} else {
|
} else {
|
||||||
++x;
|
++x;
|
||||||
if (start_flag) ++start_N_num;
|
if (start_flag) ++start_N_num;
|
||||||
|
|
@ -202,7 +201,6 @@ KBTREE_INIT(chn, mem_chain_t, chain_cmp)
|
||||||
// return 1 if the seed is merged into the chain
|
// return 1 if the seed is merged into the chain
|
||||||
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid)
|
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid)
|
||||||
{
|
{
|
||||||
//return 0;
|
|
||||||
int64_t qend, rend, x, y;
|
int64_t qend, rend, x, y;
|
||||||
const mem_seed_t *last = &c->seeds[c->n-1];
|
const mem_seed_t *last = &c->seeds[c->n-1];
|
||||||
qend = last->qbeg + last->len;
|
qend = last->qbeg + last->len;
|
||||||
|
|
@ -340,30 +338,10 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t
|
||||||
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
|
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
|
||||||
mem_seed_t s;
|
mem_seed_t s;
|
||||||
s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
||||||
|
|
||||||
//kv_push(uint64_t, v1, s.rbeg);
|
|
||||||
//fprintf(stderr, "%ld\t", s.rbeg);
|
|
||||||
//if (p->x[0] == 0)
|
|
||||||
// s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + p->x[2] - 1 - k);
|
|
||||||
//else
|
|
||||||
// s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
|
||||||
//kv_push(uint64_t, v2, s.rbeg);
|
|
||||||
//fprintf(stderr, "%ld\t", s.rbeg);
|
|
||||||
//s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + k);
|
|
||||||
//fprintf(stderr, "%ld\n", s.rbeg);
|
|
||||||
//if (s.rbeg < fmt->l_pac) { // 在正链
|
|
||||||
// s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg;
|
|
||||||
//} else { // 在反向互补链
|
|
||||||
// s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg;
|
|
||||||
//}
|
|
||||||
//s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
|
||||||
|
|
||||||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||||||
}
|
}
|
||||||
//fprintf(stderr, "\n");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//fprintf(stderr, "split\n");
|
|
||||||
kv_resize(mem_chain_t, chain, kb_size(tree));
|
kv_resize(mem_chain_t, chain, kb_size(tree));
|
||||||
|
|
||||||
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
|
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
|
||||||
|
|
@ -374,28 +352,6 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t
|
||||||
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
|
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
|
||||||
kb_destroy(chn, tree);
|
kb_destroy(chn, tree);
|
||||||
|
|
||||||
#if 0
|
|
||||||
for (i = 0; i < chain.n; ++i)
|
|
||||||
{
|
|
||||||
fprintf(gf[0], "chain-begin: %d\n", i);
|
|
||||||
int j;
|
|
||||||
fprintf(gf[0], "chain-%ld %d\n", chain.a[i].pos, chain.a[i].rid);
|
|
||||||
for (j = 0; j < chain.a[i].n; ++j) {
|
|
||||||
fprintf(gf[0], "chain-%d %ld %d\n", chain.a[i].seeds[j].qbeg, chain.a[i].seeds[j].rbeg, chain.a[i].seeds[j].len);
|
|
||||||
}
|
|
||||||
fprintf(gf[0], "chain-end\n");
|
|
||||||
}
|
|
||||||
fprintf(gf[0], "fun-end\n");
|
|
||||||
#endif
|
|
||||||
// ks_introsort(uint64_sort, v1.n, v1.a);
|
|
||||||
// ks_introsort(uint64_sort, v2.n, v2.a);
|
|
||||||
// for (i = 0; i < v1.n; ++i) {
|
|
||||||
// //fprintf(stderr, "%ld\t%ld\n", v1.a[i], v2.a[i]);
|
|
||||||
// if (v1.a[i] != v2.a[i]) {
|
|
||||||
// fprintf(stderr, "diff: %ld\t%ld\n", v1.a[i], v2.a[i]);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
return chain;
|
return chain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1270,21 +1226,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void worker1(void *data, int i, int tid)
|
static void worker_sam(void *data, int i, int tid)
|
||||||
{
|
|
||||||
mem_worker_t *w = (mem_worker_t*)data;
|
|
||||||
if (!(w->opt->flag&MEM_F_PE)) {
|
|
||||||
if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name);
|
|
||||||
w->regs[i] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]);
|
|
||||||
} else {
|
|
||||||
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name);
|
|
||||||
w->regs[i<<1|0] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]);
|
|
||||||
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name);
|
|
||||||
w->regs[i<<1|1] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void worker2(void *data, int i, int tid)
|
|
||||||
{
|
{
|
||||||
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
|
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
|
||||||
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
|
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
|
||||||
|
|
@ -1346,8 +1288,6 @@ mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const b
|
||||||
}
|
}
|
||||||
return w;
|
return w;
|
||||||
}
|
}
|
||||||
#include "khash.h"
|
|
||||||
KHASH_MAP_INIT_INT64(seed, uint64_t);
|
|
||||||
|
|
||||||
static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a, int tid)
|
static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a, int tid)
|
||||||
{
|
{
|
||||||
|
|
@ -1357,24 +1297,13 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
||||||
int max_seed_len = 0;
|
int max_seed_len = 0;
|
||||||
int start_N_num = 0, start_flag = 1;
|
int start_N_num = 0, start_flag = 1;
|
||||||
smem->mem.n = 0;
|
smem->mem.n = 0;
|
||||||
//int s1_num = 0;
|
|
||||||
// 1. first pass: find all SMEMs
|
// 1. first pass: find all SMEMs
|
||||||
|
// goto third_seed;
|
||||||
PROF_START(seed_1);
|
PROF_START(seed_1);
|
||||||
// fprintf(gf[0], "read\n");
|
|
||||||
while (x < len) {
|
while (x < len) {
|
||||||
if (seq[x] < 4) {
|
if (seq[x] < 4) {
|
||||||
start_flag = 0;
|
start_flag = 0;
|
||||||
#ifdef DEBUG_FILE_OUTPUT
|
|
||||||
#ifdef COUNT_SEED_LENGTH
|
|
||||||
int last_x = x;
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
|
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
|
||||||
#ifdef DEBUG_FILE_OUTPUT
|
|
||||||
#ifdef COUNT_SEED_LENGTH
|
|
||||||
fprintf(gf[0], "%d\n", x - last_x);
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
for (i = 0; i < a->mem1.n; ++i) {
|
for (i = 0; i < a->mem1.n; ++i) {
|
||||||
bwtintv_t *p = &a->mem1.a[i];
|
bwtintv_t *p = &a->mem1.a[i];
|
||||||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||||
|
|
@ -1383,82 +1312,38 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
||||||
kv_push(bwtintv_t, smem->mem, *p);
|
kv_push(bwtintv_t, smem->mem, *p);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef COUNT_SEED_LENGTH
|
// break; // for test
|
||||||
break; // for test full match time
|
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
++x;
|
++x;
|
||||||
if (start_flag) ++start_N_num;
|
if (start_flag) ++start_N_num;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SHOW_DATA_PERF
|
|
||||||
gdat[0] += 1; // read num ++
|
|
||||||
if (max_seed_len == len - start_N_num)
|
|
||||||
gdat[1] += 1; // seed-1 full match num ++
|
|
||||||
#endif
|
|
||||||
|
|
||||||
PROF_END(tprof[T_SEED_1][tid], seed_1);
|
PROF_END(tprof[T_SEED_1][tid], seed_1);
|
||||||
|
//goto third_seed;
|
||||||
#ifdef FILTER_FULL_MATCH
|
#ifdef FILTER_FULL_MATCH
|
||||||
if (max_seed_len == len - start_N_num) {
|
if (max_seed_len == len - start_N_num) {
|
||||||
|
|
||||||
#ifdef DEBUG_FILE_OUTPUT
|
|
||||||
if (start_N_num == 0) {
|
|
||||||
#ifdef GET_FULL_MATCH_READ
|
|
||||||
for (i = 0; i < len; ++i)
|
|
||||||
fprintf(gf[0], "%c", "ACGT"[seq[i]]);
|
|
||||||
fprintf(gf[0], "\n");
|
|
||||||
#endif
|
|
||||||
#ifdef SHOW_DATA_PERF
|
|
||||||
gdat[2]++;
|
|
||||||
if (gdat[2] % 100 == 0) {
|
|
||||||
fprintf(stderr, "reads num: %ld\n", gdat[2]);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
//goto third_seed;
|
|
||||||
goto collect_intv_end;
|
goto collect_intv_end;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
bwtintv_v mem2 = {0};
|
|
||||||
// 2. second pass: find MEMs inside a long SMEM
|
// 2. second pass: find MEMs inside a long SMEM
|
||||||
uint32_v qev;
|
|
||||||
PROF_START(seed_2);
|
PROF_START(seed_2);
|
||||||
old_n = smem->mem.n;
|
old_n = smem->mem.n;
|
||||||
//fprintf(gf[2], "seed-1\n");
|
|
||||||
for (k = 0; k < old_n; ++k) {
|
for (k = 0; k < old_n; ++k) {
|
||||||
bwtintv_t *p = &smem->mem.a[k];
|
bwtintv_t *p = &smem->mem.a[k];
|
||||||
int start = p->info>>32, end = (int32_t)p->info;
|
int start = p->info>>32, end = (int32_t)p->info;
|
||||||
if (end - start < split_len || p->x[2] > opt->split_width) continue;
|
if (end - start < split_len || p->x[2] > opt->split_width) continue;
|
||||||
//++ s1_num;
|
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
|
||||||
//fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", p->info >> 32, (int)p->info, p->x[0], p->x[1], p->x[2]);
|
|
||||||
// fprintf(gf[0], "start\n");
|
|
||||||
fmt_smem_2(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0], qev);
|
|
||||||
for (i = 0; i < a->mem1.n; ++i) {
|
for (i = 0; i < a->mem1.n; ++i) {
|
||||||
bwtintv_t *p = &a->mem1.a[i];
|
bwtintv_t *p = &a->mem1.a[i];
|
||||||
int slen = (uint32_t)p->info - (p->info >> 32);
|
int slen = (uint32_t)p->info - (p->info >> 32);
|
||||||
// fprintf(gf[0], "%d\n", slen);
|
|
||||||
if (slen >= opt->min_seed_len) {
|
if (slen >= opt->min_seed_len) {
|
||||||
kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
|
kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
|
||||||
kv_push(bwtintv_t, mem2, a->mem1.a[i]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// fprintf(gf[0], "end\n");
|
|
||||||
}
|
}
|
||||||
//fprintf(gf[2], "seed-2\n");
|
|
||||||
PROF_END(tprof[T_SEED_2][tid], seed_2);
|
PROF_END(tprof[T_SEED_2][tid], seed_2);
|
||||||
//if (s1_num > 1) {
|
|
||||||
//ks_introsort(mem_intv, mem2.n, mem2.a);
|
|
||||||
//for (i = 0; i < mem2.n; ++i) {
|
|
||||||
// fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", mem2.a[i].info >> 32, (int)mem2.a[i].info, mem2.a[i].x[0], mem2.a[i].x[1], mem2.a[i].x[2]);
|
|
||||||
//}
|
|
||||||
//fprintf(gf[2], "\n");
|
|
||||||
//}
|
|
||||||
|
|
||||||
// third_seed:
|
//third_seed:
|
||||||
#if 1
|
|
||||||
// 3. third pass: LAST-like
|
// 3. third pass: LAST-like
|
||||||
PROF_START(seed_3);
|
PROF_START(seed_3);
|
||||||
if (opt->max_mem_intv > 0) {
|
if (opt->max_mem_intv > 0) {
|
||||||
|
|
@ -1468,32 +1353,23 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
||||||
bwtintv_t m;
|
bwtintv_t m;
|
||||||
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||||||
if (m.x[2] > 0) {
|
if (m.x[2] > 0) {
|
||||||
// fprintf(stderr, "3: %ld %ld %ld\n", m.info >> 32, m.info & ((1L << 32) - 1), m.x[2]);
|
|
||||||
kv_push(bwtintv_t, smem->mem, m);
|
kv_push(bwtintv_t, smem->mem, m);
|
||||||
}
|
}
|
||||||
} else ++x;
|
} else ++x;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
PROF_END(tprof[T_SEED_3][tid], seed_3);
|
PROF_END(tprof[T_SEED_3][tid], seed_3);
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef FILTER_FULL_MATCH
|
#ifdef FILTER_FULL_MATCH
|
||||||
collect_intv_end:
|
collect_intv_end:
|
||||||
#endif
|
#endif
|
||||||
// sort
|
// sort
|
||||||
ks_introsort(mem_intv, smem->mem.n, smem->mem.a);
|
ks_introsort(mem_intv, smem->mem.n, smem->mem.a);
|
||||||
|
|
||||||
|
|
||||||
khash_t(str) * h;
|
|
||||||
//for (i = 0; i < smem->mem.n; ++i) {
|
|
||||||
// fprintf(stderr, "seed %d: %ld %ld %ld\n", i, smem->mem.a[i].x[0], smem->mem.a[i].x[1], smem->mem.a[i].x[2]);
|
|
||||||
//}
|
|
||||||
//fprintf(stderr, "split\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv, int tid)
|
void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv, int tid)
|
||||||
{
|
{
|
||||||
int i;
|
int i, j;
|
||||||
|
|
||||||
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
|
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
|
||||||
mem_collect_intv_batch(opt, fmt, len, seq, smemv, aux, tid);
|
mem_collect_intv_batch(opt, fmt, len, seq, smemv, aux, tid);
|
||||||
|
|
@ -1503,12 +1379,10 @@ void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t
|
||||||
int step, count; // seed length
|
int step, count; // seed length
|
||||||
int64_t k;
|
int64_t k;
|
||||||
if (p->num_match > 0) {
|
if (p->num_match > 0) {
|
||||||
uint64_t pos = p->rm[0].rs;
|
for (j = 0; j < p->num_match; ++j)
|
||||||
if (p->rm[0].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
|
{
|
||||||
kv_push(uint64_t, smemv->pos_arr, pos);
|
uint64_t pos = p->rm[j].rs;
|
||||||
if (p->num_match > 1) {
|
if (p->rm[j].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
|
||||||
uint64_t pos = p->rm[1].rs;
|
|
||||||
if (p->rm[1].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
|
|
||||||
kv_push(uint64_t, smemv->pos_arr, pos);
|
kv_push(uint64_t, smemv->pos_arr, pos);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -1575,14 +1449,11 @@ void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *b
|
||||||
int step, count, slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
int step, count, slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||||
int64_t k;
|
int64_t k;
|
||||||
if (p->num_match > 0) {
|
if (p->num_match > 0) {
|
||||||
|
|
||||||
|
for (k = 0; k < p->num_match; ++k)
|
||||||
|
{
|
||||||
mem_seed_t s;
|
mem_seed_t s;
|
||||||
s.rbeg = smemv->pos_arr.a[j++];
|
s.rbeg = smemv->pos_arr.a[j++];
|
||||||
//fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info>>32, (uint32_t)p->info);
|
|
||||||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
|
||||||
if (p->num_match > 1) {
|
|
||||||
mem_seed_t s;
|
|
||||||
s.rbeg = smemv->pos_arr.a[j++];
|
|
||||||
//fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info >> 32, (uint32_t)p->info);
|
|
||||||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -1740,36 +1611,6 @@ static void worker_smem_align(void *data, int i, int tid)
|
||||||
mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid);
|
mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void worker_sam(void *data, int i, int tid)
|
|
||||||
{
|
|
||||||
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
|
|
||||||
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
|
|
||||||
extern void mem_matesw_batch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t *s, mem_alnreg_v *a, int data_size);
|
|
||||||
mem_worker_t *w = (mem_worker_t*)data;
|
|
||||||
int start = i*w->opt->batch_size;
|
|
||||||
int end = MIN(start + w->opt->batch_size, w->n_reads);
|
|
||||||
|
|
||||||
if (!(w->opt->flag&MEM_F_PE)) {
|
|
||||||
for (i=start; i<end; ++i) {
|
|
||||||
if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
|
|
||||||
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
|
|
||||||
if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]);
|
|
||||||
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, &w->sams[i]);
|
|
||||||
free(w->regs[i].a);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (!(w->opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
|
|
||||||
mem_matesw_batch(w->opt, w->bns, w->pac, w->pes, &w->seqs[start], &w->regs[start], end-start);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i=start; i<end; i+=2) {
|
|
||||||
if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i].name);
|
|
||||||
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed+i)>>1, &w->seqs[i], &w->regs[i], &w->sams[i], tid);
|
|
||||||
free(w->regs[i].a); free(w->regs[i+1].a);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, seq_sam_t *sams)
|
void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, seq_sam_t *sams)
|
||||||
{
|
{
|
||||||
extern void kt_for(int n_threads, void (*func)(void *, int, int), void *data, int n);
|
extern void kt_for(int n_threads, void (*func)(void *, int, int), void *data, int n);
|
||||||
|
|
@ -1801,7 +1642,6 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed
|
||||||
PROF_END(gprof[G_MEM_PREPARE], mem_prepare);
|
PROF_END(gprof[G_MEM_PREPARE], mem_prepare);
|
||||||
|
|
||||||
PROF_START(kernel);
|
PROF_START(kernel);
|
||||||
// kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
|
|
||||||
kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions
|
kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions
|
||||||
PROF_END(gprof[G_MEM_KERNEL], kernel);
|
PROF_END(gprof[G_MEM_KERNEL], kernel);
|
||||||
|
|
||||||
|
|
@ -1814,8 +1654,7 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed
|
||||||
PROF_END(gprof[G_MEM_PESTAT], pestat);
|
PROF_END(gprof[G_MEM_PESTAT], pestat);
|
||||||
|
|
||||||
PROF_START(mem_sam);
|
PROF_START(mem_sam);
|
||||||
kt_for(opt->n_threads, worker2, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
|
kt_for(opt->n_threads, worker_sam, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
|
||||||
//kt_for(opt->n_threads, worker_sam, w, n_batch);
|
|
||||||
PROF_END(gprof[G_MEM_SAM], mem_sam);
|
PROF_END(gprof[G_MEM_SAM], mem_sam);
|
||||||
|
|
||||||
//free(w.regs);
|
//free(w.regs);
|
||||||
|
|
|
||||||
12
bwt.c
12
bwt.c
|
|
@ -402,28 +402,28 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
|
||||||
}
|
}
|
||||||
|
|
||||||
// 创建正向的kmer
|
// 创建正向的kmer
|
||||||
inline uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
|
inline uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
|
||||||
{
|
{
|
||||||
uint32_t qbit = 0, i;
|
uint64_t qbit = 0, i;
|
||||||
qlen = qlen < kmer_len ? qlen : kmer_len;
|
qlen = qlen < kmer_len ? qlen : kmer_len;
|
||||||
for (i = 0; i < qlen; ++i) {
|
for (i = 0; i < qlen; ++i) {
|
||||||
if (q[i] > 3) break; // 要考虑碱基是N
|
if (q[i] > 3) break; // 要考虑碱基是N
|
||||||
qbit |= q[i] << ((kmer_len - 1 - i) << 1);
|
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
|
||||||
}
|
}
|
||||||
*base_consumed = i;
|
*base_consumed = i;
|
||||||
return qbit;
|
return qbit;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 创建f反向的kmer
|
// 创建f反向的kmer
|
||||||
inline uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
|
inline uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
|
||||||
{
|
{
|
||||||
uint32_t qbit = 0;
|
uint64_t qbit = 0;
|
||||||
int i, j, end_pos;
|
int i, j, end_pos;
|
||||||
end_pos = start_pos - kmer_len;
|
end_pos = start_pos - kmer_len;
|
||||||
end_pos = end_pos < 0 ? -1 : end_pos;
|
end_pos = end_pos < 0 ? -1 : end_pos;
|
||||||
for (i = start_pos, j = 0; i > end_pos; --i, ++j) {
|
for (i = start_pos, j = 0; i > end_pos; --i, ++j) {
|
||||||
if (q[i] > 3) break; // 要考虑碱基是N
|
if (q[i] > 3) break; // 要考虑碱基是N
|
||||||
qbit |= q[i] << ((kmer_len - 1 - j) << 1);
|
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - j) << 1);
|
||||||
}
|
}
|
||||||
*base_consumed = start_pos - i;
|
*base_consumed = start_pos - i;
|
||||||
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
|
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
|
||||||
|
|
|
||||||
4
bwt.h
4
bwt.h
|
|
@ -152,8 +152,8 @@ extern "C" {
|
||||||
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
|
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
|
||||||
void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos);
|
void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos);
|
||||||
|
|
||||||
uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed);
|
uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed);
|
||||||
uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed);
|
uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed);
|
||||||
|
|
||||||
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
|
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
|
||||||
void bwt_gen_cnt_table(bwt_t *bwt);
|
void bwt_gen_cnt_table(bwt_t *bwt);
|
||||||
|
|
|
||||||
|
|
@ -656,7 +656,7 @@ int main_mem(int argc, char *argv[])
|
||||||
|
|
||||||
#ifdef SHOW_DATA_PERF
|
#ifdef SHOW_DATA_PERF
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "full_match: %ld, all: %ld\n", gdat[1], gdat[0]);
|
// fprintf(stderr, "full_match: %ld, all: %ld\n", gdat[1], gdat[0]);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
||||||
478
fmt_idx.c
478
fmt_idx.c
|
|
@ -837,8 +837,6 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int
|
||||||
PUSH_VAL_AND_SKIP_FORWARD(ik);
|
PUSH_VAL_AND_SKIP_FORWARD(ik);
|
||||||
|
|
||||||
// 扩展kmer之后的碱基
|
// 扩展kmer之后的碱基
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
|
||||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||||
{ // forward search
|
{ // forward search
|
||||||
if (q[i] < 4 && q[i + 1] < 4)
|
if (q[i] < 4 && q[i + 1] < 4)
|
||||||
|
|
@ -846,20 +844,11 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int
|
||||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||||
{
|
{
|
||||||
#ifdef DEBUG_FILE_OUTPUT
|
|
||||||
#ifdef COUNT_SEED_LENGTH
|
|
||||||
fprintf(gf[0], "%d\t", i + 2 - x);
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
|
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
|
||||||
#ifdef DEBUG_FILE_OUTPUT
|
|
||||||
#if 0
|
|
||||||
fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
kv_push(bwtintv_t, *mem, mt);
|
kv_push(bwtintv_t, *mem, mt);
|
||||||
ret = (uint32_t)mt.info;
|
ret = (uint32_t)mt.info;
|
||||||
goto fmt_smem_forward_end;
|
goto fmt_smem_forward_end;
|
||||||
|
|
@ -939,7 +928,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
||||||
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置
|
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置
|
||||||
ik.info = x + 1;
|
ik.info = x + 1;
|
||||||
//int print_flag = 0;
|
|
||||||
// check change of the interval size and whether the interval size is too small to be extended further
|
// check change of the interval size and whether the interval size is too small to be extended further
|
||||||
#define CHECK_INTV_CHANGE(iv, ov, end_pos) \
|
#define CHECK_INTV_CHANGE(iv, ov, end_pos) \
|
||||||
if (ov.x[2] != iv.x[2]) \
|
if (ov.x[2] != iv.x[2]) \
|
||||||
|
|
@ -958,22 +946,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
||||||
goto backward_search; \
|
goto backward_search; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define DIRECT_EXTEND_2_SEED(idx0, idx1, qspos) \
|
|
||||||
mt.rm[idx1].qs = MAX(mt.rm[idx0].qs, lm->rm[0].qs); \
|
|
||||||
mt.rm[idx1].qe = MIN(mt.rm[idx0].qe, lm->rm[0].qe); \
|
|
||||||
mt.rm[idx0].qs = mt.rm[idx1].qs; \
|
|
||||||
mt.rm[idx0].qe = mt.rm[idx1].qe; \
|
|
||||||
mt.rm[idx1].reverse = lm->rm[0].reverse; \
|
|
||||||
left_ext = qspos - mt.rm[0].qs; \
|
|
||||||
if (mt.rm[0].reverse) \
|
|
||||||
mt.rm[0].rs = (fmt->l_pac << 1) - 1 - (s1 - left_ext); \
|
|
||||||
else \
|
|
||||||
mt.rm[0].rs = s1 - left_ext; \
|
|
||||||
if (mt.rm[1].reverse) \
|
|
||||||
mt.rm[1].rs = (fmt->l_pac << 1) - 1 - (s2 - left_ext); \
|
|
||||||
else \
|
|
||||||
mt.rm[1].rs = s2 - left_ext;
|
|
||||||
|
|
||||||
// 处理kmer对应的匹配信息
|
// 处理kmer对应的匹配信息
|
||||||
for (curr->n = 0, j = 1; j < kmer_len; ++j)
|
for (curr->n = 0, j = 1; j < kmer_len; ++j)
|
||||||
{
|
{
|
||||||
|
|
@ -984,8 +956,6 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
||||||
PUSH_VAL_AND_SKIP(ik);
|
PUSH_VAL_AND_SKIP(ik);
|
||||||
|
|
||||||
// 扩展kmer之后的碱基
|
// 扩展kmer之后的碱基
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
|
||||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||||
{ // forward search
|
{ // forward search
|
||||||
if (q[i] < 4 && q[i + 1] < 4)
|
if (q[i] < 4 && q[i + 1] < 4)
|
||||||
|
|
@ -1000,49 +970,12 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
||||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||||
{
|
{
|
||||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0);
|
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0);
|
||||||
//fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
|
||||||
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
||||||
ret = (uint32_t)mt.info;
|
ret = (uint32_t)mt.info;
|
||||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||||
goto fmt_smem_end;
|
goto fmt_smem_end;
|
||||||
goto backward_search;
|
goto backward_search;
|
||||||
}
|
}
|
||||||
#if 0
|
|
||||||
// 判断只有两个候选的情况
|
|
||||||
else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) {
|
|
||||||
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
|
|
||||||
bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1);
|
|
||||||
int left_ext;
|
|
||||||
bwtint_t lmrp = lm->rm[0].rs;
|
|
||||||
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
|
|
||||||
|
|
||||||
if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集
|
|
||||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1);
|
|
||||||
DIRECT_EXTEND_2_SEED(1, 0, x);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0);
|
|
||||||
DIRECT_EXTEND_2_SEED(0, 1, x);
|
|
||||||
}
|
|
||||||
mt.info = mt.rm[0].qs;
|
|
||||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
|
||||||
mt.num_match = 2;
|
|
||||||
mt.x[2] = 2;
|
|
||||||
kv_push(bwtintv_t, *mem, mt);
|
|
||||||
ret = (uint32_t)mt.info;
|
|
||||||
|
|
||||||
//fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
|
|
||||||
//fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2);
|
|
||||||
//s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
|
|
||||||
//s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
|
|
||||||
//fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
|
|
||||||
|
|
||||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
|
||||||
goto fmt_smem_end;
|
|
||||||
goto backward_search;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
else if (q[i] < 4) // q[i+1] >= 4
|
else if (q[i] < 4) // q[i+1] >= 4
|
||||||
|
|
@ -1082,11 +1015,7 @@ backward_search:
|
||||||
(intv).info |= (uint64_t)(pos) << 32; \
|
(intv).info |= (uint64_t)(pos) << 32; \
|
||||||
kv_push(bwtintv_t, *mem, (intv)); \
|
kv_push(bwtintv_t, *mem, (intv)); \
|
||||||
}
|
}
|
||||||
// if (flag)
|
|
||||||
//{
|
|
||||||
// fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1));
|
|
||||||
// flag = 0;
|
|
||||||
// }
|
|
||||||
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
|
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
|
||||||
if (ok.x[2] < min_intv) \
|
if (ok.x[2] < min_intv) \
|
||||||
{ \
|
{ \
|
||||||
|
|
@ -1098,8 +1027,6 @@ backward_search:
|
||||||
for (j = 0; j < curr->n; ++j)
|
for (j = 0; j < curr->n; ++j)
|
||||||
{
|
{
|
||||||
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
|
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
|
|
||||||
if (p->info - x < HASH_KMER_LEN)
|
if (p->info - x < HASH_KMER_LEN)
|
||||||
{
|
{
|
||||||
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
|
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
|
||||||
|
|
@ -1127,7 +1054,6 @@ backward_search:
|
||||||
{
|
{
|
||||||
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
|
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
|
||||||
{
|
{
|
||||||
// fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
|
||||||
fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
|
||||||
|
|
@ -1136,77 +1062,9 @@ backward_search:
|
||||||
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
|
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
|
||||||
ok2.info = p->info;
|
ok2.info = p->info;
|
||||||
*p = ok2;
|
*p = ok2;
|
||||||
#if 0
|
|
||||||
// 在这里进行判断是否只有一个候选了
|
|
||||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
|
||||||
{
|
|
||||||
int qs = i - 1;
|
|
||||||
int qe = (int)ok2.info;
|
|
||||||
if (mem->n > 0)
|
|
||||||
{
|
|
||||||
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
|
|
||||||
int qel = (int)(mem->a[mem->n - 1].info);
|
|
||||||
if (qsl <= qs && qel >= qe)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
|
|
||||||
// fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
|
||||||
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
|
||||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
|
||||||
goto fmt_smem_end;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#if 0
|
|
||||||
if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len)
|
|
||||||
{
|
|
||||||
int qs = i - 1;
|
|
||||||
int qe = (int)ok2.info;
|
|
||||||
if (mem->n > 0)
|
|
||||||
{
|
|
||||||
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
|
|
||||||
int qel = (int)(mem->a[mem->n - 1].info);
|
|
||||||
if (qsl <= qs && qel >= qe)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#if 1
|
|
||||||
int left_ext;
|
|
||||||
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
|
|
||||||
bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1);
|
|
||||||
|
|
||||||
bwtint_t lmrp = lm->rm[0].rs;
|
|
||||||
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
|
|
||||||
|
|
||||||
if (lmrp + qs == s1 + lm->rm[0].qs)
|
|
||||||
{ // s1是lm的子集
|
|
||||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1);
|
|
||||||
DIRECT_EXTEND_2_SEED(1, 0, qs);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0);
|
|
||||||
DIRECT_EXTEND_2_SEED(0, 1, qs);
|
|
||||||
}
|
|
||||||
mt.info = mt.rm[0].qs;
|
|
||||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
|
||||||
mt.num_match = 2;
|
|
||||||
mt.x[2] = 2;
|
|
||||||
kv_push(bwtintv_t, *mem, mt);
|
|
||||||
// fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
|
|
||||||
// fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2);
|
|
||||||
// s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
|
|
||||||
// s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
|
|
||||||
// fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
|
|
||||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
|
||||||
goto fmt_smem_end;
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
else if (q[i] < 4) // 只能扩展一个
|
else if (q[i] < 4) // 只能扩展一个
|
||||||
{
|
{
|
||||||
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
|
|
||||||
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
||||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||||
ok1.info = p->info;
|
ok1.info = p->info;
|
||||||
|
|
@ -1223,7 +1081,6 @@ backward_search:
|
||||||
{ // 扩展到了第一个碱基
|
{ // 扩展到了第一个碱基
|
||||||
if (q[i] < 4)
|
if (q[i] < 4)
|
||||||
{
|
{
|
||||||
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
|
|
||||||
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
||||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||||
ok1.info = p->info;
|
ok1.info = p->info;
|
||||||
|
|
@ -1243,316 +1100,6 @@ backward_search:
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt_smem_end:
|
fmt_smem_end:
|
||||||
//if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n");
|
|
||||||
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 找smem(seed)(lm: long_smem)
|
|
||||||
int fmt_smem_2(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec, uint32_v qev)
|
|
||||||
{
|
|
||||||
// int flag = 0;
|
|
||||||
int i, j, ret, kmer_len;
|
|
||||||
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
|
|
||||||
bwtintv_t mt = {0};
|
|
||||||
bwtintv_v *curr;
|
|
||||||
uint32_t qbit = 0;
|
|
||||||
mem->n = 0;
|
|
||||||
curr = tmpvec; // use the temporary vector if provided
|
|
||||||
|
|
||||||
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
|
||||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置
|
|
||||||
ik.info = x + 1;
|
|
||||||
//int print_flag = 0;
|
|
||||||
// check change of the interval size and whether the interval size is too small to be extended further
|
|
||||||
#define CHECK_INTV_CHANGE_2(iv, ov, end_pos) \
|
|
||||||
if (ov.x[2] != iv.x[2]) \
|
|
||||||
{ \
|
|
||||||
\
|
|
||||||
kv_push(bwtintv_t, *curr, iv); \
|
|
||||||
if (ov.x[2] < min_intv) \
|
|
||||||
break; \
|
|
||||||
} \
|
|
||||||
iv = ov; \
|
|
||||||
iv.info = end_pos
|
|
||||||
|
|
||||||
#define PUSH_VAL_AND_SKIP_2(iv) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
kv_push(bwtintv_t, *curr, iv); \
|
|
||||||
goto backward_search; \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
// 处理kmer对应的匹配信息
|
|
||||||
for (curr->n = 0, j = 1; j < kmer_len; ++j)
|
|
||||||
{
|
|
||||||
bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j);
|
|
||||||
CHECK_INTV_CHANGE_2(ik, ok1, x + j + 1);
|
|
||||||
}
|
|
||||||
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
|
|
||||||
PUSH_VAL_AND_SKIP_2(ik);
|
|
||||||
|
|
||||||
// 扩展kmer之后的碱基
|
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
|
||||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
|
||||||
{ // forward search
|
|
||||||
if (q[i] < 4 && q[i + 1] < 4)
|
|
||||||
{
|
|
||||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
|
||||||
CHECK_INTV_CHANGE_2(ik, ok1, i + 1);
|
|
||||||
CHECK_INTV_CHANGE_2(ik, ok2, i + 2);
|
|
||||||
#if 1
|
|
||||||
// 在这里进行判断是否只有一个候选了
|
|
||||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
|
||||||
{
|
|
||||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0);
|
|
||||||
//fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
|
||||||
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
|
||||||
ret = (uint32_t)mt.info;
|
|
||||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
|
||||||
goto fmt_smem_end;
|
|
||||||
goto backward_search;
|
|
||||||
}
|
|
||||||
#if 1
|
|
||||||
// 判断只有两个候选的情况
|
|
||||||
else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) {
|
|
||||||
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
|
|
||||||
bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1);
|
|
||||||
int left_ext;
|
|
||||||
bwtint_t lmrp = lm->rm[0].rs;
|
|
||||||
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
|
|
||||||
|
|
||||||
if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集
|
|
||||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1);
|
|
||||||
DIRECT_EXTEND_2_SEED(1, 0, x);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0);
|
|
||||||
DIRECT_EXTEND_2_SEED(0, 1, x);
|
|
||||||
}
|
|
||||||
mt.info = mt.rm[0].qs;
|
|
||||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
|
||||||
mt.num_match = 2;
|
|
||||||
mt.x[2] = 2;
|
|
||||||
kv_push(bwtintv_t, *mem, mt);
|
|
||||||
ret = (uint32_t)mt.info;
|
|
||||||
|
|
||||||
//fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
|
|
||||||
//fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2);
|
|
||||||
//s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
|
|
||||||
//s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
|
|
||||||
//fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
|
|
||||||
|
|
||||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
|
||||||
goto fmt_smem_end;
|
|
||||||
goto backward_search;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
else if (q[i] < 4) // q[i+1] >= 4
|
|
||||||
{
|
|
||||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
|
||||||
CHECK_INTV_CHANGE_2(ik, ok1, i + 1);
|
|
||||||
PUSH_VAL_AND_SKIP_2(ik);
|
|
||||||
}
|
|
||||||
else // q[i] >= 4
|
|
||||||
{
|
|
||||||
PUSH_VAL_AND_SKIP_2(ik);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (; i == len - 1; ++i) // 扩展到了最后一个碱基
|
|
||||||
{
|
|
||||||
if (q[i] < 4)
|
|
||||||
{
|
|
||||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
|
||||||
CHECK_INTV_CHANGE_2(ik, ok1, i + 1);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
PUSH_VAL_AND_SKIP_2(ik);
|
|
||||||
}
|
|
||||||
if (i == len)
|
|
||||||
kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
|
|
||||||
|
|
||||||
backward_search:
|
|
||||||
fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
|
|
||||||
if (mt.num_match == 0)
|
|
||||||
ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置
|
|
||||||
else
|
|
||||||
ret = (uint32_t)mt.info;
|
|
||||||
// 按照种子进行遍历,反向扩展
|
|
||||||
#define CHECK_ADD_MEM(pos, intv, mem) \
|
|
||||||
if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) \
|
|
||||||
{ \
|
|
||||||
(intv).info |= (uint64_t)(pos) << 32; \
|
|
||||||
kv_push(bwtintv_t, *mem, (intv)); \
|
|
||||||
}
|
|
||||||
// if (flag)
|
|
||||||
//{
|
|
||||||
// fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1));
|
|
||||||
// flag = 0;
|
|
||||||
// }
|
|
||||||
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
|
|
||||||
if (ok.x[2] < min_intv) \
|
|
||||||
{ \
|
|
||||||
CHECK_ADD_MEM(pos, intv, mem); \
|
|
||||||
break; \
|
|
||||||
}
|
|
||||||
|
|
||||||
int last_kmer_start = 0;
|
|
||||||
for (j = 0; j < curr->n; ++j)
|
|
||||||
{
|
|
||||||
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
|
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
|
|
||||||
if (p->info - x < HASH_KMER_LEN)
|
|
||||||
{
|
|
||||||
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
|
|
||||||
qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer
|
|
||||||
else
|
|
||||||
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
|
|
||||||
last_kmer_start = p->info - 1;
|
|
||||||
i = 1;
|
|
||||||
do
|
|
||||||
{
|
|
||||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++);
|
|
||||||
} while (ik.x[2] < min_intv);
|
|
||||||
if (i > 2)
|
|
||||||
continue;
|
|
||||||
p->x[0] = ik.x[1];
|
|
||||||
p->x[1] = ik.x[0];
|
|
||||||
p->x[2] = ik.x[2];
|
|
||||||
i = p->info - (kmer_len - i + 3);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
i = x - 1;
|
|
||||||
}
|
|
||||||
for (; i > 0; i -= 2)
|
|
||||||
{
|
|
||||||
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
|
|
||||||
{
|
|
||||||
// fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
|
||||||
fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
|
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
|
|
||||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
|
||||||
ok1.info = p->info;
|
|
||||||
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
|
|
||||||
ok2.info = p->info;
|
|
||||||
*p = ok2;
|
|
||||||
#if 0
|
|
||||||
// 在这里进行判断是否只有一个候选了
|
|
||||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
|
||||||
{
|
|
||||||
int qs = i - 1;
|
|
||||||
int qe = (int)ok2.info;
|
|
||||||
if (mem->n > 0)
|
|
||||||
{
|
|
||||||
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
|
|
||||||
int qel = (int)(mem->a[mem->n - 1].info);
|
|
||||||
if (qsl <= qs && qel >= qe)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0);
|
|
||||||
// fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
|
||||||
kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len
|
|
||||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
|
||||||
goto fmt_smem_end;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#if 0
|
|
||||||
if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len)
|
|
||||||
{
|
|
||||||
int qs = i - 1;
|
|
||||||
int qe = (int)ok2.info;
|
|
||||||
if (mem->n > 0)
|
|
||||||
{
|
|
||||||
int qsl = (int)(mem->a[mem->n - 1].info >> 32);
|
|
||||||
int qel = (int)(mem->a[mem->n - 1].info);
|
|
||||||
if (qsl <= qs && qel >= qe)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#if 1
|
|
||||||
int left_ext;
|
|
||||||
bwtint_t s1 = fmt_sa(fmt, ok2.x[0]);
|
|
||||||
bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1);
|
|
||||||
|
|
||||||
bwtint_t lmrp = lm->rm[0].rs;
|
|
||||||
if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp;
|
|
||||||
|
|
||||||
if (lmrp + qs == s1 + lm->rm[0].qs)
|
|
||||||
{ // s1是lm的子集
|
|
||||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1);
|
|
||||||
DIRECT_EXTEND_2_SEED(1, 0, qs);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0);
|
|
||||||
DIRECT_EXTEND_2_SEED(0, 1, qs);
|
|
||||||
}
|
|
||||||
mt.info = mt.rm[0].qs;
|
|
||||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
|
||||||
mt.num_match = 2;
|
|
||||||
mt.x[2] = 2;
|
|
||||||
kv_push(bwtintv_t, *mem, mt);
|
|
||||||
// fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse);
|
|
||||||
// fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2);
|
|
||||||
// s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1;
|
|
||||||
// s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2;
|
|
||||||
// fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2);
|
|
||||||
if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
|
||||||
goto fmt_smem_end;
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
else if (q[i] < 4) // 只能扩展一个
|
|
||||||
{
|
|
||||||
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
|
|
||||||
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
|
||||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
|
||||||
ok1.info = p->info;
|
|
||||||
CHECK_ADD_MEM(i, ok1, mem);
|
|
||||||
goto fmt_smem_end;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{ // 不能扩展
|
|
||||||
CHECK_ADD_MEM(i + 1, *p, mem);
|
|
||||||
goto fmt_smem_end;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (; i == 0; --i)
|
|
||||||
{ // 扩展到了第一个碱基
|
|
||||||
if (q[i] < 4)
|
|
||||||
{
|
|
||||||
// fmt_extend1(fmt, p, &ok1, 1, q[i]);
|
|
||||||
fmt_direct_extend1(fmt, p, &ok1, 1, q[i]);
|
|
||||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
|
||||||
ok1.info = p->info;
|
|
||||||
*p = ok1;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
CHECK_ADD_MEM(i + 1, *p, mem);
|
|
||||||
goto fmt_smem_end;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (i == -1)
|
|
||||||
{
|
|
||||||
CHECK_ADD_MEM(i + 1, *p, mem);
|
|
||||||
goto fmt_smem_end;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt_smem_end:
|
|
||||||
//if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n");
|
|
||||||
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
@ -1572,9 +1119,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
||||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - 1); // 初始碱基位置
|
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - 1); // 初始碱基位置
|
||||||
ik.info = x + kmer_len;
|
ik.info = x + kmer_len;
|
||||||
|
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
|
||||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
|
||||||
|
|
||||||
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
|
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
|
||||||
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
|
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
|
||||||
{ \
|
{ \
|
||||||
|
|
@ -1590,7 +1134,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
||||||
{
|
{
|
||||||
if (q[i] < 4 && q[i + 1] < 4)
|
if (q[i] < 4 && q[i + 1] < 4)
|
||||||
{
|
{
|
||||||
// fmt_direct_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
|
||||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||||
|
|
@ -1610,18 +1153,15 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
||||||
if (q[i] < 4 && q[i + 1] < 4)
|
if (q[i] < 4 && q[i + 1] < 4)
|
||||||
{
|
{
|
||||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||||
// fmt_direct2_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||||
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||||
COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
|
COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
|
||||||
ik = ok2;
|
ik = ok2;
|
||||||
//fprintf(stderr, "d: %d %ld\n", i, ok2.x[2]);
|
|
||||||
}
|
}
|
||||||
else if (q[i] < 4) // q[i+1] >= 4
|
else if (q[i] < 4) // q[i+1] >= 4
|
||||||
{
|
{
|
||||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||||
//fprintf(stderr, "d: %d %ld\n", i, ok1.x[2]);
|
|
||||||
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||||
return i + 2;
|
return i + 2;
|
||||||
}
|
}
|
||||||
|
|
@ -1633,7 +1173,6 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
||||||
if (i == len - 1 && q[i] < 4)
|
if (i == len - 1 && q[i] < 4)
|
||||||
{
|
{
|
||||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||||
//fprintf(stderr, "d: %d %ld\n", i, ok1.x[2]);
|
|
||||||
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||||
}
|
}
|
||||||
return len;
|
return len;
|
||||||
|
|
@ -1659,18 +1198,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_
|
||||||
*b1 = base2 & 3;
|
*b1 = base2 & 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
// k, k1, k2都是bwt矩阵对应的行
|
|
||||||
inline static void fmt_previous_line_old(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
|
||||||
{
|
|
||||||
uint8_t b1, b2;
|
|
||||||
bwtint_t tk[4], kk;
|
|
||||||
kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1)
|
|
||||||
fmt_get_previous_base(fmt, kk, &b1, &b2);
|
|
||||||
fmt_e2_occ(fmt, k, b1, b2, tk);
|
|
||||||
*k1 = fmt->L2[b1] + tk[1];
|
|
||||||
*k2 = fmt->L2[b2] + tk[3];
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
||||||
{
|
{
|
||||||
uint8_t b1, b2;
|
uint8_t b1, b2;
|
||||||
|
|
@ -1741,7 +1268,6 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
|
||||||
{
|
{
|
||||||
++sa;
|
++sa;
|
||||||
fmt_previous_line(fmt, k, &k1, &k2);
|
fmt_previous_line(fmt, k, &k1, &k2);
|
||||||
//fmt_previous_line_old(fmt, k, &k1, &k2);
|
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
|
||||||
if (!(k1 & mask))
|
if (!(k1 & mask))
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -106,5 +106,6 @@ int display_stats(int nthreads)
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
@ -26,8 +26,8 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SHOW_DATA_PERF
|
#ifdef SHOW_DATA_PERF
|
||||||
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0};
|
extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD];
|
||||||
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0};
|
extern int64_t gdat[LIM_GLOBAL_DATA_TYPE];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SHOW_PERF
|
#ifdef SHOW_PERF
|
||||||
|
|
@ -81,7 +81,10 @@ enum
|
||||||
T_BSW,
|
T_BSW,
|
||||||
T_BSW_ALL,
|
T_BSW_ALL,
|
||||||
T_SAM_MATESW,
|
T_SAM_MATESW,
|
||||||
T_SAM_REG2ALN
|
T_SAM_REG2ALN,
|
||||||
|
T_SEED_3_1,
|
||||||
|
T_SEED_3_2,
|
||||||
|
T_SEED_3_3
|
||||||
};
|
};
|
||||||
|
|
||||||
int display_stats(int);
|
int display_stats(int);
|
||||||
|
|
|
||||||
14
run.sh
14
run.sh
|
|
@ -1,6 +1,8 @@
|
||||||
thread=1
|
thread=1
|
||||||
|
|
||||||
## d1
|
## d1 k18<=4 89%
|
||||||
|
#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq
|
||||||
|
#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq
|
||||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/1w_1.fq
|
#n_r1=~/data/fastq/dataset/na12878_wes_144/1w_1.fq
|
||||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/1w_2.fq
|
#n_r2=~/data/fastq/dataset/na12878_wes_144/1w_2.fq
|
||||||
n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq
|
n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq
|
||||||
|
|
@ -9,22 +11,22 @@ n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq
|
||||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq
|
#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq
|
||||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz
|
#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz
|
||||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz
|
#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz
|
||||||
## d2
|
## d2 <= 4 87%
|
||||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
|
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
|
||||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
|
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
|
||||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq
|
#n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq
|
||||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq
|
#n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq
|
||||||
# d3
|
# d3 <= 4 77%
|
||||||
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
|
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
|
||||||
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
|
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
|
||||||
#n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz
|
#n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz
|
||||||
#n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz
|
#n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz
|
||||||
## d4
|
## d4 <= 4 93%
|
||||||
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
|
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
|
||||||
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
|
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
|
||||||
#n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz
|
#n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz
|
||||||
#n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz
|
#n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz
|
||||||
## d5
|
## d5 <= 4 80%
|
||||||
#n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz
|
#n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz
|
||||||
#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz
|
#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz
|
||||||
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
|
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
|
||||||
|
|
@ -38,7 +40,7 @@ reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta
|
||||||
|
|
||||||
#out=~/data1/out-u8-1.sam
|
#out=~/data1/out-u8-1.sam
|
||||||
#out=~/data1/out-i16.sam
|
#out=~/data1/out-i16.sam
|
||||||
out=~/data1/out.sam
|
out=~/data1/fmt-out.sam
|
||||||
#out=/dev/null
|
#out=/dev/null
|
||||||
|
|
||||||
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
#!/bin/env python3
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
back_num = 0
|
||||||
|
end_num = 0
|
||||||
|
seed2_file = sys.argv[1]
|
||||||
|
with open(seed2_file, 'r') as f:
|
||||||
|
for line in f:
|
||||||
|
if line.startswith("back"):
|
||||||
|
back_num += int(line.split(' ')[1])
|
||||||
|
elif line.startswith("end"):
|
||||||
|
end_num += int(line.split(' ')[1])
|
||||||
|
|
||||||
|
print(back_num, end_num, end_num / back_num)
|
||||||
Loading…
Reference in New Issue