整合了ert seeding,结果正确,完善了shm
This commit is contained in:
parent
364ee9756e
commit
b50360bb48
|
|
@ -21,7 +21,8 @@
|
|||
"~/data/fastq/dataset/na12878_wes_144/1w_1.fq",
|
||||
"~/data/fastq/dataset/na12878_wes_144/1w_2.fq",
|
||||
"-o",
|
||||
"/dev/null"
|
||||
"/dev/null",
|
||||
"-Z"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
},
|
||||
|
|
@ -58,6 +59,19 @@
|
|||
"program": "${workspaceRoot}/fastbwa",
|
||||
"args": [
|
||||
"shm",
|
||||
"-Z",
|
||||
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
},
|
||||
{
|
||||
"name": "pac to bref",
|
||||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/fastbwa",
|
||||
"args": [
|
||||
"pac2bref",
|
||||
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
|
|
|
|||
|
|
@ -40,6 +40,8 @@
|
|||
"bntseq.h": "c",
|
||||
"inttypes.h": "c",
|
||||
"ertindex.h": "c",
|
||||
"ertseeding.h": "c"
|
||||
"ertseeding.h": "c",
|
||||
"algorithm": "c",
|
||||
"filesystem": "c"
|
||||
}
|
||||
}
|
||||
2
Makefile
2
Makefile
|
|
@ -1,5 +1,5 @@
|
|||
CC= gcc
|
||||
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2
|
||||
CFLAGS= -g -Wall -Wno-unused-function -mavx2 #-O2
|
||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||
|
||||
SHOW_PERF= -DSHOW_PERF
|
||||
|
|
|
|||
33
bwa.c
33
bwa.c
|
|
@ -43,7 +43,7 @@
|
|||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
int bwa_verbose = 4;
|
||||
int bwa_verbose = 3;
|
||||
int bwa_dbg = 0;
|
||||
char bwa_rg_id[256];
|
||||
char *bwa_pg;
|
||||
|
|
@ -514,9 +514,11 @@ bwaidx_t *bwa_ertidx_load_from_disk(const char *hint)
|
|||
{
|
||||
char kmer_tbl_file_name[PATH_MAX];
|
||||
char ml_tbl_file_name[PATH_MAX];
|
||||
char bref_file_name[PATH_MAX];
|
||||
sprintf(kmer_tbl_file_name, "%s.ert.kmer.table", prefix);
|
||||
sprintf(ml_tbl_file_name, "%s.ert.mlt.table", prefix);
|
||||
FILE *kmer_tbl_fd, *ml_tbl_fd;
|
||||
sprintf(bref_file_name, "%s.ert.0123", prefix);
|
||||
FILE *kmer_tbl_fd, *ml_tbl_fd, *bref_fd;
|
||||
kmer_tbl_fd = fopen(kmer_tbl_file_name, "rb");
|
||||
if (kmer_tbl_fd == NULL) {
|
||||
fprintf(stderr, "[M::%s::ERT] Can't open k-mer index\n.", __func__);
|
||||
|
|
@ -558,10 +560,27 @@ bwaidx_t *bwa_ertidx_load_from_disk(const char *hint)
|
|||
fprintf(stderr, "[M::%s::ERT] Reading multi-level tree index to memory\n", __func__);
|
||||
}
|
||||
err_fread_noeof(idx->ert->mlt_table, sizeof(uint8_t), size, ml_tbl_fd);
|
||||
|
||||
// Read binary ref
|
||||
bref_fd = fopen(bref_file_name, "rb");
|
||||
if (bref_fd == NULL) {
|
||||
fprintf(stderr, "[M::%s::ERT] Can't open binary reference file\n.", __func__);
|
||||
exit(1);
|
||||
}
|
||||
fseek(bref_fd, 0L, SEEK_END);
|
||||
idx->ert->bref_size = ftell(bref_fd);
|
||||
if (bwa_verbose >= 3) {
|
||||
fprintf(stderr, "[M::%s::ERT] Reading binary reference to memory\n", __func__);
|
||||
}
|
||||
allocMem += idx->ert->bref_size;
|
||||
idx->ert->bref = (uint8_t *)malloc(idx->ert->bref_size * sizeof(uint8_t));
|
||||
fseek(bref_fd, 0L, SEEK_SET);
|
||||
err_fread_noeof(idx->ert->bref, sizeof(uint8_t), idx->ert->bref_size, bref_fd);
|
||||
|
||||
fclose(kmer_tbl_fd);
|
||||
fclose(ml_tbl_fd);
|
||||
if (bwa_verbose >= 3)
|
||||
{
|
||||
fclose(bref_fd);
|
||||
if (bwa_verbose >= 3) {
|
||||
fprintf(stderr, "[M::%s::ERT] Index tables loaded in %.3f CPU sec, %.3f real sec...\n", __func__, cputime() - ctime, realtime() - rtime);
|
||||
}
|
||||
}
|
||||
|
|
@ -713,6 +732,7 @@ int bwa_mem2ertidx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
|
|||
x = sizeof(ERT); idx->ert = malloc(x); memcpy(idx->ert, mem + k, x); k += x;
|
||||
x = idx->ert->mlt_size; idx->ert->mlt_table = (uint8_t *)(mem + k); k += x;
|
||||
x = idx->ert->kmer_size; idx->ert->kmer_offsets = (uint64_t *)(mem + k); k += x;
|
||||
// x = idx->ert->bref_size; idx->ert->bref = (uint8_t *)(mem + k); k += x;
|
||||
|
||||
// generate idx->bns and idx->pac
|
||||
mem_to_bnspac(idx, &mem, &k);
|
||||
|
|
@ -774,9 +794,14 @@ int bwa_ertidx2mem(bwaidx_t *idx)
|
|||
x = idx->ert->kmer_size;
|
||||
mem = realloc(mem, k + x); memcpy(mem + k, idx->ert->kmer_offsets, x); k += x;
|
||||
free(idx->ert->kmer_offsets); idx->ert->kmer_offsets = 0;
|
||||
|
||||
x = idx->ert->bref_size;
|
||||
mem = realloc(mem, k + x); memcpy(mem + k, idx->ert->bref, x); k += x;
|
||||
free(idx->ert->bref); idx->ert->bref = 0;
|
||||
free(idx->ert);
|
||||
idx->ert = 0;
|
||||
|
||||
|
||||
// copy idx->bns
|
||||
move_bns_to_mem(idx, &mem, &k);
|
||||
// copy idx->pac
|
||||
|
|
|
|||
2
bwa.h
2
bwa.h
|
|
@ -51,8 +51,10 @@
|
|||
typedef struct {
|
||||
uint64_t *kmer_offsets; // ert kmer
|
||||
uint8_t *mlt_table; // ert mlt
|
||||
uint8_t *bref; // binary ref
|
||||
uint64_t kmer_size;
|
||||
uint64_t mlt_size;
|
||||
uint64_t bref_size;
|
||||
} ERT;
|
||||
|
||||
typedef struct {
|
||||
|
|
|
|||
270
bwamem.c
270
bwamem.c
|
|
@ -45,6 +45,7 @@
|
|||
#include "fmt_idx.h"
|
||||
#include "profiling.h"
|
||||
#include "debug.h"
|
||||
#include "ertseeding.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
|
|
@ -75,6 +76,9 @@
|
|||
|
||||
static const bntseq_t *global_bns = 0; // for debugging only
|
||||
|
||||
#define smem_lt_2(a, b) ((a).start == (b).start ? (a).end < (b).end : (a).start < (b).start)
|
||||
KSORT_INIT(mem_smem_sort_lt, mem_t, smem_lt_2)
|
||||
|
||||
mem_opt_t *mem_opt_init()
|
||||
{
|
||||
mem_opt_t *o;
|
||||
|
|
@ -1264,11 +1268,11 @@ static void smem_aux_destroy(smem_aux_t *a)
|
|||
}
|
||||
|
||||
// 初始化线程需要的数据
|
||||
mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac)
|
||||
mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const ERT *ert, const bntseq_t *bns, const uint8_t *pac, int useERT)
|
||||
{
|
||||
int i = opt->n_threads, j;
|
||||
mem_worker_t *w = calloc(1, sizeof(mem_worker_t));
|
||||
w->opt = opt; w->bns = bns; w->pac = pac; w->fmt = fmt;
|
||||
w->opt = opt; w->bns = bns; w->pac = pac; w->fmt = fmt; w->ert = ert; w->useERT = useERT;
|
||||
w->calc_isize = 0; w->n = 0; w->regs = 0;
|
||||
w->aux = malloc(i * sizeof(smem_aux_t));
|
||||
w->smem_arr = malloc(i * sizeof(smem_v *));
|
||||
|
|
@ -1509,7 +1513,6 @@ void mem_core_process(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t
|
|||
bseq1_t *seq_arr, int nseq, smem_aux_t *aux, smem_v *smem_arr, mem_chain_v *chain_arr, mem_alnreg_v *reg_arr,
|
||||
int calc_isize, int64_t l_pac, uint64_v *isize, int tid)
|
||||
{
|
||||
|
||||
int i, j, l_seq;
|
||||
mem_chain_v *chnp;
|
||||
mem_alnreg_v *regp;
|
||||
|
|
@ -1602,13 +1605,270 @@ void mem_core_process(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void ert_generate_chain(const mem_opt_t *opt, const bntseq_t *bns, int len, const uint8_t *seq,
|
||||
mem_v *smems, mem_chain_v *chain, u64v *hits, int tid)
|
||||
{
|
||||
int i, b = 0, e = 0, l_rep = 0;
|
||||
int64_t l_pac = bns->l_pac;
|
||||
kbtree_t(chn) * tree;
|
||||
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
|
||||
tree = kb_init(chn, KB_DEFAULT_SIZE + 8); // +8, due to addition of counters in chain
|
||||
for (i = 0, b = e = l_rep = 0; i < smems->n; ++i) { // compute frac_rep
|
||||
mem_t *p = &smems->a[i];
|
||||
int sb = p->start, se = p->end;
|
||||
if (p->hitcount <= opt->max_occ) continue;
|
||||
if (sb > e) l_rep += e - b, b = sb, e = se;
|
||||
else e = e > se? e : se;
|
||||
}
|
||||
l_rep += e - b;
|
||||
|
||||
for (i = 0; i < smems->n; ++i) {
|
||||
mem_t *p = &smems->a[i];
|
||||
int step, count, slen = p->end - p->start; // seed length
|
||||
int64_t k;
|
||||
step = p->hitcount > opt->max_occ ? p->hitcount / opt->max_occ : 1;
|
||||
for (k = count = 0; k < p->hitcount && count < opt->max_occ; k += step, ++count) {
|
||||
mem_chain_t tmp, *lower, *upper;
|
||||
mem_seed_t s;
|
||||
int rid, to_add = 0;
|
||||
if (p->forward || p->fetch_leaves) {
|
||||
s.rbeg = tmp.pos = hits->a[p->hitbeg + k]; // this is the base coordinate in the forward-reverse reference
|
||||
}
|
||||
else {
|
||||
// Hit was obtained by backward search and corresponds to location of reverse complemented SMEM
|
||||
// Add correction to hit position to get locations of SMEM.
|
||||
s.rbeg = tmp.pos = (bns->l_pac << 1) - (hits->a[p->hitbeg + k] + slen - p->end_correction);
|
||||
}
|
||||
s.qbeg = p->start;
|
||||
s.len = p->end - p->start;
|
||||
s.score = s.len;
|
||||
rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len);
|
||||
if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary; TODO: split the seed; don't discard it!!!
|
||||
if (kb_size(tree)) {
|
||||
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
|
||||
if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1;
|
||||
}
|
||||
else to_add = 1;
|
||||
if (to_add) { // add the seed as a new chain
|
||||
tmp.n = 1;
|
||||
tmp.m = 4;
|
||||
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
|
||||
tmp.seeds[0] = s;
|
||||
tmp.rid = rid;
|
||||
tmp.is_alt = !!bns->anns[rid].is_alt;
|
||||
kb_putp(chn, tree, &tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (chain->m < kb_size(tree)) {
|
||||
kv_resize(mem_chain_t, *chain, kb_size(tree));
|
||||
}
|
||||
|
||||
#define traverse_func(p_) (chain->a[chain->n++] = *(p_))
|
||||
__kb_traverse(mem_chain_t, tree, traverse_func);
|
||||
#undef traverse_func
|
||||
|
||||
for (i = 0; i < chain->n; ++i) chain->a[i].frac_rep = (float)l_rep / len;
|
||||
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
|
||||
kb_destroy(chn, tree);
|
||||
}
|
||||
|
||||
void ert_smem_chain(const mem_opt_t *opt, const ERT *ert, const bntseq_t *bns, const uint8_t *pac,
|
||||
int len, const uint8_t *seq, smem_aux_t *aux, mem_chain_v *chn, int tid)
|
||||
{
|
||||
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
|
||||
if (len > ERT_MAX_READ_LEN)
|
||||
{
|
||||
fprintf(stderr, "Your dataset has reads with length %d. But the ERT index built was for reads with length <= %d. "
|
||||
"Please set ERT_MAX_READ_LEN to the maximum read length in your dataset and rebuild the ERT index. "
|
||||
"Default ERT index supports mapping of reads with length <= %d bp\n", len, ERT_MAX_READ_LEN, ERT_MAX_READ_LEN);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
mem_v *smems = calloc(1, sizeof(mem_v)); kv_init(*smems);
|
||||
u64v *hits = calloc(1, sizeof(u64v)); kv_init(*hits);
|
||||
// int64_t seedBufSize = 256;
|
||||
// mem_seed_t *seedBuf = malloc(sizeof(mem_seed_t) * seedBufSize);
|
||||
// int64_t seedBufCount = 0;
|
||||
|
||||
int hasN = 0;
|
||||
int i;
|
||||
uint8_t unpacked_rc_queue_buf[ERT_MAX_READ_LEN];
|
||||
|
||||
|
||||
for (i = 0; i < len; ++i) {
|
||||
if (seq[i] < 4) {
|
||||
unpacked_rc_queue_buf[len - i - 1] = 3 - seq[i];
|
||||
} else {
|
||||
hasN = 1;
|
||||
unpacked_rc_queue_buf[len - i - 1] = 4;
|
||||
}
|
||||
}
|
||||
|
||||
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
|
||||
index_aux_t iaux;
|
||||
iaux.kmer_offsets = ert->kmer_offsets;
|
||||
iaux.mlt_table = ert->mlt_table;
|
||||
iaux.bns = bns;
|
||||
iaux.pac = pac;
|
||||
iaux.ref_string = ert->bref;
|
||||
|
||||
read_aux_t raux;
|
||||
raux.min_seed_len = opt->min_seed_len;
|
||||
raux.l_seq = len;
|
||||
raux.unpacked_queue_buf = (uint8_t *)seq;
|
||||
raux.unpacked_rc_queue_buf = unpacked_rc_queue_buf;
|
||||
|
||||
hits->n = 0;
|
||||
|
||||
// seed-1
|
||||
PROF_START(seed_1);
|
||||
if (hasN) {
|
||||
get_seeds(&iaux, &raux, smems, hits);
|
||||
}
|
||||
else {
|
||||
get_seeds_prefix(&iaux, &raux, smems, hits);
|
||||
}
|
||||
PROF_END(tprof[T_SEED_1][tid], seed_1);
|
||||
// fprintf(stderr, "hits: %ld\n", hits->n);
|
||||
|
||||
// seed-2
|
||||
PROF_START(seed_2);
|
||||
int old_n = smems->n;
|
||||
for (i = 0; i < old_n; ++i) {
|
||||
int qbeg = smems->a[i].start;
|
||||
int qend = smems->a[i].end;
|
||||
if ((qend - qbeg) < split_len || smems->a[i].hitcount > opt->split_width) {
|
||||
continue;
|
||||
}
|
||||
if (hasN) {
|
||||
reseed(&iaux, &raux, smems,
|
||||
(qbeg + qend) >> 1, smems->a[i].hitcount + 1,
|
||||
&smems->a[i].pt, hits);
|
||||
}
|
||||
else {
|
||||
reseed_prefix(&iaux, &raux, smems,
|
||||
(qbeg + qend) >> 1, smems->a[i].hitcount + 1,
|
||||
&smems->a[i].pt, hits);
|
||||
}
|
||||
}
|
||||
PROF_END(tprof[T_SEED_2][tid], seed_2);
|
||||
|
||||
// seed-3
|
||||
PROF_START(seed_3);
|
||||
last(&iaux, &raux, smems, opt->max_mem_intv, hits);
|
||||
PROF_END(tprof[T_SEED_3][tid], seed_3);
|
||||
|
||||
// sort
|
||||
ks_introsort(mem_smem_sort_lt, smems->n, smems->a);
|
||||
|
||||
// chain
|
||||
PROF_START(chain_all);
|
||||
chn->n = 0;
|
||||
PROF_START(gen_chain);
|
||||
ert_generate_chain(opt, bns, len, seq,
|
||||
smems, chn, hits, tid);
|
||||
PROF_END(tprof[T_GEN_CHAIN][tid], gen_chain);
|
||||
PROF_START(flt_chain);
|
||||
chn->n = mem_chain_flt(opt, chn->n, chn->a);
|
||||
PROF_END(tprof[T_FLT_CHAIN][tid], flt_chain);
|
||||
PROF_START(flt_chained_seeds);
|
||||
mem_flt_chained_seeds(opt, bns, pac, len, (uint8_t *)seq, chn->n, chn->a, aux->seq_buf);
|
||||
PROF_END(tprof[T_FLT_CHANNED_SEEDS][tid], flt_chained_seeds);
|
||||
PROF_END(tprof[T_CHAIN_ALL][tid], chain_all);
|
||||
kv_destroy(*smems); free(smems);
|
||||
kv_destroy(*hits); free(hits);
|
||||
}
|
||||
|
||||
// 使用ERT作为seeding阶段索引
|
||||
void ert_mem_core_process(const mem_opt_t *opt, const ERT *ert, const bntseq_t *bns, const uint8_t *pac,
|
||||
bseq1_t *seq_arr, int nseq, smem_aux_t *aux, smem_v *smem_arr, mem_chain_v *chain_arr, mem_alnreg_v *reg_arr,
|
||||
int calc_isize, int64_t l_pac, uint64_v *isize, int tid)
|
||||
{
|
||||
int i, j, l_seq;
|
||||
mem_chain_v *chnp;
|
||||
mem_alnreg_v *regp;
|
||||
char *seq;
|
||||
|
||||
// 1. seeding and 2. chain
|
||||
PROF_START(seed_all);
|
||||
for (i = 0; i < nseq; ++i) {
|
||||
seq = seq_arr[i].seq;
|
||||
l_seq = seq_arr[i].l_seq;
|
||||
for (j = 0; j < l_seq; ++j) {
|
||||
seq[j] = seq[j] < 4 ? seq[j] : nst_nt4_table[(int)seq[j]];
|
||||
}
|
||||
ert_smem_chain(opt, ert, bns, pac, l_seq, (uint8_t *)seq, aux, &chain_arr[i], tid);
|
||||
}
|
||||
PROF_END(tprof[T_SEED_ALL][tid], seed_all);
|
||||
|
||||
// 3. align
|
||||
PROF_START(aln_all);
|
||||
for (i=0; i<nseq; ++i) {
|
||||
seq = seq_arr[i].seq;
|
||||
l_seq = seq_arr[i].l_seq;
|
||||
chnp = chain_arr + i;
|
||||
regp = reg_arr + i;
|
||||
kv_init(*regp);
|
||||
|
||||
for(j=0; j<chnp->n; ++j) {
|
||||
mem_chain_t *p = &chnp->a[j];
|
||||
if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", j);
|
||||
mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, regp, aux, tid);
|
||||
free(chnp->a[j].seeds);
|
||||
}
|
||||
|
||||
free(chnp->a); chnp->m = 0; chnp->a = 0;
|
||||
regp->n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t *)seq, regp->n, regp->a);
|
||||
if (bwa_verbose >= 4) {
|
||||
err_printf("* %ld chains remain after removing duplicated chains\n", regp->n);
|
||||
for (j = 0; j < regp->n; ++j) {
|
||||
mem_alnreg_t *p = ®p->a[j];
|
||||
printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
|
||||
}
|
||||
}
|
||||
for (j = 0; j < regp->n; ++j) {
|
||||
mem_alnreg_t *p = ®p->a[j];
|
||||
if (p->rid >= 0 && bns->anns[p->rid].is_alt)
|
||||
p->is_alt = 1;
|
||||
}
|
||||
}
|
||||
PROF_END(tprof[T_ALN_ALL][tid], aln_all);
|
||||
|
||||
#if 1
|
||||
// 4. calc insert size
|
||||
|
||||
#define MIN_RATIO 0.8
|
||||
if (calc_isize) {
|
||||
PROF_START(ins_size);
|
||||
for (i = 0; i < nseq>>1; ++i) {
|
||||
int dir;
|
||||
int64_t is;
|
||||
mem_alnreg_v *r[2];
|
||||
r[0] = (mem_alnreg_v*)®_arr[i<<1|0];
|
||||
r[1] = (mem_alnreg_v*)®_arr[i<<1|1];
|
||||
if (r[0]->n == 0 || r[1]->n == 0) continue;
|
||||
if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
|
||||
if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
|
||||
if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr
|
||||
dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
|
||||
if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
|
||||
}
|
||||
PROF_END(tprof[T_INS_SIZE][tid], ins_size);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// 找smem,生成chain,然后align
|
||||
static void worker_smem_align(void *data, int i, int tid)
|
||||
{
|
||||
mem_worker_t *w = (mem_worker_t *)data;
|
||||
int start = i * w->opt->batch_size;
|
||||
int end = MIN(start + w->opt->batch_size, w->n_reads);
|
||||
mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid);
|
||||
if (w->useERT)
|
||||
ert_mem_core_process(w->opt, w->ert, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid);
|
||||
else
|
||||
mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid);
|
||||
}
|
||||
|
||||
void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, seq_sam_t *sams)
|
||||
|
|
@ -1631,7 +1891,7 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed
|
|||
w->n_reads = n; w->pes = &pes[0];
|
||||
|
||||
#if 1
|
||||
if ((opt->flag & MEM_F_PE) && !pes0) { // infer insert sizes if not provided
|
||||
if ((opt->flag & MEM_F_PE) && !pes0) { // infer insert sizes if not provided
|
||||
int i, j;
|
||||
w->calc_isize = 1;
|
||||
for (i = 0; i < opt->n_threads; ++i)
|
||||
|
|
|
|||
4
bwamem.h
4
bwamem.h
|
|
@ -157,6 +157,7 @@ typedef struct {
|
|||
const mem_opt_t *opt;
|
||||
const bwt_t *bwt;
|
||||
const FMTIndex *fmt;
|
||||
const ERT *ert;
|
||||
const bntseq_t *bns;
|
||||
const uint8_t *pac;
|
||||
const mem_pestat_t *pes;
|
||||
|
|
@ -170,12 +171,13 @@ typedef struct {
|
|||
int64_t n_processed;
|
||||
int64_t n;
|
||||
int64_t n_reads;
|
||||
int useERT;
|
||||
} mem_worker_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac);
|
||||
mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const ERT *ert, const bntseq_t *bns, const uint8_t *pac, int useERT);
|
||||
|
||||
smem_i *smem_itr_init(const bwt_t *bwt);
|
||||
void smem_itr_destroy(smem_i *itr);
|
||||
|
|
|
|||
297
bwashm.c
297
bwashm.c
|
|
@ -8,15 +8,16 @@
|
|||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include "bwa.h"
|
||||
#include "ertindex.h"
|
||||
|
||||
int bwa_shm_stage(bwaidx_t *idx, const char *hint, int useERT, const char *_tmpfn)
|
||||
int bwa_shm_stage(bwaidx_t *idx, const char *hint, int useERT)
|
||||
{
|
||||
|
||||
const char *name;
|
||||
uint8_t *shm, *shm_idx;
|
||||
uint16_t *cnt;
|
||||
int shmid, to_init = 0, l;
|
||||
char path[PATH_MAX + 1], *tmpfn = (char*)_tmpfn;
|
||||
char path[PATH_MAX + 1], *tmpfn = 0;
|
||||
|
||||
if (hint == 0 || hint[0] == 0) return -1;
|
||||
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
|
||||
|
|
@ -121,9 +122,16 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint, int useERT, const char *_tmpf
|
|||
bwaidx_t *bwa_ertidx_load_from_shm(const char *hint_)
|
||||
{
|
||||
char hint[PATH_MAX];
|
||||
char fn[PATH_MAX];
|
||||
FILE *fp;
|
||||
sprintf(hint, "%s.ert", hint_);
|
||||
INIT_SHM_LOAD;
|
||||
bwa_mem2ertidx(l_mem, shm_idx, idx);
|
||||
idx->ert->bref = (uint8_t *)malloc(idx->ert->bref_size * sizeof(uint8_t));
|
||||
sprintf(fn, "%s.ert.0123", hint_);
|
||||
fp = xopen(fn, "rb");
|
||||
err_fread_noeof(idx->ert->bref, sizeof(uint8_t), idx->ert->bref_size, fp);
|
||||
err_fclose(fp);
|
||||
idx->is_shm = 1;
|
||||
return idx;
|
||||
}
|
||||
|
|
@ -228,23 +236,283 @@ int bwa_shm_destroy(void)
|
|||
return 0;
|
||||
}
|
||||
|
||||
int bwa_shm_stage_ert(const char *idx_prefix)
|
||||
{
|
||||
const char *name;
|
||||
uint8_t *shm, *mem;
|
||||
uint16_t *cnt;
|
||||
int shmid, to_init = 0, l;
|
||||
char path[PATH_MAX], ert_prefix[PATH_MAX];
|
||||
int64_t l_mem = 0, x = 0, k = 0;
|
||||
char fn[PATH_MAX];
|
||||
FILE *fp;
|
||||
ERT ert;
|
||||
bntseq_t *bns;
|
||||
int i;
|
||||
|
||||
// clac l_mem
|
||||
// ert
|
||||
x = sizeof(ERT); l_mem += x;
|
||||
sprintf(fn, "%s.ert.mlt.table", idx_prefix);
|
||||
fp = xopen(fn, "rb");
|
||||
err_fseek(fp, 0L, SEEK_END);
|
||||
ert.mlt_size = ftell(fp);
|
||||
err_fclose(fp);
|
||||
// multi-level tree
|
||||
x = ert.mlt_size; l_mem += x;
|
||||
// kmer entry
|
||||
ert.kmer_size = numKmers * sizeof(uint64_t);
|
||||
x = ert.kmer_size; l_mem += x;
|
||||
// binary ref
|
||||
sprintf(fn, "%s.ert.0123", idx_prefix);
|
||||
fp = xopen(fn, "rb");
|
||||
fseek(fp, 0L, SEEK_END);
|
||||
ert.bref_size = ftell(fp);
|
||||
// x = ert.bref_size; l_mem += x;
|
||||
err_fclose(fp);
|
||||
// bns
|
||||
x = sizeof(bntseq_t); l_mem += x;
|
||||
bns = bns_restore(idx_prefix);
|
||||
x = bns->n_holes * sizeof(bntamb1_t); l_mem += x;
|
||||
x = bns->n_seqs * sizeof(bntann1_t); l_mem += x;
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
x = strlen(bns->anns[i].name) + 1; l_mem += x;
|
||||
x = strlen(bns->anns[i].anno) + 1; l_mem += x;
|
||||
}
|
||||
// pac
|
||||
x = bns->l_pac / 4 + 1; l_mem += x;
|
||||
|
||||
fprintf(stderr, "l_mem: %ld\n", l_mem);
|
||||
|
||||
sprintf(ert_prefix, "%s.ert", idx_prefix);
|
||||
for (name = ert_prefix + strlen(ert_prefix) - 1; name >= ert_prefix && *name != '/'; --name);
|
||||
++name;
|
||||
if ((shmid = shm_open("/bwactl", O_RDWR, 0)) < 0) {
|
||||
shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
|
||||
to_init = 1;
|
||||
}
|
||||
if (shmid < 0) return -1;
|
||||
if (ftruncate(shmid, BWA_CTL_SIZE) < 0) return -1;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
if (to_init) {
|
||||
memset(shm, 0, BWA_CTL_SIZE);
|
||||
cnt[1] = 4;
|
||||
}
|
||||
strcat(strcpy(path, "/bwaidx-"), name);
|
||||
if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
|
||||
shm_unlink(path);
|
||||
perror("shm_open()");
|
||||
return -1;
|
||||
}
|
||||
l = 8 + strlen(name) + 1;
|
||||
if (cnt[1] + l > BWA_CTL_SIZE) return -1;
|
||||
memcpy(shm + cnt[1], &l_mem, 8);
|
||||
memcpy(shm + cnt[1] + 8, name, l - 8);
|
||||
cnt[1] += l; ++cnt[0];
|
||||
if (ftruncate(shmid, l_mem) < 0) return -1;
|
||||
mem = mmap(0, l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
|
||||
|
||||
// write to share mem
|
||||
// ert
|
||||
x = sizeof(ERT);
|
||||
memcpy(mem, &ert, x); k = x;
|
||||
// multi-level tree
|
||||
sprintf(fn, "%s.ert.mlt.table", idx_prefix);
|
||||
fp = xopen(fn, "rb");
|
||||
x = ert.mlt_size;
|
||||
fread_fix(fp, x, mem + k); k += x;
|
||||
err_fclose(fp);
|
||||
// kmer entry
|
||||
sprintf(fn, "%s.ert.kmer.table", idx_prefix);
|
||||
fp = xopen(fn, "rb");
|
||||
x = ert.kmer_size;
|
||||
fread_fix(fp, x, mem + k); k += x;
|
||||
err_fclose(fp);
|
||||
// binary ref
|
||||
// sprintf(fn, "%s.ert.0123", idx_prefix);
|
||||
// fp = xopen(fn, "rb");
|
||||
// x = ert.bref_size;
|
||||
// fread_fix(fp, x, mem + k); k += x;
|
||||
// err_fclose(fp);
|
||||
// bns
|
||||
x = sizeof(bntseq_t); memcpy(mem + k, bns, x); k += x;
|
||||
x = bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, bns->ambs, x); k += x;
|
||||
x = bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, bns->anns, x); k += x;
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
x = strlen(bns->anns[i].name) + 1; memcpy(mem + k, bns->anns[i].name, x); k += x;
|
||||
x = strlen(bns->anns[i].anno) + 1; memcpy(mem + k, bns->anns[i].anno, x); k += x;
|
||||
}
|
||||
// pac
|
||||
x = bns->l_pac / 4 + 1;
|
||||
err_fread_noeof(mem + k, 1, x, bns->fp_pac); k += x;// concatenated 2-bit encoded sequence
|
||||
err_fclose(bns->fp_pac);
|
||||
|
||||
fprintf(stderr, "k: %ld\n", k);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_shm_stage_fmt(const char *idx_prefix)
|
||||
{
|
||||
const char *name;
|
||||
uint8_t *shm, *mem;
|
||||
uint16_t *cnt;
|
||||
int shmid, to_init = 0, l;
|
||||
char path[PATH_MAX];
|
||||
int64_t l_mem = 0, x = 0, k = 0;
|
||||
char fn[PATH_MAX];
|
||||
FILE *fp;
|
||||
FMTIndex fmt;
|
||||
bntseq_t *bns;
|
||||
int i;
|
||||
// clac l_mem
|
||||
// fmt
|
||||
x = sizeof(FMTIndex); l_mem += x;
|
||||
sprintf(fn, "%s.fmt", idx_prefix);
|
||||
fp = xopen(fn, "rb");
|
||||
err_fseek(fp, 0, SEEK_END);
|
||||
x = ftell(fp) - sizeof(bwtint_t) * 6 - 3; l_mem += x;
|
||||
fmt.bwt_size = x >> 2;
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
err_fread_noeof(&fmt.primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(&fmt.sec_primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(&fmt.sec_bcp, sizeof(uint8_t), 1, fp);
|
||||
err_fread_noeof(&fmt.first_base, sizeof(uint8_t), 1, fp);
|
||||
err_fread_noeof(&fmt.last_base, sizeof(uint8_t), 1, fp);
|
||||
err_fread_noeof(fmt.L2 + 1, sizeof(bwtint_t), 4, fp);
|
||||
fmt.seq_len = fmt.L2[4];
|
||||
fmt_gen_cnt_occ(&fmt);
|
||||
err_fclose(fp);
|
||||
// sa
|
||||
sprintf(fn, "%s.bytesa", idx_prefix);
|
||||
fp = xopen(fn, "rb");
|
||||
err_fseek(fp, sizeof(bwtint_t) * 5, SEEK_SET);
|
||||
err_fread_noeof(&fmt.sa_intv, sizeof(bwtint_t), 1, fp);
|
||||
fmt.n_sa = (fmt.seq_len + fmt.sa_intv) / fmt.sa_intv;
|
||||
x = SA_BYTES(fmt.n_sa); l_mem += x;
|
||||
err_fclose(fp);
|
||||
// kmaer hash
|
||||
x = (1 << (10 << 1)) * sizeof(KmerEntryArr)
|
||||
+ (1 << (11 << 1)) * sizeof(KmerEntry)
|
||||
+ (1 << (12 << 1)) * sizeof(KmerEntry);
|
||||
#if HASH_KMER_LEN > 12
|
||||
x += (1 << (13 << 1)) * sizeof(KmerEntry);
|
||||
#endif
|
||||
#if HASH_KMER_LEN > 13
|
||||
x += (1 << (14 << 1)) * sizeof(KmerEntry);
|
||||
#endif
|
||||
l_mem += x;
|
||||
// bns
|
||||
x = sizeof(bntseq_t); l_mem += x;
|
||||
bns = bns_restore(idx_prefix);
|
||||
x = bns->n_holes * sizeof(bntamb1_t); l_mem += x;
|
||||
x = bns->n_seqs * sizeof(bntann1_t); l_mem += x;
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
x = strlen(bns->anns[i].name) + 1; l_mem += x;
|
||||
x = strlen(bns->anns[i].anno) + 1; l_mem += x;
|
||||
}
|
||||
// pac
|
||||
x = bns->l_pac / 4 + 1; l_mem += x;
|
||||
fmt.l_pac = bns->l_pac;
|
||||
|
||||
fprintf(stderr, "l_mem: %ld\n", l_mem);
|
||||
|
||||
for (name = idx_prefix + strlen(idx_prefix) - 1; name >= idx_prefix && *name != '/'; --name) ;
|
||||
++name;
|
||||
if ((shmid = shm_open("/bwactl", O_RDWR, 0)) < 0) {
|
||||
shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
|
||||
to_init = 1;
|
||||
}
|
||||
if (shmid < 0) return -1;
|
||||
if (ftruncate(shmid, BWA_CTL_SIZE) < 0) return -1;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
if (to_init) {
|
||||
memset(shm, 0, BWA_CTL_SIZE);
|
||||
cnt[1] = 4;
|
||||
}
|
||||
strcat(strcpy(path, "/bwaidx-"), name);
|
||||
if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
|
||||
shm_unlink(path);
|
||||
perror("shm_open()");
|
||||
return -1;
|
||||
}
|
||||
l = 8 + strlen(name) + 1;
|
||||
if (cnt[1] + l > BWA_CTL_SIZE) return -1;
|
||||
memcpy(shm + cnt[1], &l_mem, 8);
|
||||
memcpy(shm + cnt[1] + 8, name, l - 8);
|
||||
cnt[1] += l; ++cnt[0];
|
||||
if (ftruncate(shmid, l_mem) < 0) return -1;
|
||||
mem = mmap(0, l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
|
||||
|
||||
// write to share mem
|
||||
|
||||
// fmt
|
||||
x = sizeof(FMTIndex);
|
||||
memcpy(mem, &fmt, x); k = x;
|
||||
sprintf(fn, "%s.fmt", idx_prefix);
|
||||
fp = xopen(fn, "rb");
|
||||
err_fseek(fp, sizeof(bwtint_t) * 6 + 3, SEEK_SET);
|
||||
x = fmt.bwt_size * 4;
|
||||
fread_fix(fp, x, mem + k); k += x;
|
||||
err_fclose(fp);
|
||||
// sa
|
||||
sprintf(fn, "%s.bytesa", idx_prefix);
|
||||
fp = xopen(fn, "rb");
|
||||
err_fseek(fp, sizeof(bwtint_t) * 7, SEEK_SET);
|
||||
x = SA_BYTES(fmt.n_sa);
|
||||
fread_fix(fp, x, mem + k); k += x;
|
||||
err_fclose(fp);
|
||||
// kmer hash
|
||||
sprintf(fn, "%s.kmer", idx_prefix);
|
||||
fp = xopen(fn, "rb");
|
||||
x = (1 << (10 << 1)) * sizeof(KmerEntryArr);
|
||||
fread_fix(fp, x, mem + k); k += x;
|
||||
x = (1 << (11 << 1)) * sizeof(KmerEntry);
|
||||
fread_fix(fp, x, mem + k); k += x;
|
||||
x = (1 << (12 << 1)) * sizeof(KmerEntry);
|
||||
fread_fix(fp, x, mem + k); k += x;
|
||||
#if HASH_KMER_LEN > 12
|
||||
x = (1 << (13 << 1)) * sizeof(KmerEntry);
|
||||
fread_fix(fp, x, mem + k); k += x;
|
||||
#endif
|
||||
#if HASH_KMER_LEN > 13
|
||||
x = (1 << (14 << 1)) * sizeof(KmerEntry);
|
||||
fread_fix(fp, x, mem + k); k += x;
|
||||
#endif
|
||||
err_fclose(fp);
|
||||
// bns
|
||||
x = sizeof(bntseq_t); memcpy(mem + k, bns, x); k += x;
|
||||
x = bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, bns->ambs, x); k += x;
|
||||
x = bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, bns->anns, x); k += x;
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
x = strlen(bns->anns[i].name) + 1; memcpy(mem + k, bns->anns[i].name, x); k += x;
|
||||
x = strlen(bns->anns[i].anno) + 1; memcpy(mem + k, bns->anns[i].anno, x); k += x;
|
||||
}
|
||||
// pac
|
||||
x = bns->l_pac / 4 + 1;
|
||||
err_fread_noeof(mem + k, 1, x, bns->fp_pac); k += x;// concatenated 2-bit encoded sequence
|
||||
err_fclose(bns->fp_pac);
|
||||
|
||||
fprintf(stderr, "k: %ld\n", k);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main_shm(int argc, char *argv[])
|
||||
{
|
||||
int c, to_list = 0, to_drop = 0, ret = 0, useERT = 0;
|
||||
char *tmpfn = 0;
|
||||
char shm_prefix[PATH_MAX];
|
||||
while ((c = getopt(argc, argv, "ldf:Z")) >= 0)
|
||||
while ((c = getopt(argc, argv, "ldZ")) >= 0)
|
||||
{
|
||||
if (c == 'l') to_list = 1;
|
||||
else if (c == 'd') to_drop = 1;
|
||||
else if (c == 'f') tmpfn = optarg;
|
||||
else if (c == 'Z') useERT = 1;
|
||||
}
|
||||
if (optind == argc && !to_list && !to_drop) {
|
||||
fprintf(stderr, "\nUsage: fastbwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
|
||||
fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
|
||||
fprintf(stderr, " -l list names of indices in shared memory\n");
|
||||
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n");
|
||||
fprintf(stderr, " -Z use Ert as seeding index\n\n");
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -259,16 +527,31 @@ int main_shm(int argc, char *argv[])
|
|||
sprintf(shm_prefix, "%s", argv[optind]);
|
||||
if (bwa_shm_test(shm_prefix) == 0)
|
||||
{
|
||||
#if 0
|
||||
bwaidx_t *idx;
|
||||
if (useERT)
|
||||
idx = bwa_ertidx_load_from_disk(argv[optind]);
|
||||
else
|
||||
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_BNS | BWA_IDX_PAC | BWA_IDX_FMT);
|
||||
if (bwa_shm_stage(idx, shm_prefix, useERT, tmpfn) < 0) {
|
||||
if (bwa_shm_stage(idx, shm_prefix, useERT) < 0) {
|
||||
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
|
||||
ret = 1;
|
||||
}
|
||||
bwa_idx_destroy(idx);
|
||||
#else
|
||||
if (useERT) {
|
||||
if (bwa_shm_stage_ert(argv[optind]) < 0) {
|
||||
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
|
||||
ret = 1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (bwa_shm_stage_fmt(argv[optind]) < 0) {
|
||||
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
|
||||
ret = 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
|
||||
|
|
|
|||
51
bwtindex.c
51
bwtindex.c
|
|
@ -146,6 +146,57 @@ int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt
|
|||
return 0;
|
||||
}
|
||||
|
||||
int bwa_pac2bref(int argc, char *argv[])
|
||||
{
|
||||
FILE *fp_pac, *fp_bref;
|
||||
uint64_t pac_size, pac_len, seq_len, bref_size;
|
||||
int64_t i;
|
||||
uint8_t *pac_buf, *bref_buf;
|
||||
uint8_t c;
|
||||
char pac_file_name[PATH_MAX];
|
||||
char binary_ref_name[PATH_MAX];
|
||||
if (optind + 1 > argc)
|
||||
{
|
||||
fprintf(stderr, "Usage: fastbwa pac2ref <index_prefix>\n");
|
||||
return 1;
|
||||
}
|
||||
sprintf(pac_file_name, "%s.pac", argv[optind]);
|
||||
sprintf(binary_ref_name, "%s.ert.0123", argv[optind]);
|
||||
|
||||
fp_pac = xopen(pac_file_name, "rb");
|
||||
err_fseek(fp_pac, -1, SEEK_END);
|
||||
pac_len = err_ftell(fp_pac);
|
||||
err_fread_noeof(&c, 1, 1, fp_pac);
|
||||
seq_len = (pac_len - 1) * 4 + (int)c;
|
||||
|
||||
err_fseek(fp_pac, 0, SEEK_SET);
|
||||
pac_size = (seq_len >> 2) + ((seq_len & 3) == 0 ? 0 : 1);
|
||||
pac_buf = (uint8_t *)calloc(pac_size, 1);
|
||||
err_fread_noeof(pac_buf, 1, pac_size, fp_pac);
|
||||
err_fclose(fp_pac);
|
||||
|
||||
bref_size = seq_len * 2;
|
||||
bref_buf = (uint8_t *)malloc(bref_size);
|
||||
for (i = 0; i < seq_len; ++i) {
|
||||
int nt = pac_buf[i >> 2] >> ((3 - (i & 3)) << 1) & 3;
|
||||
bref_buf[i] = nt;
|
||||
}
|
||||
|
||||
for (i = seq_len - 1; i >= 0; --i) {
|
||||
int64_t j = bref_size - i - 1;
|
||||
bref_buf[j] = 3 - bref_buf[i];
|
||||
}
|
||||
|
||||
fp_bref = xopen(binary_ref_name, "wb");
|
||||
err_fwrite(bref_buf, sizeof(uint8_t), bref_size, fp_bref);
|
||||
|
||||
free(bref_buf);
|
||||
free(pac_buf);
|
||||
err_fclose(fp_bref);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
||||
|
||||
void bwt_bwtupdate_core(bwt_t *bwt)
|
||||
|
|
|
|||
|
|
@ -3031,7 +3031,7 @@ void get_seeds_prefix(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, u64v* h
|
|||
sh.prev_prev_pivot = sh.prev_pivot;
|
||||
sh.prev_pivot = rm.start;
|
||||
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||||
break; // zzh
|
||||
// break; // zzh
|
||||
}
|
||||
#ifdef PRINT_SMEM
|
||||
ks_introsort(mem_smem_sort_lt_ert, smems->n, smems->a); // Sort SMEMs based on start pos in read. For DEBUG.
|
||||
|
|
@ -3167,7 +3167,7 @@ void get_seeds(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, u64v* hits) {
|
|||
sh.prev_prev_pivot = sh.prev_pivot;
|
||||
sh.prev_pivot = rm.start;
|
||||
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||||
break; // zzh
|
||||
// break; // zzh
|
||||
}
|
||||
#ifdef PRINT_SMEM
|
||||
ks_introsort(mem_smem_sort_lt_ert, smems->n, smems->a); // Sort SMEMs based on start pos in read. For DEBUG.
|
||||
|
|
|
|||
|
|
@ -78,7 +78,6 @@ typedef struct {
|
|||
long read_idx;
|
||||
long calc_idx;
|
||||
long write_idx;
|
||||
int useERT;
|
||||
} ktp_aux_t;
|
||||
|
||||
// read
|
||||
|
|
@ -636,8 +635,7 @@ int main_mem(int argc, char *argv[])
|
|||
}
|
||||
|
||||
//fprintf(stderr, "%ld %ld %ld %ld %ld\n", aux.idx->fmt->L2[0], aux.idx->fmt->L2[1], aux.idx->fmt->L2[2], aux.idx->fmt->L2[3], aux.idx->fmt->L2[4]);
|
||||
aux.useERT = useERT;
|
||||
aux.w = init_mem_worker(opt, aux.idx->fmt, aux.idx->bns, aux.idx->pac);
|
||||
aux.w = init_mem_worker(opt, aux.idx->fmt, aux.idx->ert, aux.idx->bns, aux.idx->pac, useERT);
|
||||
aux.data = calloc(2, sizeof(ktp_data_t));
|
||||
|
||||
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
|
||||
|
|
|
|||
|
|
@ -117,4 +117,5 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k);
|
||||
bwtint_t fmt_sa_offset(const FMTIndex *fmt, bwtint_t k);
|
||||
|
||||
void fmt_gen_cnt_occ(FMTIndex *fmt);
|
||||
#endif
|
||||
8
ksw.c
8
ksw.c
|
|
@ -159,7 +159,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
|
|||
_mm_store_si128(Hmax + i, zero);
|
||||
}
|
||||
// the core loop
|
||||
PROF_START(loop);
|
||||
// PROF_START(loop);
|
||||
for (i = 0; i < tlen; ++i) {
|
||||
int j, k, imax;
|
||||
__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
|
||||
|
|
@ -223,8 +223,8 @@ end_loop16:
|
|||
}
|
||||
S = H1; H1 = H0; H0 = S; // swap H0 and H1
|
||||
}
|
||||
PROF_END(gprof[G_KSW_LOOP], loop);
|
||||
PROF_START(end_loop);
|
||||
// PROF_END(gprof[G_KSW_LOOP], loop);
|
||||
// PROF_START(end_loop);
|
||||
r.score = gmax + q->shift < 255? gmax : 255;
|
||||
r.te = te;
|
||||
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
|
||||
|
|
@ -245,7 +245,7 @@ end_loop16:
|
|||
}
|
||||
}
|
||||
free(b);
|
||||
PROF_END(gprof[G_KSW_END_LOOP], end_loop);
|
||||
// PROF_END(gprof[G_KSW_END_LOOP], end_loop);
|
||||
return r;
|
||||
}
|
||||
|
||||
|
|
|
|||
3
main.c
3
main.c
|
|
@ -35,6 +35,7 @@
|
|||
|
||||
int bwa_fa2pac(int argc, char *argv[]);
|
||||
int bwa_pac2bwt(int argc, char *argv[]);
|
||||
int bwa_pac2bref(int argc, char *argv[]);
|
||||
int bwa_bwtupdate(int argc, char *argv[]);
|
||||
int bwa_bwt2sa(int argc, char *argv[]);
|
||||
int bwa_bwt2bytesa(int argc, char *argv[]);
|
||||
|
|
@ -78,6 +79,7 @@ static int usage()
|
|||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
|
||||
fprintf(stderr, " pac2bref generate binary reference from PAC\n");
|
||||
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
|
||||
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
|
||||
fprintf(stderr, " bwt2bytesa generate SA(using byte array) from BWT and Occ\n");
|
||||
|
|
@ -107,6 +109,7 @@ int main(int argc, char *argv[])
|
|||
if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pac2bref") == 0) ret = bwa_pac2bref(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc-1, argv+1);
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ int display_stats(int nthreads)
|
|||
find_opt(tprof[T_GEN_CHAIN], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_gen_chain: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_FLT_CHAIN], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_fil_chain: %0.2lf s\n", avg);
|
||||
fprintf(stderr, "time_flt_chain: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_FLT_CHANNED_SEEDS], nthreads, &max, &min, &avg);
|
||||
fprintf(stderr, "time_flt_chained_seeds: %0.2lf s\n", avg);
|
||||
find_opt(tprof[T_SAL], nthreads, &max, &min, &avg);
|
||||
|
|
|
|||
19
run.sh
19
run.sh
|
|
@ -1,17 +1,19 @@
|
|||
thread=1
|
||||
thread=64
|
||||
|
||||
## d1 k18<=4 89%
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/1w_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/1w_2.fq
|
||||
n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq
|
||||
n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/45m_r1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz
|
||||
## d2 <= 4 87%
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/na12878_r1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/na12878_r2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq
|
||||
|
|
@ -31,8 +33,8 @@ n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq
|
|||
#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz
|
||||
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq
|
||||
#n_r1=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq
|
||||
#n_r2=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq
|
||||
n_r1=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq
|
||||
n_r2=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq
|
||||
|
||||
reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta
|
||||
#reference=~/reference/bwa/human_g1k_v37_decoy.fasta
|
||||
|
|
@ -40,11 +42,12 @@ reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta
|
|||
|
||||
#out=~/data1/out-u8-1.sam
|
||||
#out=~/data1/out-i16.sam
|
||||
out=~/data1/fmt-out.sam
|
||||
#out=~/data1/fmt-out.sam
|
||||
out=~/data/fastbwa.ert.sam
|
||||
#out=/dev/null
|
||||
|
||||
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
-Z $reference \
|
||||
$reference \
|
||||
$n_r1 \
|
||||
$n_r2 \
|
||||
-o $out -2
|
||||
-o $out -2 -Z
|
||||
|
|
|
|||
Loading…
Reference in New Issue