添加了共享内存支持,成功构建ert索引
This commit is contained in:
parent
9c9502d584
commit
364ee9756e
|
|
@ -49,6 +49,18 @@
|
|||
"~/data/reference/human_g1k_v37_decoy.fasta.kmer"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
},
|
||||
{
|
||||
"name": "share mem",
|
||||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/fastbwa",
|
||||
"args": [
|
||||
"shm",
|
||||
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -35,6 +35,11 @@
|
|||
"bitset": "c",
|
||||
"iterator": "c",
|
||||
"memory": "c",
|
||||
"__locale": "c"
|
||||
"__locale": "c",
|
||||
"stdint.h": "c",
|
||||
"bntseq.h": "c",
|
||||
"inttypes.h": "c",
|
||||
"ertindex.h": "c",
|
||||
"ertseeding.h": "c"
|
||||
}
|
||||
}
|
||||
4
Makefile
4
Makefile
|
|
@ -1,6 +1,4 @@
|
|||
CC= gcc
|
||||
#CC= clang --analyze
|
||||
# CFLAGS= -g -Wall -Wno-unused-function -O2
|
||||
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2
|
||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||
|
||||
|
|
@ -18,7 +16,7 @@ AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
|||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o profiling.o \
|
||||
fmt_idx.o ksw_extend2_avx2.o ksw_extend2_avx2_u8.o \
|
||||
debug.o
|
||||
debug.o ertindex.o ertseeding.o
|
||||
PROG= fastbwa
|
||||
INCLUDES=
|
||||
LIBS= -lm -lz -lpthread -ldl
|
||||
|
|
|
|||
|
|
@ -0,0 +1,3 @@
|
|||
#!/bin/bash
|
||||
time ./fastbwa bwt2ert ~/data1/fmt_ref/human_g1k_v37_decoy.fasta -t 64
|
||||
#time ./fastbwa bwt2ert ~/reference/human_g1k_v37_decoy.fasta -t 64
|
||||
278
bwa.c
278
bwa.c
|
|
@ -30,18 +30,20 @@
|
|||
#include <assert.h>
|
||||
#include <unistd.h>
|
||||
#include <pthread.h>
|
||||
#include <limits.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
#include "utils.h"
|
||||
#include "kstring.h"
|
||||
#include "kvec.h"
|
||||
#include "ertindex.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
int bwa_verbose = 3;
|
||||
int bwa_verbose = 4;
|
||||
int bwa_dbg = 0;
|
||||
char bwa_rg_id[256];
|
||||
char *bwa_pg;
|
||||
|
|
@ -486,6 +488,87 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
|
|||
return fmt;
|
||||
}
|
||||
|
||||
bwaidx_t *bwa_ertidx_load_from_disk(const char *hint)
|
||||
{
|
||||
bwaidx_t *idx = 0;
|
||||
int i, c;
|
||||
const char *prefix = hint;
|
||||
if (prefix == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
idx = calloc(1, sizeof(bwaidx_t));
|
||||
|
||||
idx->bns = bns_restore(prefix);
|
||||
for (i = c = 0; i < idx->bns->n_seqs; ++i)
|
||||
if (idx->bns->anns[i].is_alt) ++c;
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
|
||||
|
||||
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
|
||||
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
||||
err_fclose(idx->bns->fp_pac);
|
||||
idx->bns->fp_pac = 0;
|
||||
|
||||
// load ert
|
||||
{
|
||||
char kmer_tbl_file_name[PATH_MAX];
|
||||
char ml_tbl_file_name[PATH_MAX];
|
||||
sprintf(kmer_tbl_file_name, "%s.ert.kmer.table", prefix);
|
||||
sprintf(ml_tbl_file_name, "%s.ert.mlt.table", prefix);
|
||||
FILE *kmer_tbl_fd, *ml_tbl_fd;
|
||||
kmer_tbl_fd = fopen(kmer_tbl_file_name, "rb");
|
||||
if (kmer_tbl_fd == NULL) {
|
||||
fprintf(stderr, "[M::%s::ERT] Can't open k-mer index\n.", __func__);
|
||||
exit(1);
|
||||
}
|
||||
ml_tbl_fd = fopen(ml_tbl_file_name, "rb");
|
||||
if (ml_tbl_fd == NULL) {
|
||||
fprintf(stderr, "[M::%s::ERT] Can't open multi-level tree index\n.", __func__);
|
||||
exit(1);
|
||||
}
|
||||
double ctime, rtime;
|
||||
ctime = cputime();
|
||||
rtime = realtime();
|
||||
int64_t allocMem = numKmers * 8L;
|
||||
|
||||
//
|
||||
// Read k-mer index
|
||||
//
|
||||
idx->ert = (ERT *)calloc(1, sizeof(ERT));
|
||||
idx->ert->kmer_size = numKmers * sizeof(uint64_t);
|
||||
idx->ert->kmer_offsets = (uint64_t *)malloc(idx->ert->kmer_size);
|
||||
assert(idx->ert->kmer_offsets != NULL);
|
||||
if (bwa_verbose >= 3) {
|
||||
fprintf(stderr, "[M::%s::ERT] Reading kmer index to memory\n", __func__);
|
||||
}
|
||||
err_fread_noeof(idx->ert->kmer_offsets, sizeof(uint64_t), numKmers, kmer_tbl_fd);
|
||||
|
||||
//
|
||||
// Read multi-level tree index
|
||||
//
|
||||
fseek(ml_tbl_fd, 0L, SEEK_END);
|
||||
long size = ftell(ml_tbl_fd);
|
||||
idx->ert->mlt_size = size;
|
||||
allocMem += size;
|
||||
idx->ert->mlt_table = (uint8_t *)malloc(size * sizeof(uint8_t));
|
||||
assert(idx->ert->mlt_table != NULL);
|
||||
fseek(ml_tbl_fd, 0L, SEEK_SET);
|
||||
if (bwa_verbose >= 3) {
|
||||
fprintf(stderr, "[M::%s::ERT] Reading multi-level tree index to memory\n", __func__);
|
||||
}
|
||||
err_fread_noeof(idx->ert->mlt_table, sizeof(uint8_t), size, ml_tbl_fd);
|
||||
fclose(kmer_tbl_fd);
|
||||
fclose(ml_tbl_fd);
|
||||
if (bwa_verbose >= 3)
|
||||
{
|
||||
fprintf(stderr, "[M::%s::ERT] Index tables loaded in %.3f CPU sec, %.3f real sec...\n", __func__, cputime() - ctime, realtime() - rtime);
|
||||
}
|
||||
}
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
||||
{
|
||||
bwaidx_t *idx;
|
||||
|
|
@ -497,9 +580,14 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
|||
}
|
||||
idx = calloc(1, sizeof(bwaidx_t));
|
||||
|
||||
if (which & BWA_IDX_BWT) idx->fmt = bwa_idx_load_fmt(hint);
|
||||
//if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
||||
//idx->bwt->kmer_hash = idx->fmt->kmer_hash;
|
||||
|
||||
if (which & BWA_IDX_FMT) idx->fmt = bwa_idx_load_fmt(hint);
|
||||
if (which & BWA_IDX_BWT)
|
||||
{
|
||||
idx->bwt = bwa_idx_load_bwt(hint);
|
||||
if (which & BWA_IDX_FMT)
|
||||
idx->bwt->kmer_hash = idx->fmt->kmer_hash;
|
||||
}
|
||||
|
||||
if (which & BWA_IDX_BNS)
|
||||
{
|
||||
|
|
@ -515,8 +603,10 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
|||
err_fclose(idx->bns->fp_pac);
|
||||
idx->bns->fp_pac = 0;
|
||||
// 赋值到fmt中对应的pac
|
||||
idx->fmt->l_pac = idx->bns->l_pac;
|
||||
idx->fmt->pac = idx->pac;
|
||||
if (which & BWA_IDX_FMT) {
|
||||
idx->fmt->l_pac = idx->bns->l_pac;
|
||||
idx->fmt->pac = idx->pac;
|
||||
}
|
||||
}
|
||||
}
|
||||
free(prefix);
|
||||
|
|
@ -550,7 +640,7 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
|
|||
// generate idx->bwt
|
||||
x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
|
||||
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
|
||||
x = SA_BYTES(idx->bwt->n_sa); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
|
||||
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
|
||||
|
||||
// generate idx->bns and idx->pac
|
||||
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
|
||||
|
|
@ -567,6 +657,176 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void mem_to_bnspac(bwaidx_t *idx, uint8_t **mem_, int64_t *k_)
|
||||
{
|
||||
int i;
|
||||
int64_t x, k = *k_;
|
||||
uint8_t *mem = *mem_;
|
||||
// generate idx->bns and idx->pac
|
||||
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
|
||||
x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
|
||||
x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x;
|
||||
for (i = 0; i < idx->bns->n_seqs; ++i) {
|
||||
idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
|
||||
idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
|
||||
}
|
||||
idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1;
|
||||
*k_ = k;
|
||||
*mem_ = mem;
|
||||
}
|
||||
|
||||
int bwa_mem2fmtidx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
|
||||
{
|
||||
int64_t k = 0, x;
|
||||
// generate idx->fmt
|
||||
x = sizeof(FMTIndex); idx->fmt = malloc(x); memcpy(idx->fmt, mem + k, x); k += x;
|
||||
x = idx->fmt->bwt_size * 4; idx->fmt->bwt = (uint32_t*)(mem + k); k += x;
|
||||
x = SA_BYTES(idx->fmt->n_sa); idx->fmt->sa = (uint8_t*)(mem + k); k += x;
|
||||
// kmer hash
|
||||
x = (1 << (10 << 1)) * sizeof(KmerEntryArr);
|
||||
idx->fmt->kmer_hash.ke10 = (KmerEntryArr*)(mem + k); k += x;
|
||||
x = (1 << (11 << 1)) * sizeof(KmerEntry);
|
||||
idx->fmt->kmer_hash.ke11 = (KmerEntry*)(mem + k); k += x;
|
||||
x = (1 << (12 << 1)) * sizeof(KmerEntry);
|
||||
idx->fmt->kmer_hash.ke12 = (KmerEntry*)(mem + k); k += x;
|
||||
#if HASH_KMER_LEN > 12
|
||||
x = (1 << (13 << 1)) * sizeof(KmerEntry);
|
||||
idx->fmt->kmer_hash.ke13 = (KmerEntry*)(mem + k); k += x;
|
||||
#endif
|
||||
#if HASH_KMER_LEN > 13
|
||||
x = (1 << (14 << 1)) * sizeof(KmerEntry);
|
||||
idx->fmt->kmer_hash.ke14 = (KmerEntry*)(mem + k); k += x;
|
||||
#endif
|
||||
|
||||
// generate idx->bns and idx->pac
|
||||
mem_to_bnspac(idx, &mem, &k);
|
||||
assert(k == l_mem);
|
||||
idx->l_mem = k; idx->mem = mem;
|
||||
idx->fmt->pac = idx->pac;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_mem2ertidx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
|
||||
{
|
||||
int64_t k = 0, x;
|
||||
// generate idx->ert
|
||||
x = sizeof(ERT); idx->ert = malloc(x); memcpy(idx->ert, mem + k, x); k += x;
|
||||
x = idx->ert->mlt_size; idx->ert->mlt_table = (uint8_t *)(mem + k); k += x;
|
||||
x = idx->ert->kmer_size; idx->ert->kmer_offsets = (uint64_t *)(mem + k); k += x;
|
||||
|
||||
// generate idx->bns and idx->pac
|
||||
mem_to_bnspac(idx, &mem, &k);
|
||||
assert(k == l_mem);
|
||||
idx->l_mem = k;
|
||||
idx->mem = mem;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void move_bns_to_mem(bwaidx_t *idx, uint8_t **mem_, int64_t *k_)
|
||||
{
|
||||
int i;
|
||||
int64_t x, tmp, k=*k_;
|
||||
uint8_t *mem = *mem_;
|
||||
// copy idx->bns
|
||||
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
|
||||
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
|
||||
tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
|
||||
mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
|
||||
x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
|
||||
x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
|
||||
free(idx->bns->ambs);
|
||||
x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
|
||||
for (i = 0; i < idx->bns->n_seqs; ++i) {
|
||||
x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
|
||||
x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
|
||||
free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
|
||||
}
|
||||
free(idx->bns->anns);
|
||||
*k_ = k;
|
||||
*mem_ = mem;
|
||||
}
|
||||
|
||||
static void move_pac_to_mem(bwaidx_t *idx, uint8_t **mem_, int64_t *k_)
|
||||
{
|
||||
int64_t x, k = *k_;
|
||||
uint8_t *mem = *mem_;
|
||||
// copy idx->pac
|
||||
x = idx->bns->l_pac/4+1;
|
||||
mem = realloc(mem, k + x);
|
||||
memcpy(mem + k, idx->pac, x); k += x;
|
||||
free(idx->bns); idx->bns = 0;
|
||||
free(idx->pac); idx->pac = 0;
|
||||
*k_ = k;
|
||||
*mem_ = mem;
|
||||
}
|
||||
|
||||
int bwa_ertidx2mem(bwaidx_t *idx)
|
||||
{
|
||||
int64_t x, k;
|
||||
uint8_t *mem;
|
||||
// copy idx->ert
|
||||
x = idx->ert->mlt_size;
|
||||
mem = realloc(idx->ert->mlt_table, sizeof(ERT) + x); idx->ert->mlt_table = 0;
|
||||
memmove(mem + sizeof(ERT), mem, x);
|
||||
memcpy(mem, idx->ert, sizeof(ERT)); k = sizeof(ERT) + x;
|
||||
|
||||
x = idx->ert->kmer_size;
|
||||
mem = realloc(mem, k + x); memcpy(mem + k, idx->ert->kmer_offsets, x); k += x;
|
||||
free(idx->ert->kmer_offsets); idx->ert->kmer_offsets = 0;
|
||||
free(idx->ert);
|
||||
idx->ert = 0;
|
||||
|
||||
// copy idx->bns
|
||||
move_bns_to_mem(idx, &mem, &k);
|
||||
// copy idx->pac
|
||||
move_pac_to_mem(idx, &mem, &k);
|
||||
|
||||
return bwa_mem2ertidx(k, mem, idx);
|
||||
}
|
||||
|
||||
int bwa_fmtidx2mem(bwaidx_t *idx)
|
||||
{
|
||||
int64_t k, x;
|
||||
uint8_t *mem;
|
||||
|
||||
// copy idx->fmt
|
||||
x = idx->fmt->bwt_size * 4;
|
||||
mem = realloc(idx->fmt->bwt, sizeof(FMTIndex) + x); idx->fmt->bwt = 0;
|
||||
memmove(mem + sizeof(FMTIndex), mem, x);
|
||||
memcpy(mem, idx->fmt, sizeof(FMTIndex)); k = sizeof(FMTIndex) + x;
|
||||
x = SA_BYTES(idx->fmt->n_sa); mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->sa, x); k += x;
|
||||
// kmer hash
|
||||
x = (1 << (10 << 1)) * sizeof(KmerEntryArr);
|
||||
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke10, x); k += x;
|
||||
x = (1 << (11 << 1)) * sizeof(KmerEntry);
|
||||
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke11, x); k += x;
|
||||
x = (1 << (12 << 1)) * sizeof(KmerEntry);
|
||||
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke12, x); k += x;
|
||||
#if HASH_KMER_LEN > 12
|
||||
x = (1 << (13 << 1)) * sizeof(KmerEntry);
|
||||
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke13, x); k += x;
|
||||
#endif
|
||||
#if HASH_KMER_LEN > 13
|
||||
x = (1 << (14 << 1)) * sizeof(KmerEntry);
|
||||
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke14, x); k += x;
|
||||
#endif
|
||||
free(idx->fmt->kmer_hash.ke10);
|
||||
free(idx->fmt->kmer_hash.ke11);
|
||||
free(idx->fmt->kmer_hash.ke12);
|
||||
free(idx->fmt->kmer_hash.ke13);
|
||||
free(idx->fmt->kmer_hash.ke14);
|
||||
free(idx->fmt->sa);
|
||||
free(idx->fmt); idx->fmt = 0;
|
||||
|
||||
// copy idx->bns
|
||||
move_bns_to_mem(idx, &mem, &k);
|
||||
// copy idx->pac
|
||||
move_pac_to_mem(idx, &mem, &k);
|
||||
|
||||
return bwa_mem2fmtidx(k, mem, idx);
|
||||
}
|
||||
|
||||
int bwa_idx2mem(bwaidx_t *idx)
|
||||
{
|
||||
int i;
|
||||
|
|
@ -574,14 +834,16 @@ int bwa_idx2mem(bwaidx_t *idx)
|
|||
uint8_t *mem;
|
||||
|
||||
// copy idx->bwt
|
||||
|
||||
x = idx->bwt->bwt_size * 4;
|
||||
mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
|
||||
memmove(mem + sizeof(bwt_t), mem, x);
|
||||
memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
|
||||
x = SA_BYTES(idx->bwt->n_sa); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
|
||||
x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
|
||||
free(idx->bwt->sa);
|
||||
free(idx->bwt); idx->bwt = 0;
|
||||
|
||||
|
||||
// copy idx->bns
|
||||
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
|
||||
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
|
||||
|
|
|
|||
20
bwa.h
20
bwa.h
|
|
@ -36,7 +36,8 @@
|
|||
#define BWA_IDX_BWT 0x1
|
||||
#define BWA_IDX_BNS 0x2
|
||||
#define BWA_IDX_PAC 0x4
|
||||
#define BWA_IDX_ALL 0x7
|
||||
#define BWA_IDX_FMT 0x8
|
||||
#define BWA_IDX_ALL 0xF
|
||||
|
||||
#define BWA_CTL_SIZE 0x10000
|
||||
|
||||
|
|
@ -47,13 +48,21 @@
|
|||
|
||||
#define BWA_DBG_QNAME 0x1
|
||||
|
||||
typedef struct {
|
||||
uint64_t *kmer_offsets; // ert kmer
|
||||
uint8_t *mlt_table; // ert mlt
|
||||
uint64_t kmer_size;
|
||||
uint64_t mlt_size;
|
||||
} ERT;
|
||||
|
||||
typedef struct {
|
||||
bwt_t *bwt; // FM-index
|
||||
FMTIndex *fmt;// FMT-index
|
||||
bntseq_t *bns; // information on the reference sequences
|
||||
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
|
||||
ERT *ert;
|
||||
|
||||
int is_shm;
|
||||
int is_shm;
|
||||
int64_t l_mem;
|
||||
uint8_t *mem;
|
||||
} bwaidx_t;
|
||||
|
|
@ -91,10 +100,17 @@ extern "C" {
|
|||
|
||||
bwaidx_t *bwa_idx_load_from_shm(const char *hint);
|
||||
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
|
||||
bwaidx_t *bwa_fmtidx_load_from_shm(const char *hint);
|
||||
bwaidx_t *bwa_ertidx_load_from_shm(const char *hint);
|
||||
bwaidx_t *bwa_ertidx_load_from_disk(const char *hint);
|
||||
bwaidx_t *bwa_idx_load(const char *hint, int which);
|
||||
void bwa_idx_destroy(bwaidx_t *idx);
|
||||
int bwa_idx2mem(bwaidx_t *idx);
|
||||
int bwa_fmtidx2mem(bwaidx_t *idx);
|
||||
int bwa_ertidx2mem(bwaidx_t *idx);
|
||||
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
|
||||
int bwa_mem2fmtidx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
|
||||
int bwa_mem2ertidx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
|
||||
|
||||
void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line);
|
||||
char *bwa_set_rg(const char *s);
|
||||
|
|
|
|||
90
bwashm.c
90
bwashm.c
|
|
@ -9,8 +9,9 @@
|
|||
#include <stdio.h>
|
||||
#include "bwa.h"
|
||||
|
||||
int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
|
||||
int bwa_shm_stage(bwaidx_t *idx, const char *hint, int useERT, const char *_tmpfn)
|
||||
{
|
||||
|
||||
const char *name;
|
||||
uint8_t *shm, *shm_idx;
|
||||
uint16_t *cnt;
|
||||
|
|
@ -26,7 +27,7 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
|
|||
to_init = 1;
|
||||
}
|
||||
if (shmid < 0) return -1;
|
||||
ftruncate(shmid, BWA_CTL_SIZE);
|
||||
if (ftruncate(shmid, BWA_CTL_SIZE) < 0) return -1;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
if (to_init) {
|
||||
|
|
@ -34,7 +35,11 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
|
|||
cnt[1] = 4;
|
||||
}
|
||||
|
||||
if (idx->mem == 0) bwa_idx2mem(idx);
|
||||
// 将所有索引数据放在连续的内存空间里
|
||||
if (idx->mem == 0) {
|
||||
if (useERT) bwa_ertidx2mem(idx);
|
||||
else bwa_fmtidx2mem(idx);
|
||||
}
|
||||
|
||||
if (tmpfn) {
|
||||
FILE *fp;
|
||||
|
|
@ -63,7 +68,7 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
|
|||
memcpy(shm + cnt[1], &idx->l_mem, 8);
|
||||
memcpy(shm + cnt[1] + 8, name, l - 8);
|
||||
cnt[1] += l; ++cnt[0];
|
||||
ftruncate(shmid, idx->l_mem);
|
||||
if (ftruncate(shmid, idx->l_mem) < 0) return -1;
|
||||
shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
|
||||
if (tmpfn) {
|
||||
FILE *fp;
|
||||
|
|
@ -79,11 +84,58 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
|
|||
memcpy(shm_idx, idx->mem, idx->l_mem);
|
||||
free(idx->mem);
|
||||
}
|
||||
bwa_mem2idx(idx->l_mem, shm_idx, idx);
|
||||
if (useERT)
|
||||
bwa_mem2ertidx(idx->l_mem, shm_idx, idx);
|
||||
else
|
||||
bwa_mem2fmtidx(idx->l_mem, shm_idx, idx);
|
||||
idx->is_shm = 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define INIT_SHM_LOAD \
|
||||
const char *name; \
|
||||
uint8_t *shm, *shm_idx; \
|
||||
uint16_t *cnt, i; \
|
||||
char *p, path[PATH_MAX + 1]; \
|
||||
int shmid; \
|
||||
int64_t l_mem; \
|
||||
bwaidx_t *idx; \
|
||||
if (hint == 0 || hint[0] == 0) return 0; \
|
||||
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); \
|
||||
++name; \
|
||||
if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0; \
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); \
|
||||
cnt = (uint16_t*)shm; \
|
||||
if (cnt[0] == 0) return 0; \
|
||||
for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) { \
|
||||
memcpy(&l_mem, p, 8); p += 8; \
|
||||
if (strcmp(p, name) == 0) break; \
|
||||
p += strlen(p) + 1; \
|
||||
} \
|
||||
if (i == cnt[0]) return 0; \
|
||||
strcat(strcpy(path, "/bwaidx-"), name); \
|
||||
if ((shmid = shm_open(path, O_RDONLY, 0)) < 0) return 0; \
|
||||
shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0); \
|
||||
idx = calloc(1, sizeof(bwaidx_t));
|
||||
|
||||
bwaidx_t *bwa_ertidx_load_from_shm(const char *hint_)
|
||||
{
|
||||
char hint[PATH_MAX];
|
||||
sprintf(hint, "%s.ert", hint_);
|
||||
INIT_SHM_LOAD;
|
||||
bwa_mem2ertidx(l_mem, shm_idx, idx);
|
||||
idx->is_shm = 1;
|
||||
return idx;
|
||||
}
|
||||
|
||||
bwaidx_t *bwa_fmtidx_load_from_shm(const char *hint)
|
||||
{
|
||||
INIT_SHM_LOAD;
|
||||
bwa_mem2fmtidx(l_mem, shm_idx, idx);
|
||||
idx->is_shm = 1;
|
||||
return idx;
|
||||
}
|
||||
|
||||
bwaidx_t *bwa_idx_load_from_shm(const char *hint)
|
||||
{
|
||||
const char *name;
|
||||
|
|
@ -178,18 +230,22 @@ int bwa_shm_destroy(void)
|
|||
|
||||
int main_shm(int argc, char *argv[])
|
||||
{
|
||||
int c, to_list = 0, to_drop = 0, ret = 0;
|
||||
int c, to_list = 0, to_drop = 0, ret = 0, useERT = 0;
|
||||
char *tmpfn = 0;
|
||||
while ((c = getopt(argc, argv, "ldf:")) >= 0) {
|
||||
char shm_prefix[PATH_MAX];
|
||||
while ((c = getopt(argc, argv, "ldf:Z")) >= 0)
|
||||
{
|
||||
if (c == 'l') to_list = 1;
|
||||
else if (c == 'd') to_drop = 1;
|
||||
else if (c == 'f') tmpfn = optarg;
|
||||
else if (c == 'Z') useERT = 1;
|
||||
}
|
||||
if (optind == argc && !to_list && !to_drop) {
|
||||
fprintf(stderr, "\nUsage: fastbwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
|
||||
fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
|
||||
fprintf(stderr, " -l list names of indices in shared memory\n");
|
||||
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
|
||||
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n");
|
||||
fprintf(stderr, " -Z use Ert as seeding index\n\n");
|
||||
return 1;
|
||||
}
|
||||
if (optind < argc && (to_list || to_drop)) {
|
||||
|
|
@ -197,15 +253,25 @@ int main_shm(int argc, char *argv[])
|
|||
return 1;
|
||||
}
|
||||
if (optind < argc) {
|
||||
if (bwa_shm_test(argv[optind]) == 0) {
|
||||
if (useERT)
|
||||
sprintf(shm_prefix, "%s.ert", argv[optind]);
|
||||
else
|
||||
sprintf(shm_prefix, "%s", argv[optind]);
|
||||
if (bwa_shm_test(shm_prefix) == 0)
|
||||
{
|
||||
bwaidx_t *idx;
|
||||
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
|
||||
if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
|
||||
if (useERT)
|
||||
idx = bwa_ertidx_load_from_disk(argv[optind]);
|
||||
else
|
||||
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_BNS | BWA_IDX_PAC | BWA_IDX_FMT);
|
||||
if (bwa_shm_stage(idx, shm_prefix, useERT, tmpfn) < 0) {
|
||||
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
|
||||
ret = 1;
|
||||
}
|
||||
bwa_idx_destroy(idx);
|
||||
} else fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
|
||||
}
|
||||
else
|
||||
fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
|
||||
}
|
||||
if (to_list) bwa_shm_list();
|
||||
if (to_drop) bwa_shm_destroy();
|
||||
|
|
|
|||
10
bwt_gen.c
10
bwt_gen.c
|
|
@ -876,7 +876,7 @@ static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __r
|
|||
bgint_t i, c;
|
||||
bgint_t s, r;
|
||||
bgint_t lastRank, lastIndex;
|
||||
bgint_t oldInverseSa0RelativeRank = 0;
|
||||
// bgint_t oldInverseSa0RelativeRank = 0;
|
||||
bgint_t freq;
|
||||
|
||||
lastIndex = numItem;
|
||||
|
|
@ -887,7 +887,7 @@ static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __r
|
|||
s = seq[numItem];
|
||||
relativeRank[s] = numItem;
|
||||
if (lastRank == oldInverseSa0) {
|
||||
oldInverseSa0RelativeRank = numItem;
|
||||
// oldInverseSa0RelativeRank = numItem;
|
||||
oldInverseSa0++; // so that this segment of code is not run again
|
||||
lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
|
||||
}
|
||||
|
|
@ -920,7 +920,7 @@ static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __r
|
|||
lastRank = r;
|
||||
relativeRank[s] = i;
|
||||
if (r == oldInverseSa0) {
|
||||
oldInverseSa0RelativeRank = i;
|
||||
// oldInverseSa0RelativeRank = i;
|
||||
oldInverseSa0++; // so that this segment of code is not run again
|
||||
lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
|
||||
}
|
||||
|
|
@ -950,14 +950,14 @@ static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank,
|
|||
static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
|
||||
unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt)
|
||||
{
|
||||
unsigned int bitsInWordMinusBitPerChar;
|
||||
// unsigned int bitsInWordMinusBitPerChar;
|
||||
bgint_t leftShift, rightShift;
|
||||
bgint_t o;
|
||||
bgint_t oIndex, iIndex, mIndex;
|
||||
bgint_t mWord, mChar, oWord, oChar;
|
||||
bgint_t numInsert;
|
||||
|
||||
bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
|
||||
// bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
|
||||
|
||||
oIndex = 0;
|
||||
iIndex = 0;
|
||||
|
|
|
|||
39
bwtindex.c
39
bwtindex.c
|
|
@ -36,6 +36,7 @@
|
|||
#include "utils.h"
|
||||
#include "rle.h"
|
||||
#include "rope.h"
|
||||
#include "ertindex.h"
|
||||
|
||||
#ifdef _DIVBWT
|
||||
#include "divsufsort.h"
|
||||
|
|
@ -258,8 +259,41 @@ int bwa_build_kmer(int argc, char *argv[])
|
|||
return 0;
|
||||
}
|
||||
|
||||
int bwa_bwt2ert(int argc, char *argv[])
|
||||
{
|
||||
char prefix[1024];
|
||||
char ert_kmer_file[1124];
|
||||
int c, num_threads = 1;
|
||||
while ((c = getopt(argc, argv, "t:")) >= 0)
|
||||
{
|
||||
switch (c)
|
||||
{
|
||||
case 't':
|
||||
num_threads = atoi(optarg);
|
||||
break;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 1 > argc)
|
||||
{
|
||||
fprintf(stderr, "Usage: fastbwa bwt2ert <index_prefix>\n");
|
||||
return 1;
|
||||
}
|
||||
// fprintf(stderr, "%d %d %d\n", optind, argc, num_threads);
|
||||
sprintf(prefix, "%s", argv[optind]);
|
||||
sprintf(ert_kmer_file, "%s.%s", prefix, "ert.kmer.table");
|
||||
fprintf(stderr, "%s\n", ert_kmer_file);
|
||||
// Load BWT index
|
||||
bwaidx_t *bid = bwa_idx_load_from_disk(prefix, BWA_IDX_BNS | BWA_IDX_BWT | BWA_IDX_PAC);
|
||||
// Build ERT
|
||||
buildERTKmerTrees(ert_kmer_file, bid, prefix, num_threads, ERT_MAX_READ_LEN);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_index(int argc, char *argv[]) // the "index" command
|
||||
{
|
||||
int num_threads = 1;
|
||||
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
|
||||
char *prefix = 0, *str;
|
||||
while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
|
||||
|
|
@ -272,6 +306,10 @@ int bwa_index(int argc, char *argv[]) // the "index" command
|
|||
break;
|
||||
case 'p': prefix = strdup(optarg); break;
|
||||
case '6': is_64 = 1; break;
|
||||
case 't':
|
||||
num_threads = atoi(optarg);
|
||||
assert(num_threads > 0 && num_threads < 256);
|
||||
break;
|
||||
case 'b':
|
||||
block_size = strtol(optarg, &str, 10);
|
||||
if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024;
|
||||
|
|
@ -287,6 +325,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
|
|||
fprintf(stderr, "Usage: fastbwa index [options] <in.fasta>\n\n");
|
||||
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
|
||||
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
|
||||
fprintf(stderr, " -t INT number of threads for ERT index building [%d]\n", num_threads);
|
||||
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
|
||||
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
|
||||
fprintf(stderr, "\n");
|
||||
|
|
|
|||
|
|
@ -0,0 +1,946 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <limits.h>
|
||||
#include <pthread.h>
|
||||
#include <errno.h>
|
||||
#include <math.h>
|
||||
#include "utils.h"
|
||||
#include "ertindex.h"
|
||||
|
||||
#define _set_pac(pac, l, c) ((pac)[(l) >> 2] |= (c) << (((l) & 3) << 1))
|
||||
#define _set_pac_orig(pac, l, c) ((pac)[(l) >> 2] |= (c) << ((~(l) & 3) << 1))
|
||||
|
||||
const uint8_t char_count_size_in_bits = 8;
|
||||
const uint8_t hits_count_size_in_bits = 8;
|
||||
const uint8_t ref_ptr_size_in_bits = 40;
|
||||
const uint8_t leaf_offset_ptr_size_in_bits = 8;
|
||||
const uint8_t other_offset_ptr_size_in_bits = 32;
|
||||
|
||||
// int test_max_intv = 0;
|
||||
|
||||
static inline void getNumBranchesForKmer(bwtintv_t ok[4], int* numBranches, uint8_t* uniform_bp) {
|
||||
uint8_t i;
|
||||
for (i = 0; i < 4; ++i) {
|
||||
if (ok[i].x[2] > 0) { *numBranches += 1; *uniform_bp = i; }
|
||||
}
|
||||
}
|
||||
|
||||
// 从右向左的顺序将kmer转换成query
|
||||
static inline void kmertoquery(uint64_t x, uint8_t *a, int l)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < l; ++i) {
|
||||
a[i] = (uint8_t)((x >> (i << 1)) & 0x3);
|
||||
}
|
||||
}
|
||||
|
||||
static inline uint64_t addBytesForEntry(byte_type_t type, int count, int numHits)
|
||||
{
|
||||
uint64_t numBits = 0;
|
||||
switch(type) {
|
||||
case CODE:
|
||||
numBits = 8;
|
||||
break;
|
||||
case LEAF_COUNT:
|
||||
numBits = (hits_count_size_in_bits);
|
||||
break;
|
||||
case LEAF_HITS:
|
||||
numBits = (ref_ptr_size_in_bits * numHits);
|
||||
break;
|
||||
case UNIFORM_COUNT: // "Uniform"
|
||||
numBits = (char_count_size_in_bits);
|
||||
break;
|
||||
case UNIFORM_BP:
|
||||
numBits = (count << 1);
|
||||
break;
|
||||
case LEAF_PTR: // "Leaf Offset Pointer"
|
||||
numBits = leaf_offset_ptr_size_in_bits;
|
||||
break;
|
||||
case OTHER_PTR:
|
||||
numBits = other_offset_ptr_size_in_bits;
|
||||
break;
|
||||
case EMPTY_NODE:
|
||||
numBits = 0;
|
||||
break;
|
||||
default :
|
||||
break;
|
||||
}
|
||||
return (numBits % 8) ? (numBits / 8 + 1) : numBits / 8;
|
||||
}
|
||||
|
||||
void addChildNode(node_t* p, node_t* c) {
|
||||
assert(p->numChildren <= 3);
|
||||
p->child_nodes[p->numChildren++] = c;
|
||||
c->parent_node = p;
|
||||
}
|
||||
|
||||
void handleLeaf(const bwaidx_t *bid, bwtintv_t ik, node_t *n, int step)
|
||||
{
|
||||
n->type = LEAF;
|
||||
n->numHits = ik.x[2];
|
||||
n->hits = (uint64_t *)calloc(n->numHits, sizeof(uint64_t));
|
||||
assert(n->hits != NULL);
|
||||
if (step == 1) {
|
||||
uint64_t ref_pos = 0;
|
||||
int j = 0;
|
||||
for (j = 0; j < n->numHits; ++j) {
|
||||
ref_pos = fmt_sa(bid->fmt, ik.x[0] + j);
|
||||
// ref_pos = bwt_sa(bid->bwt, ik.x[0] + j);
|
||||
n->hits[j] = ref_pos;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void handleDivergence(const bwaidx_t *bid, bwtintv_t ok[4], int depth, node_t *parent_node, int step, int max_depth)
|
||||
{
|
||||
int i;
|
||||
bwtintv_t ok_copy[4];
|
||||
bwtintv_t ik_new;
|
||||
memcpy_bwamem(ok_copy, 4 * sizeof(bwtintv_t), ok, 4 * sizeof(bwtintv_t), __FILE__, __LINE__);
|
||||
for (i = 3; i >= 0; --i) {
|
||||
node_t *n = (node_t *)calloc(1, sizeof(node_t));
|
||||
assert(n != NULL);
|
||||
n->numChildren = 0;
|
||||
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
|
||||
n->pos = 0;
|
||||
n->num_bp = 0;
|
||||
if (ok_copy[i].x[2] == 0) { //!< Empty node
|
||||
n->type = EMPTY;
|
||||
n->numHits = 0;
|
||||
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, (depth - 1) * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
n->l_seq = depth;
|
||||
addChildNode(parent_node, n);
|
||||
} else if (ok_copy[i].x[2] > 1 && depth != max_depth) { // 没到最大长度,可以继续扩展
|
||||
ik_new = ok_copy[i]; ik_new.info = depth + 1;
|
||||
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
assert(depth >= 0);
|
||||
n->seq[depth] = i;
|
||||
n->pos = depth;
|
||||
n->num_bp = 1;
|
||||
n->l_seq = depth + 1;
|
||||
n->numHits = ok_copy[i].x[2];
|
||||
n->type = DIVERGE;
|
||||
addChildNode(parent_node, n);
|
||||
ert_build_kmertree(bid, ik_new, ok, depth + 1, n, step, max_depth);
|
||||
} else { // leaf intv.x[2] == 1 or depth = max_depth
|
||||
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
n->seq[depth] = i;
|
||||
n->pos = depth;
|
||||
n->num_bp = 1;
|
||||
n->l_seq = depth + 1;
|
||||
handleLeaf(bid, ok_copy[i], n, step);
|
||||
addChildNode(parent_node, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ert_build_kmertree(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], int curDepth, node_t *parent_node, int step, int max_depth)
|
||||
{
|
||||
uint8_t uniform_bp = 0;
|
||||
int numBranches = 0, depth = curDepth;
|
||||
bwt_extend(bid->bwt, &ik, ok, 0); //!< Extend right by 1bp
|
||||
/// Check if we need to make a uniform entry
|
||||
getNumBranchesForKmer(ok, &numBranches, &uniform_bp);
|
||||
bwtintv_t ik_new;
|
||||
/// If we find a uniform entry, extend further till we diverge or hit a leaf node
|
||||
if (numBranches == 1) {
|
||||
uint8_t uniformExtend = 1;
|
||||
ik_new = ok[uniform_bp]; ik_new.info = depth+1;
|
||||
node_t *n = (node_t *)calloc(1, sizeof(node_t));
|
||||
assert(n != NULL);
|
||||
n->numChildren = 0;
|
||||
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
|
||||
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
assert(depth >= 0);
|
||||
n->seq[depth] = uniform_bp; // ?放在最后的位置上
|
||||
n->numHits = ok[uniform_bp].x[2];
|
||||
n->l_seq = depth + 1; // depth=16
|
||||
n->pos = depth;
|
||||
n->num_bp = 1;
|
||||
addChildNode(parent_node, n);
|
||||
if (depth < max_depth) {
|
||||
bwtintv_t ok_init;
|
||||
memcpy_bwamem(&ok_init, sizeof(bwtintv_t), &ok[uniform_bp], sizeof(bwtintv_t), __FILE__, __LINE__);
|
||||
while (uniformExtend) { // 一个一个bp进行扩展
|
||||
numBranches = 0; uniform_bp = 0;
|
||||
depth += 1;
|
||||
bwt_extend(bid->bwt, &ik_new, ok, 0); //!< Extend right by 1bp
|
||||
getNumBranchesForKmer(ok, &numBranches, &uniform_bp);
|
||||
assert(numBranches != 0);
|
||||
if (numBranches == 1) { //<! Uniform
|
||||
ik_new = ok[uniform_bp]; ik_new.info = depth+1;
|
||||
n->seq[depth] = uniform_bp;
|
||||
n->l_seq = depth + 1;
|
||||
n->num_bp++;
|
||||
/// Hit a multi-hit leaf node
|
||||
if (depth == max_depth) {
|
||||
// if (test_max_intv < ik_new.x[2]) test_max_intv = ik_new.x[2];
|
||||
// fprintf(stderr, "depth: %d, num hit: %ld, max: %d\n", depth, ik_new.x[2], test_max_intv);
|
||||
uniformExtend = 0;
|
||||
handleLeaf(bid, ok_init, n, step);
|
||||
}
|
||||
} else { //!< Diverge
|
||||
uniformExtend = 0;
|
||||
n->type = UNIFORM;
|
||||
handleDivergence(bid, ok, depth, n, step, max_depth);
|
||||
}
|
||||
}
|
||||
} else { //<! Uniform, depth == max_depth, multi-hit leaf node
|
||||
uniformExtend = 0;
|
||||
handleLeaf(bid, ok[uniform_bp], n, step);
|
||||
}
|
||||
} //!< End uniform entry
|
||||
else { //!< Diverge, empty or leaf, same as above
|
||||
handleDivergence(bid, ok, depth, parent_node, step, max_depth);
|
||||
} //!< End diverge
|
||||
}
|
||||
|
||||
void ert_build_table(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], uint8_t *mlt_data, uint8_t *mh_data,
|
||||
uint64_t *size, uint64_t *mh_size, uint8_t *aq, uint64_t *numHits, uint64_t *max_next_ptr,
|
||||
uint64_t next_ptr_width, int step, int max_depth)
|
||||
{
|
||||
uint64_t byte_idx = *size;
|
||||
uint64_t mh_byte_idx = *mh_size;
|
||||
int i, j;
|
||||
uint8_t aq1[xmerSize];
|
||||
assert(xmerSize <= 15);
|
||||
uint64_t lep1 = 0;
|
||||
uint8_t c;
|
||||
uint64_t prevHits = ik.x[2];
|
||||
bwtintv_t ik_copy = ik;
|
||||
uint64_t mlt_byte_idx = byte_idx + (numXmers << 3);
|
||||
uint64_t xmer_entry = 0;
|
||||
uint16_t xmer_data = 0;
|
||||
uint64_t mlt_offset = mlt_byte_idx;
|
||||
for (i = 0; i < numXmers; ++i) {
|
||||
kmertoquery(i, aq1, xmerSize);
|
||||
for (j = 0; j < xmerSize; ++j) {
|
||||
c = 3 - aq1[j];
|
||||
bwt_extend(bid->bwt, &ik, ok, 0); //!< ok contains the result of BWT extension
|
||||
if (ok[c].x[2] != prevHits) { //!< hit set changes
|
||||
lep1 |= (1ULL << j);
|
||||
}
|
||||
/// Extend right till k-mer has zero hits
|
||||
if (ok[c].x[2] >= 1) { prevHits = ok[c].x[2]; ik = ok[c]; ik.info = kmerSize + j + 1; }
|
||||
else { break; }
|
||||
}
|
||||
uint64_t num_hits = ok[c].x[2];
|
||||
if (num_hits == 0) {
|
||||
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | INVALID;
|
||||
}
|
||||
else if (num_hits == 1) {
|
||||
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | (SINGLE_HIT_LEAF);
|
||||
if (step == 1) {
|
||||
mlt_data[mlt_byte_idx] = 0; //!< Not a multi-hit
|
||||
}
|
||||
mlt_byte_idx++;
|
||||
if (step == 1) {
|
||||
uint64_t ref_pos = 0;
|
||||
ref_pos = fmt_sa(bid->fmt, ok[c].x[0]);
|
||||
// ref_pos = bwt_sa(bid->bwt, ok[c].x[0]);
|
||||
uint64_t leaf_data = ref_pos << 1;
|
||||
memcpy_bwamem(&mlt_data[mlt_byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
}
|
||||
mlt_byte_idx += 5;
|
||||
*numHits += 1;
|
||||
}
|
||||
else {
|
||||
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | (INFREQUENT);
|
||||
node_t *n = (node_t *)calloc(1, sizeof(node_t));
|
||||
assert(n != NULL);
|
||||
n->type = DIVERGE;
|
||||
n->pos = 0;
|
||||
n->num_bp = 0;
|
||||
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), aq, kmerSize, __FILE__, __LINE__);
|
||||
n->l_seq = kmerSize;
|
||||
memcpy_bwamem(&n->seq[n->l_seq], xmerSize * sizeof(uint8_t), aq1, xmerSize * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
n->l_seq += xmerSize;
|
||||
n->parent_node = 0;
|
||||
n->numChildren = 0;
|
||||
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
|
||||
n->start_addr = mlt_byte_idx;
|
||||
ert_build_kmertree(bid, ik, ok, kmerSize + j, n, step, max_depth);
|
||||
ert_traverse_kmertree(n, mlt_data, mh_data, &mlt_byte_idx, &mh_byte_idx, kmerSize + j, numHits,
|
||||
max_next_ptr, next_ptr_width, step);
|
||||
ert_destroy_kmertree(n);
|
||||
}
|
||||
if (num_hits < 20) {
|
||||
xmer_entry = (mlt_offset << KMER_DATA_BITWIDTH) | (num_hits << 17) | xmer_data;
|
||||
}
|
||||
else {
|
||||
xmer_entry = (mlt_offset << KMER_DATA_BITWIDTH) | xmer_data;
|
||||
}
|
||||
uint64_t ptr_width = (next_ptr_width < 4) ? next_ptr_width : 0;
|
||||
xmer_entry |= (ptr_width << 22);
|
||||
if (step == 1) {
|
||||
memcpy_bwamem(&mlt_data[byte_idx], 8 * sizeof(uint8_t), &xmer_entry, 8 * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
}
|
||||
byte_idx += 8;
|
||||
mlt_offset = mlt_byte_idx;
|
||||
ik = ik_copy;
|
||||
prevHits = ik_copy.x[2];
|
||||
}
|
||||
*size = mlt_byte_idx;
|
||||
*mh_size = mh_byte_idx;
|
||||
}
|
||||
|
||||
void addCode(uint8_t *mlt_data, uint64_t *byte_idx, uint8_t code, int step)
|
||||
{
|
||||
if (step == 1) {
|
||||
memcpy_bwamem(&mlt_data[*byte_idx], sizeof(uint8_t), &code, sizeof(uint8_t), __FILE__, __LINE__);
|
||||
}
|
||||
*byte_idx += 1;
|
||||
}
|
||||
|
||||
void addUniformNode(uint8_t *mlt_data, uint64_t *byte_idx, int count, uint8_t *uniformBases, uint64_t hitCount, int step)
|
||||
{
|
||||
int numBytesForBP = addBytesForEntry(UNIFORM_BP, count, 0);
|
||||
assert(count < ERT_MAX_READ_LEN);
|
||||
if (step == 1) {
|
||||
memcpy_bwamem(&mlt_data[*byte_idx], sizeof(uint16_t), &count, sizeof(uint16_t), __FILE__, __LINE__);
|
||||
}
|
||||
*byte_idx += 2;
|
||||
if (step == 1) {
|
||||
int j;
|
||||
uint8_t packUniformBases[numBytesForBP];
|
||||
memset(packUniformBases, 0, numBytesForBP * sizeof(uint8_t));
|
||||
for (j = 0; j < count; ++j) {
|
||||
_set_pac_orig(packUniformBases, j, uniformBases[j]);
|
||||
}
|
||||
memcpy_bwamem(&mlt_data[*byte_idx], numBytesForBP * sizeof(uint8_t), packUniformBases, numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
}
|
||||
*byte_idx += numBytesForBP;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void addLeafNode(uint8_t *mlt_data, uint64_t *byte_idx, uint64_t ref_pos, int step) {
|
||||
if (step == 1) {
|
||||
uint64_t leaf_data = (ref_pos << 1);
|
||||
memcpy_bwamem(&mlt_data[*byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
}
|
||||
*byte_idx += 5;
|
||||
}
|
||||
|
||||
void addMultiHitLeafNode(uint8_t* mlt_data, uint64_t* byte_idx, uint64_t count, uint64_t* hits, int step)
|
||||
{
|
||||
uint16_t k = 0;
|
||||
for (k = 0; k < count; ++k) {
|
||||
if (step == 1) {
|
||||
uint64_t leaf_data = (hits[k] << 1) | 1ULL;
|
||||
memcpy_bwamem(&mlt_data[*byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
}
|
||||
*byte_idx += 5;
|
||||
}
|
||||
}
|
||||
|
||||
void addMultiHitLeafCount(uint8_t* mlt_data, uint64_t* byte_idx, uint64_t count, int step)
|
||||
{
|
||||
if (step == 1) {
|
||||
memcpy_bwamem(&mlt_data[*byte_idx], 2 * sizeof(uint8_t), &count, 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
}
|
||||
*byte_idx += 2;
|
||||
}
|
||||
|
||||
void addMultiHitLeafPtr(uint8_t *mlt_data, uint64_t *byte_idx, uint64_t mh_byte_idx, int step)
|
||||
{
|
||||
if (step == 1) {
|
||||
uint64_t mh_data = (mh_byte_idx << 1) | 1ULL;
|
||||
memcpy_bwamem(&mlt_data[*byte_idx], 5 * sizeof(uint8_t), &mh_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
}
|
||||
*byte_idx += 5;
|
||||
}
|
||||
|
||||
void ert_traverse_kmertree(node_t *n, uint8_t *mlt_data, uint8_t *mh_data, uint64_t *size, uint64_t *mh_size, int depth, uint64_t *numHits, uint64_t *max_ptr, uint64_t next_ptr_width, int step)
|
||||
{
|
||||
int j = 0;
|
||||
int cur_depth = depth;
|
||||
uint64_t byte_idx = *size;
|
||||
uint64_t mh_byte_idx = *mh_size; // 匹配位置开始记录的索引位置
|
||||
uint8_t code = 0;
|
||||
assert(n->numChildren != 0);
|
||||
if (n->numChildren == 1) {
|
||||
node_t *child = n->child_nodes[0];
|
||||
uint8_t c = child->seq[child->pos];
|
||||
if (child->type == LEAF) {
|
||||
//
|
||||
// FIXME: In rare cases, when one of the occurrences of the k-mer is at the end of the reference,
|
||||
// # hits for parent node is not equal to the sum of #hits of children nodes, and we trigger the assertion below
|
||||
// This should not affect results as long as readLength > kmerSize
|
||||
// assert(child->numHits > 1);
|
||||
code |= (LEAF << (c << 1));
|
||||
addCode(mlt_data, &byte_idx, code, step);
|
||||
addMultiHitLeafPtr(mlt_data, &byte_idx, mh_byte_idx, step);
|
||||
addMultiHitLeafCount(mh_data, &mh_byte_idx, child->numHits, step);
|
||||
addMultiHitLeafNode(mh_data, &mh_byte_idx, child->numHits, child->hits, step);
|
||||
*numHits += child->numHits;
|
||||
}
|
||||
else {
|
||||
assert(child->type == UNIFORM);
|
||||
code |= (UNIFORM << (c << 1));
|
||||
addCode(mlt_data, &byte_idx, code, step);
|
||||
addUniformNode(mlt_data, &byte_idx, child->num_bp, &child->seq[child->pos], child->numHits, step);
|
||||
ert_traverse_kmertree(child, mlt_data, mh_data, &byte_idx, &mh_byte_idx, cur_depth + child->num_bp,
|
||||
numHits, max_ptr, next_ptr_width, step);
|
||||
}
|
||||
}
|
||||
else {
|
||||
uint8_t numEmpty = 0, numLeaves = 0;
|
||||
for (j = 0; j < n->numChildren; ++j) {
|
||||
node_t *child = n->child_nodes[j];
|
||||
uint8_t c = child->seq[child->pos];
|
||||
if (child->type == EMPTY) { numEmpty++; }
|
||||
else if (child->type == LEAF) { numLeaves++; code |= (LEAF << (c << 1)); }
|
||||
else { code |= (DIVERGE << (c << 1)); }
|
||||
}
|
||||
uint8_t numPointers = ((4 - numEmpty - numLeaves) > 0) ? (4 - numEmpty - numLeaves) : 0;
|
||||
uint64_t start_byte_idx = byte_idx;
|
||||
addCode(mlt_data, &byte_idx, code, step);
|
||||
uint64_t ptr_byte_idx = byte_idx;
|
||||
uint64_t ptrToOtherNodes[numPointers + 1]; //!< These point to children. We have one more child than number of pointers
|
||||
memset(ptrToOtherNodes, 0, (numPointers + 1) * sizeof(uint64_t));
|
||||
uint64_t numHitsForChildren[numPointers + 1];
|
||||
memset(numHitsForChildren, 0, (numPointers + 1) * sizeof(uint64_t));
|
||||
uint64_t other_idx = 0;
|
||||
if (numPointers > 0) { byte_idx += (numPointers * next_ptr_width); }
|
||||
for (j = 0; j < n->numChildren; ++j) {
|
||||
node_t *child = n->child_nodes[j];
|
||||
if (child->type == LEAF) {
|
||||
if (child->numHits == 1) {
|
||||
addLeafNode(mlt_data, &byte_idx, child->hits[0], step);
|
||||
}
|
||||
else {
|
||||
addMultiHitLeafPtr(mlt_data, &byte_idx, mh_byte_idx, step);
|
||||
addMultiHitLeafCount(mh_data, &mh_byte_idx, child->numHits, step);
|
||||
addMultiHitLeafNode(mh_data, &mh_byte_idx, child->numHits, child->hits, step);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (numPointers > 0) { ptrToOtherNodes[other_idx] = byte_idx; }
|
||||
for (j = 0; j < n->numChildren; ++j) {
|
||||
node_t *child = n->child_nodes[j];
|
||||
assert(child->type != UNIFORM); // why?
|
||||
if (child->type == DIVERGE) {
|
||||
ert_traverse_kmertree(child, mlt_data, mh_data, &byte_idx, &mh_byte_idx,
|
||||
cur_depth+1, numHits, max_ptr, next_ptr_width, step);
|
||||
numHitsForChildren[other_idx] = child->numHits;
|
||||
other_idx++;
|
||||
ptrToOtherNodes[other_idx] = byte_idx;
|
||||
}
|
||||
}
|
||||
for (j = 0; j < numPointers; ++j) {
|
||||
uint64_t pointerToNextNode = (ptrToOtherNodes[j] - start_byte_idx);
|
||||
if (pointerToNextNode > *max_ptr) {
|
||||
*max_ptr = pointerToNextNode;
|
||||
}
|
||||
assert(pointerToNextNode < (1 << 26));
|
||||
}
|
||||
// Fill up pointers based on size of previous children
|
||||
if (step == 1) {
|
||||
for (j = 0; j < numPointers; ++j) {
|
||||
uint64_t pointerToNextNode = (ptrToOtherNodes[j] - start_byte_idx);
|
||||
assert(pointerToNextNode < (1 << 26));
|
||||
uint64_t reseed_data = 0;
|
||||
if (numHitsForChildren[j] < 20) {
|
||||
reseed_data = (pointerToNextNode << 6) | (numHitsForChildren[j]);
|
||||
} else {
|
||||
reseed_data = (pointerToNextNode << 6);
|
||||
}
|
||||
memcpy_bwamem(&mlt_data[ptr_byte_idx], next_ptr_width * sizeof(uint8_t), &reseed_data, next_ptr_width * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
ptr_byte_idx += next_ptr_width;
|
||||
}
|
||||
}
|
||||
}
|
||||
*size = byte_idx;
|
||||
*mh_size = mh_byte_idx;
|
||||
}
|
||||
|
||||
void ert_destroy_kmertree(node_t* n)
|
||||
{
|
||||
int j;
|
||||
if (n == NULL) {
|
||||
return;
|
||||
}
|
||||
if (n->hits) {
|
||||
free(n->hits);
|
||||
}
|
||||
for (j = 0; j < n->numChildren; ++j) {
|
||||
ert_destroy_kmertree(n->child_nodes[j]);
|
||||
}
|
||||
free(n);
|
||||
}
|
||||
|
||||
//
|
||||
// This function builds the ERT index.
|
||||
// Note on pointers to child nodes: When building the radix tree for each k-mer,
|
||||
// we try 3 values for pointers to child nodes, 2,3,4 B and choose the smallest
|
||||
// one possible.
|
||||
//
|
||||
void *buildIndex(void *arg)
|
||||
{
|
||||
thread_data_t *data = (thread_data_t *)arg;
|
||||
bwtintv_t ik, ok[4];
|
||||
uint64_t idx = 0;
|
||||
uint8_t aq[kmerSize];
|
||||
int i;
|
||||
uint8_t c;
|
||||
uint64_t lep, prevHits, numBytesPerKmer, numBytesForMh, ref_pos, total_hits = 0, ptr = 0, max_next_ptr = 0;
|
||||
uint64_t next_ptr_width = 0;
|
||||
uint64_t nKmerSmallPtr = 0, nKmerMedPtr = 0, nKmerLargePtr = 0;
|
||||
uint16_t kmer_data = 0;
|
||||
|
||||
|
||||
// File to write the multi-level tree index
|
||||
char ml_tbl_file_name[PATH_MAX];
|
||||
sprintf(ml_tbl_file_name, "%s.ert.mlt_table_%d", data->filePrefix, data->tid);
|
||||
|
||||
// Log progress
|
||||
char log_file_name[PATH_MAX];
|
||||
sprintf(log_file_name, "%s.log_%d", data->filePrefix, data->tid);
|
||||
|
||||
FILE *ml_tbl_fd = 0, *log_fd = 0;
|
||||
ml_tbl_fd = fopen(ml_tbl_file_name, "wb");
|
||||
if (ml_tbl_fd == NULL)
|
||||
{
|
||||
printf("\nCan't open file or file doesn't exist. mlt_table errno = %d\n", errno);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
if (bwa_verbose >= 4)
|
||||
{
|
||||
log_fd = fopen(log_file_name, "w");
|
||||
if (log_fd == NULL)
|
||||
{
|
||||
printf("\nCan't open file or file doesn't exist. log errno = %d\n", errno);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
log_file(log_fd, "Start: %lu End: %lu", data->startKmer, data->endKmer);
|
||||
}
|
||||
|
||||
//
|
||||
// Loop for each k-mer and compute LEP when the hit set changes
|
||||
//
|
||||
double tmp_time = realtime(), elapsed_time;
|
||||
int p = 0;
|
||||
for (idx = data->startKmer; idx < data->endKmer; ++idx)
|
||||
{
|
||||
max_next_ptr = 0;
|
||||
next_ptr_width = 0;
|
||||
c = 0;
|
||||
lep = 0; // k-1-bit LEP
|
||||
prevHits = 0;
|
||||
numBytesPerKmer = 0;
|
||||
numBytesForMh = 0;
|
||||
kmertoquery(idx, aq, kmerSize); // represent k-mer as uint8_t*
|
||||
assert(aq[0] >= 0 && aq[0] <= 3);
|
||||
bwt_set_intv(data->bid->bwt, aq[0], ik); // the initial interval of a single base
|
||||
ik.info = 1;
|
||||
prevHits = ik.x[2];
|
||||
|
||||
// 打印进度信息
|
||||
if (data->tid == 0 && (idx - data->startKmer) % ((data->endKmer - data->startKmer) / 100) == 0) {
|
||||
if (p++ > 0) {
|
||||
elapsed_time = realtime() - tmp_time;
|
||||
fprintf(stderr, "[step: %d] %d%% percent complished. %f s elapsed.\n", data->step, p - 1, elapsed_time);
|
||||
}
|
||||
}
|
||||
//
|
||||
// Backward search k-mer
|
||||
//
|
||||
for (i = 1; i < kmerSize; ++i)
|
||||
{
|
||||
c = 3 - aq[i];
|
||||
bwt_extend(data->bid->bwt, &ik, ok, 0); // ok contains the result of BWT extension
|
||||
if (ok[c].x[2] != prevHits) { // hit set changes
|
||||
lep |= (1ULL << (i - 1));
|
||||
}
|
||||
//
|
||||
// Extend left till k-mer has zero hits
|
||||
//
|
||||
if (ok[c].x[2] >= 1) { prevHits = ok[c].x[2]; ik = ok[c]; ik.info = i + 1; }
|
||||
else { break; }
|
||||
}
|
||||
|
||||
uint64_t num_hits = ok[c].x[2];
|
||||
if (num_hits == 0) { // "Empty" - k-mer absent in the reference genome
|
||||
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | INVALID;
|
||||
} else if (num_hits == 1) { // "Leaf" - k-mer has a single hit in the reference genome
|
||||
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (SINGLE_HIT_LEAF);
|
||||
numBytesPerKmer = 6; // 该15-kmer下所包含的所有匹配位置信息只需要6字节就能表示(因为只有一个匹配)
|
||||
uint8_t byte_idx = 0;
|
||||
uint8_t mlt_data[numBytesPerKmer];
|
||||
if (data->step == 1) { mlt_data[byte_idx] = 0; } // Mark that the hit is not a multi-hit
|
||||
byte_idx++;
|
||||
data->numHits[idx - data->startKmer] += num_hits;
|
||||
if (data->step == 1) {
|
||||
// Look up suffix array to identify the hit position
|
||||
ref_pos = fmt_sa(data->bid->fmt, ok[c].x[0]);
|
||||
// ref_pos = bwt_sa(data->bid->bwt, ok[c].x[0]);
|
||||
uint64_t leaf_data = ref_pos << 1;
|
||||
memcpy_bwamem(&mlt_data[byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
fwrite(mlt_data, sizeof(uint8_t), numBytesPerKmer, ml_tbl_fd);
|
||||
}
|
||||
byte_idx += 5;
|
||||
}
|
||||
//
|
||||
// If the number of hits for the k-mer does not exceed the HIT_THRESHOLD,
|
||||
// prefer a radix-tree over a multi-level table as the radix tree for the
|
||||
// k-mer is likely to be sparse.
|
||||
//
|
||||
else if (num_hits <= HIT_THRESHOLD) {
|
||||
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (INFREQUENT);
|
||||
node_t *n = (node_t *)calloc(1, sizeof(node_t));
|
||||
assert(n != NULL);
|
||||
n->type = DIVERGE;
|
||||
n->pos = 0;
|
||||
n->num_bp = 0;
|
||||
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), aq, kmerSize * sizeof(uint8_t), __FILE__, __LINE__);
|
||||
n->l_seq = kmerSize;
|
||||
n->parent_node = 0;
|
||||
n->numChildren = 0;
|
||||
n->numHits = num_hits;
|
||||
n->child_nodes[0] = n->child_nodes[1] = n->child_nodes[2] = n->child_nodes[3] = 0;
|
||||
n->start_addr = 0;
|
||||
uint8_t *mlt_data = 0;
|
||||
next_ptr_width = 2;
|
||||
uint8_t *mh_data = 0;
|
||||
uint64_t size = 0;
|
||||
if (data->step == 1) {
|
||||
if (idx != (numKmers - 1)) { // 不等于最后一个处理的kmer
|
||||
size = (data->byte_offsets[idx+1] >> KMER_DATA_BITWIDTH) - (data->byte_offsets[idx] >> KMER_DATA_BITWIDTH); // 本kmer所占存储大小
|
||||
assert(size < (1 << 26)); // 小于64MB
|
||||
} else { // FIXME: This is a hack. We know the size of every k-mer tree except the last-kmer
|
||||
size = 1 << 26;
|
||||
}
|
||||
next_ptr_width = (((data->byte_offsets[idx] >> 22) & 3) == 0) ? 4 : ((data->byte_offsets[idx] >> 22) & 3);
|
||||
mlt_data = (uint8_t *)calloc(size, sizeof(uint8_t));
|
||||
assert(mlt_data != NULL);
|
||||
mh_data = (uint8_t *)calloc(size, sizeof(uint8_t));
|
||||
assert(mh_data != NULL);
|
||||
}
|
||||
ert_build_kmertree(data->bid, ik, ok, i, n, data->step, data->readLength - 1); // n作为parent node
|
||||
|
||||
// Reserve space for pointer to start of multi-hit address space
|
||||
numBytesPerKmer = 4; // 4个字节的指针
|
||||
|
||||
// Traverse tree and place data in memory space
|
||||
ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh,
|
||||
i, &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
|
||||
|
||||
if (data->step == 0 || data->step == 1) {
|
||||
if (max_next_ptr >= 1024 && max_next_ptr < 262144) {
|
||||
next_ptr_width = 3;
|
||||
max_next_ptr = 0;
|
||||
numBytesPerKmer = 4;
|
||||
numBytesForMh = 0;
|
||||
ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh, i,
|
||||
&data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
|
||||
}
|
||||
if (max_next_ptr >= 262144) {
|
||||
next_ptr_width = 4;
|
||||
max_next_ptr = 0;
|
||||
numBytesPerKmer = 4;
|
||||
numBytesForMh = 0;
|
||||
ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh, i,
|
||||
&data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
|
||||
}
|
||||
}
|
||||
ert_destroy_kmertree(n);
|
||||
assert(numBytesPerKmer < (1 << 26));
|
||||
// assert(numBytesForMh < (1 << 24));
|
||||
if (data->step == 1) {
|
||||
if (idx != numKmers-1) assert((numBytesPerKmer+numBytesForMh) == size);
|
||||
memcpy_bwamem(mlt_data, 4*sizeof(uint8_t), &numBytesPerKmer, 4*sizeof(uint8_t), __FILE__, __LINE__);
|
||||
fwrite(mlt_data, sizeof(uint8_t), numBytesPerKmer, ml_tbl_fd);
|
||||
free(mlt_data);
|
||||
fwrite(mh_data, sizeof(uint8_t), numBytesForMh, ml_tbl_fd);
|
||||
free(mh_data);
|
||||
}
|
||||
}
|
||||
//
|
||||
// If the number of hits for the k-mer exceeds the HIT_THRESHOLD,
|
||||
// prefer a multi-level table to encode the suffixes for the
|
||||
// k-mer
|
||||
//
|
||||
else {
|
||||
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (FREQUENT);
|
||||
uint8_t *mlt_data = 0;
|
||||
uint8_t *mh_data = 0;
|
||||
next_ptr_width = 2;
|
||||
uint64_t size = 0;
|
||||
if (data->step == 1) {
|
||||
if (idx != (numKmers - 1)) {
|
||||
size = (data->byte_offsets[idx+1] >> KMER_DATA_BITWIDTH) - (data->byte_offsets[idx] >> KMER_DATA_BITWIDTH);
|
||||
assert(size < (1 << 26));
|
||||
}
|
||||
else { //!< FIXME: Hack. We do not store the size of the last-kmer
|
||||
size = 1 << 26;
|
||||
}
|
||||
next_ptr_width = (((data->byte_offsets[idx] >> 22) & 3) == 0) ? 4 : ((data->byte_offsets[idx] >> 22) & 3);
|
||||
mlt_data = (uint8_t *)calloc(size, sizeof(uint8_t));
|
||||
assert(mlt_data != NULL);
|
||||
mh_data = (uint8_t *)calloc(size, sizeof(uint8_t));
|
||||
assert(mh_data != NULL);
|
||||
}
|
||||
numBytesPerKmer = 4;
|
||||
ert_build_table(data->bid, ik, ok, mlt_data, mh_data, &numBytesPerKmer,
|
||||
&numBytesForMh, aq, &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step,
|
||||
data->readLength - 1);
|
||||
if (data->step == 0 || data->step == 1) {
|
||||
if (max_next_ptr >= 1024 && max_next_ptr < 262144) {
|
||||
next_ptr_width = 3;
|
||||
max_next_ptr = 0;
|
||||
numBytesPerKmer = 4;
|
||||
numBytesForMh = 0;
|
||||
ert_build_table(data->bid, ik, ok, mlt_data, mh_data,
|
||||
&numBytesPerKmer, &numBytesForMh, aq, &data->numHits[idx-data->startKmer],
|
||||
&max_next_ptr, next_ptr_width, data->step, data->readLength - 1);
|
||||
}
|
||||
if (max_next_ptr >= 262144) {
|
||||
next_ptr_width = 4;
|
||||
max_next_ptr = 0;
|
||||
numBytesPerKmer = 4;
|
||||
numBytesForMh = 0;
|
||||
ert_build_table(data->bid, ik, ok, mlt_data, mh_data,
|
||||
&numBytesPerKmer, &numBytesForMh, aq, &data->numHits[idx-data->startKmer],
|
||||
&max_next_ptr, next_ptr_width, data->step, data->readLength - 1);
|
||||
}
|
||||
}
|
||||
//
|
||||
// Traverse tree and place data in memory
|
||||
//
|
||||
assert(numBytesPerKmer < (1 << 26));
|
||||
// assert(numBytesForMh < (1 << 24));
|
||||
if (data->step == 1) {
|
||||
if (idx != numKmers-1) assert((numBytesPerKmer+numBytesForMh) == size);
|
||||
memcpy_bwamem(mlt_data, 4*sizeof(uint8_t), &numBytesPerKmer, 4*sizeof(uint8_t), __FILE__, __LINE__);
|
||||
fwrite(mlt_data, sizeof(uint8_t), numBytesPerKmer, ml_tbl_fd);
|
||||
free(mlt_data);
|
||||
fwrite(mh_data, sizeof(uint8_t), numBytesForMh, ml_tbl_fd);
|
||||
free(mh_data);
|
||||
}
|
||||
}
|
||||
if (num_hits < 20) {
|
||||
data->kmer_table[idx - data->startKmer] = (ptr << KMER_DATA_BITWIDTH) | (num_hits << 17) | kmer_data;
|
||||
}
|
||||
else {
|
||||
data->kmer_table[idx - data->startKmer] = (ptr << KMER_DATA_BITWIDTH) | kmer_data;
|
||||
}
|
||||
|
||||
ptr += (numBytesPerKmer + numBytesForMh);
|
||||
|
||||
if (next_ptr_width == 2) {
|
||||
nKmerSmallPtr++;
|
||||
}
|
||||
else if (next_ptr_width == 3) {
|
||||
nKmerMedPtr++;
|
||||
}
|
||||
else if (next_ptr_width == 4) {
|
||||
nKmerLargePtr++;
|
||||
next_ptr_width = 0;
|
||||
}
|
||||
data->kmer_table[idx - data->startKmer] |= (next_ptr_width << 22);
|
||||
if (bwa_verbose >= 4) {
|
||||
if (idx == data->endKmer-1) {
|
||||
log_file(log_fd, "TotalSize:%lu\n", ptr);
|
||||
}
|
||||
if ((idx-data->startKmer) % 10000000 == 0) {
|
||||
log_file(log_fd, "%lu,%lu,%lu", idx, numBytesPerKmer, ptr);
|
||||
}
|
||||
}
|
||||
|
||||
total_hits += data->numHits[idx - data->startKmer];
|
||||
}
|
||||
|
||||
data->end_offset = ptr;
|
||||
if (bwa_verbose >= 4) {
|
||||
log_file(log_fd, "Hits:%lu\n", total_hits);
|
||||
log_file(log_fd, "nKmersSmallPtrs:%lu", nKmerSmallPtr);
|
||||
log_file(log_fd, "nKmersMedPtrs:%lu", nKmerMedPtr);
|
||||
log_file(log_fd, "nKmersLargePtrs:%lu", nKmerLargePtr);
|
||||
fclose(log_fd);
|
||||
}
|
||||
|
||||
fclose(ml_tbl_fd);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
|
||||
void buildERTKmerTrees(char *kmer_tbl_file_name, bwaidx_t *bid, char *prefix, int num_threads, int readLength)
|
||||
{
|
||||
FILE *kmer_tbl_fd;
|
||||
pthread_t thr[num_threads];
|
||||
int i, rc;
|
||||
thread_data_t thr_data[num_threads];
|
||||
uint64_t numKmersThread = (uint64_t)ceil(((double)(numKmers)) / num_threads);
|
||||
if (bwa_verbose >= 3)
|
||||
{
|
||||
fprintf(stderr, "[M::%s] Computing tree sizes for each k-mer\n", __func__);
|
||||
}
|
||||
//
|
||||
// STEP 1: Create threads. Each thread builds the index for a fraction of the k-mers
|
||||
//
|
||||
for (i = 0; i < num_threads; ++i)
|
||||
{
|
||||
thr_data[i].tid = i;
|
||||
thr_data[i].step = 0;
|
||||
thr_data[i].readLength = readLength;
|
||||
thr_data[i].bid = bid;
|
||||
thr_data[i].startKmer = i * numKmersThread;
|
||||
thr_data[i].endKmer = ((i + 1) * numKmersThread > numKmers) ? numKmers : (i + 1) * numKmersThread;
|
||||
thr_data[i].end_offset = 0;
|
||||
thr_data[i].filePrefix = prefix;
|
||||
uint64_t numKmersToProcess = thr_data[i].endKmer - thr_data[i].startKmer;
|
||||
thr_data[i].kmer_table = (uint64_t *)calloc(numKmersToProcess, sizeof(uint64_t));
|
||||
assert(thr_data[i].kmer_table != NULL);
|
||||
thr_data[i].numHits = (uint64_t *)calloc(numKmersToProcess, sizeof(uint64_t));
|
||||
assert(thr_data[i].numHits != NULL);
|
||||
if ((rc = pthread_create(&thr[i], NULL, buildIndex, &thr_data[i])))
|
||||
{
|
||||
fprintf(stderr, "[M::%s] error: pthread_create, rc: %d\n", __func__, rc);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// block until all threads complete
|
||||
//
|
||||
for (i = 0; i < num_threads; ++i)
|
||||
{
|
||||
pthread_join(thr[i], NULL);
|
||||
}
|
||||
|
||||
//
|
||||
// Compute absolute offsets for each kmer's tree from per-thread relative offsets
|
||||
//
|
||||
uint64_t *kmer_table = (uint64_t *)calloc(numKmers, sizeof(uint64_t));
|
||||
assert(kmer_table != NULL);
|
||||
int tidx;
|
||||
uint64_t kidx;
|
||||
uint64_t numProcessed = 0;
|
||||
uint64_t offset = 0;
|
||||
for (tidx = 0; tidx < num_threads; ++tidx)
|
||||
{
|
||||
uint64_t numKmersToProcess = thr_data[tidx].endKmer - thr_data[tidx].startKmer;
|
||||
for (kidx = 0; kidx < numKmersToProcess; ++kidx)
|
||||
{
|
||||
uint64_t rel_offset = thr_data[tidx].kmer_table[kidx] >> KMER_DATA_BITWIDTH;
|
||||
uint16_t kmer_data = thr_data[tidx].kmer_table[kidx] & KMER_DATA_MASK;
|
||||
uint64_t ptr_width = (thr_data[tidx].kmer_table[kidx] >> 22) & 3;
|
||||
uint64_t reseed_hits = (thr_data[tidx].kmer_table[kidx] >> 17) & 0x1F;
|
||||
kmer_table[numProcessed + kidx] = ((offset + rel_offset) << KMER_DATA_BITWIDTH)
|
||||
| (ptr_width << 22)
|
||||
| (reseed_hits << 17)
|
||||
| (kmer_data);
|
||||
}
|
||||
numProcessed += numKmersToProcess;
|
||||
offset += thr_data[tidx].end_offset;
|
||||
free(thr_data[tidx].kmer_table);
|
||||
free(thr_data[tidx].numHits);
|
||||
}
|
||||
|
||||
//
|
||||
// STEP 2 : Using estimates of each k-mer's tree size from the previous step, write the index to file
|
||||
//
|
||||
uint64_t total_size = offset + (numKmers * 8UL);
|
||||
if (bwa_verbose >= 3) {
|
||||
fprintf(stderr, "[M::%s] Total size of ERT index = %lu B (Expected). (k-mer,tree) = (%lu,%lu)\n", __func__, total_size, numKmers * 8UL, offset);
|
||||
}
|
||||
|
||||
// return;
|
||||
|
||||
for (i = 0; i < num_threads; ++i) {
|
||||
thr_data[i].tid = i;
|
||||
thr_data[i].step = 1;
|
||||
thr_data[i].readLength = readLength;
|
||||
thr_data[i].bid = bid;
|
||||
thr_data[i].startKmer = i*numKmersThread;
|
||||
thr_data[i].endKmer = ((i + 1)*numKmersThread > numKmers) ? numKmers : (i + 1)*numKmersThread;
|
||||
thr_data[i].end_offset = 0;
|
||||
thr_data[i].filePrefix = prefix;
|
||||
uint64_t numKmersToProcess = thr_data[i].endKmer - thr_data[i].startKmer;
|
||||
thr_data[i].kmer_table = (uint64_t*) calloc(numKmersToProcess, sizeof(uint64_t));
|
||||
thr_data[i].numHits = (uint64_t*) calloc(numKmersToProcess, sizeof(uint64_t));
|
||||
thr_data[i].byte_offsets = kmer_table;
|
||||
if ((rc = pthread_create(&thr[i], NULL, buildIndex, &thr_data[i]))) {
|
||||
fprintf(stderr, "[M::%s] error: pthread_create, rc: %d\n", __func__, rc);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < num_threads; ++i) {
|
||||
pthread_join(thr[i], NULL);
|
||||
}
|
||||
|
||||
if (bwa_verbose >= 3) {
|
||||
fprintf(stderr, "[M::%s] Merging per-thread tables ...\n", __func__);
|
||||
}
|
||||
|
||||
//
|
||||
// Compute absolute offsets for each k-mer tree's root node
|
||||
//
|
||||
numProcessed = 0;
|
||||
offset = 0;
|
||||
for (tidx = 0; tidx < num_threads; ++tidx) {
|
||||
uint64_t numKmersToProcess = thr_data[tidx].endKmer - thr_data[tidx].startKmer;
|
||||
for (kidx = 0; kidx < numKmersToProcess; ++kidx) {
|
||||
uint64_t rel_offset = thr_data[tidx].kmer_table[kidx] >> KMER_DATA_BITWIDTH;
|
||||
uint16_t kmer_data = thr_data[tidx].kmer_table[kidx] & KMER_DATA_MASK;
|
||||
uint64_t ptr_width = (thr_data[tidx].kmer_table[kidx] >> 22) & 3;
|
||||
uint64_t reseed_hits = (thr_data[tidx].kmer_table[kidx] >> 17) & 0x1F;
|
||||
kmer_table[numProcessed + kidx] = ((offset + rel_offset) << KMER_DATA_BITWIDTH)
|
||||
| (ptr_width << 22)
|
||||
| (reseed_hits << 17)
|
||||
| (kmer_data);
|
||||
}
|
||||
numProcessed += numKmersToProcess;
|
||||
offset += thr_data[tidx].end_offset;
|
||||
free(thr_data[tidx].kmer_table);
|
||||
free(thr_data[tidx].numHits);
|
||||
}
|
||||
kmer_tbl_fd = fopen(kmer_tbl_file_name, "wb");
|
||||
if (kmer_tbl_fd == NULL) {
|
||||
fprintf(stderr, "[M::%s] Can't open file or file doesn't exist.\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
fwrite(kmer_table, sizeof(uint64_t), numKmers, kmer_tbl_fd);
|
||||
fclose(kmer_tbl_fd);
|
||||
free(kmer_table);
|
||||
|
||||
//
|
||||
// Merge all per-thread trees
|
||||
//
|
||||
const int file_buf_size = 64 * 1024 * 1024;
|
||||
uint8_t *file_buf = (uint8_t *)malloc(file_buf_size); // 64MB
|
||||
char ml_tbl_file_name[PATH_MAX] = {};
|
||||
sprintf(ml_tbl_file_name, "%s.ert.mlt.table", prefix);
|
||||
|
||||
if (remove(ml_tbl_file_name) == 0) {
|
||||
fprintf(stderr, "[M::%s] Overwriting existing index file (tree)\n", __func__);
|
||||
}
|
||||
|
||||
FILE *o_mlt = fopen(ml_tbl_file_name, "wb");
|
||||
if (o_mlt == NULL)
|
||||
{
|
||||
fprintf(stderr, "[M::%s] Can't open output index file for writing.\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
for (uint64_t tidx = 0; tidx < num_threads; ++tidx) {
|
||||
sprintf(ml_tbl_file_name, "%s.ert.mlt_table_%ld", prefix, tidx);
|
||||
FILE *i_mlt = fopen(ml_tbl_file_name, "rb");
|
||||
if (i_mlt == NULL) {
|
||||
fprintf(stderr, "[M::%s] Can't open per-thread index file for thread %ld\n", __func__, tidx);
|
||||
exit(1);
|
||||
}
|
||||
int fr = 0;
|
||||
while((fr = fread(file_buf, 1, file_buf_size, i_mlt)) != 0) {
|
||||
fwrite(file_buf, 1, fr, o_mlt);
|
||||
}
|
||||
if (remove(ml_tbl_file_name) != 0) {
|
||||
fprintf(stderr, "[M::%s] Can't remove per-thread index file (tree) for thread %ld\n", __func__, tidx);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
free(file_buf);
|
||||
}
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
#ifndef BWA_ERT_H
|
||||
#define BWA_ERT_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "kvec.h"
|
||||
#include "bwa.h"
|
||||
|
||||
#define ERT_MAX_READ_LEN 301
|
||||
#define kmerSize 15
|
||||
#define numKmers 1073741824
|
||||
#define xmerSize 4
|
||||
#define numXmers 256
|
||||
#define TILE_SIZE 64
|
||||
// #define PRINT_SMEM
|
||||
#define PREFIX_LENGTH 3
|
||||
#define LEP_MASK 0x3FFF
|
||||
#define KMER_DATA_BITWIDTH 24
|
||||
#define KMER_DATA_MASK 0xFFFF
|
||||
#define METADATA_BITWIDTH 2
|
||||
#define METADATA_MASK 0x3
|
||||
#define INVALID 0
|
||||
#define SINGLE_HIT_LEAF 1
|
||||
#define INFREQUENT 2
|
||||
#define FREQUENT 3
|
||||
#define HIT_THRESHOLD 256
|
||||
#define DRAM_PAGE_SIZE 24576
|
||||
#define LEAF_TBL_BASE_PTR_WIDTH 3
|
||||
#define LEAF_TBL_HIT_COUNT_WIDTH 3
|
||||
#define MAX_HITS_PER_READ 2000000
|
||||
|
||||
typedef enum
|
||||
{
|
||||
EMPTY,
|
||||
LEAF,
|
||||
UNIFORM,
|
||||
DIVERGE
|
||||
} note_type_t;
|
||||
|
||||
typedef struct _node_t
|
||||
{
|
||||
note_type_t type;
|
||||
int pos;
|
||||
int num_bp;
|
||||
int l_seq;
|
||||
int numChildren;
|
||||
uint64_t numHits;
|
||||
uint64_t start_addr;
|
||||
uint64_t *hits;
|
||||
uint8_t seq[ERT_MAX_READ_LEN + 1];
|
||||
struct _node_t *parent_node;
|
||||
struct _node_t *child_nodes[4];
|
||||
} node_t;
|
||||
|
||||
typedef kvec_t(node_t) node_v;
|
||||
|
||||
typedef node_t *node_ptr_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int tid;
|
||||
int step;
|
||||
int readLength;
|
||||
uint64_t *kmer_table;
|
||||
uint64_t startKmer;
|
||||
uint64_t endKmer;
|
||||
bwaidx_t *bid;
|
||||
uint64_t *numHits;
|
||||
char *filePrefix;
|
||||
uint64_t *byte_offsets;
|
||||
uint64_t end_offset;
|
||||
} thread_data_t;
|
||||
|
||||
// FIXME : Add to options later
|
||||
extern const uint8_t char_count_size_in_bits;
|
||||
extern const uint8_t hits_count_size_in_bits;
|
||||
extern const uint8_t ref_ptr_size_in_bits;
|
||||
extern const uint8_t leaf_offset_ptr_size_in_bits;
|
||||
extern const uint8_t other_offset_ptr_size_in_bits;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
CODE,
|
||||
EMPTY_NODE,
|
||||
LEAF_COUNT,
|
||||
LEAF_HITS,
|
||||
UNIFORM_COUNT,
|
||||
UNIFORM_BP,
|
||||
LEAF_PTR,
|
||||
OTHER_PTR
|
||||
} byte_type_t;
|
||||
|
||||
void ert_build_kmertree(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], int curDepth, node_t *parent_node, int step, int max_depth);
|
||||
|
||||
void handleDivergence(const bwaidx_t *bid, bwtintv_t ok[4], int depth, node_t *parent_node, int step, int max_depth);
|
||||
|
||||
void handleLeaf(const bwaidx_t *bid, bwtintv_t ik, node_t *n, int step);
|
||||
|
||||
void ert_build_table(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], uint8_t *mlt_data, uint8_t *mh_data, uint64_t *size, uint64_t *mh_size, uint8_t *aq, uint64_t *numHits, uint64_t *max_next_ptr, uint64_t next_ptr_width, int step, int max_depth);
|
||||
|
||||
void ert_traverse_kmertree(node_t *n, uint8_t *mlt_data, uint8_t *mh_data, uint64_t *byte_idx, uint64_t *mh_byte_idx, int depth, uint64_t *numHits, uint64_t *max_ptr, uint64_t next_ptr_width, int step);
|
||||
|
||||
void ert_destroy_kmertree(node_t *n);
|
||||
|
||||
void buildERTKmerTrees(char *kmer_tbl_file_name, bwaidx_t *bid, char *prefix, int num_threads, int readLength);
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,126 @@
|
|||
#ifndef ERTSEEDING_HPP
|
||||
#define ERTSEEDING_HPP
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
#include "kstring.h"
|
||||
#include "ksw.h"
|
||||
#include "kvec.h"
|
||||
#include "ksort.h"
|
||||
#include "utils.h"
|
||||
#include "bwt.h"
|
||||
#include "ertindex.h"
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
#include "profiling.h"
|
||||
#include "utils.h"
|
||||
|
||||
|
||||
/**
|
||||
* Node state to keep track of while traversing ERT.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
int num_hits;
|
||||
uint64_t byte_idx;
|
||||
} node_info_t;
|
||||
|
||||
typedef kvec_t(uint64_t) u64v;
|
||||
typedef kvec_t(uint8_t) u8v;
|
||||
typedef kvec_t(int) intv;
|
||||
|
||||
typedef kvec_t(node_info_t) path_v;
|
||||
|
||||
/**
|
||||
* State to keep track of previous pivots for each new MEM search
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
int c_pivot; // Pivot used to generate the SMEM
|
||||
int p_pivot; // Previous pivot
|
||||
int pp_pivot; // Pivot before the previous pivot. Useful in reseeding
|
||||
} pivot_t;
|
||||
|
||||
/**
|
||||
* State for each maximal-exact-match (MEM)
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint8_t forward; // RMEM or LMEM. We need this to normalize hit positions
|
||||
int start; // MEM start position in read
|
||||
int end; // MEM end position in read. [start, end)
|
||||
int rc_start; // MEM start position in reverse complemented (RC) read (used for backward search)
|
||||
int rc_end; // MEM end position in reverse complemented (RC) read (used for backward search)
|
||||
int skip_ref_fetch; // Skip reference fetch when leaf node need not be decompressed
|
||||
int fetch_leaves; // Gather all leaves for MEM
|
||||
int hitbeg; // Index into hit array
|
||||
int hitcount; // Count of hits
|
||||
int end_correction; // Amount by which MEM has extended beyond backward search start position in read
|
||||
int is_multi_hit;
|
||||
pivot_t pt;
|
||||
} mem_t;
|
||||
|
||||
typedef kvec_t(mem_t) mem_v;
|
||||
|
||||
/**
|
||||
* Index-related auxiliary data structures
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t *kmer_offsets; // K-mer table
|
||||
uint8_t *mlt_table; // Multi-level ERT
|
||||
const bwt_t *bwt; // FM-index
|
||||
const bntseq_t *bns; // Input reads sequences
|
||||
const uint8_t *pac; // Reference genome (2-bit encoded)
|
||||
uint8_t *ref_string;
|
||||
} index_aux_t;
|
||||
|
||||
/**
|
||||
* 'Read' auxiliary data structures
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
int min_seed_len; // Minimum length of seed
|
||||
int l_seq; // Read length
|
||||
int ptr_width; // Size of pointers to child nodes in ERT
|
||||
int num_hits; // Number of hits for each node in the ERT
|
||||
int limit; // Number of hits after which extension must be stopped
|
||||
uint64_t lep[5]; // FIXME: Can support up to 320bp
|
||||
uint64_t nextLEPBit; // Index into the LEP bit-vector
|
||||
uint64_t mlt_start_addr; // Start address of multi-level ERT
|
||||
uint64_t mh_start_addr; // Start address of multi-hits for each k-mer
|
||||
char *read_name; // Read name
|
||||
uint8_t *unpacked_queue_buf; // Read sequence (2-bit encoded)
|
||||
uint8_t *unpacked_rc_queue_buf; // Reverse complemented read (2-bit encoded)
|
||||
uint8_t *read_buf; // == queue_buf (forward) and == rc_queue_buf (backward)
|
||||
} read_aux_t;
|
||||
|
||||
/**
|
||||
* SMEM helper data structure
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
int prevMemStart; // Start position of previous MEM in the read
|
||||
int prevMemEnd; // End position of previous MEM in the read
|
||||
int curr_pivot; // Pivot used for forward/backward search in iteration i
|
||||
int prev_pivot; // Pivot used in the previous iteration (i-1)
|
||||
int prev_prev_pivot; // Pivot used in iteration i-2 (useful for reseeding)
|
||||
int stop_be; // Stop backward search early if no new SMEMs can be found for pivot
|
||||
int mem_end_limit;
|
||||
} smem_helper_t;
|
||||
|
||||
void get_seeds(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, u64v *hits);
|
||||
|
||||
void get_seeds_prefix(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, u64v *hits);
|
||||
|
||||
void reseed(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, int start, int limit, pivot_t *pt, u64v *hits);
|
||||
|
||||
void reseed_prefix(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, int start, int limit, pivot_t *pt, u64v *hits);
|
||||
|
||||
void last(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, int limit, u64v *hits);
|
||||
|
||||
#endif
|
||||
23
fastmap.c
23
fastmap.c
|
|
@ -78,6 +78,7 @@ typedef struct {
|
|||
long read_idx;
|
||||
long calc_idx;
|
||||
long write_idx;
|
||||
int useERT;
|
||||
} ktp_aux_t;
|
||||
|
||||
// read
|
||||
|
|
@ -370,6 +371,7 @@ int main_mem(int argc, char *argv[])
|
|||
void *ko = 0, *ko2 = 0;
|
||||
mem_pestat_t pes[4];
|
||||
ktp_aux_t aux;
|
||||
int useERT = 0;
|
||||
|
||||
memset(&aux, 0, sizeof(ktp_aux_t));
|
||||
memset(pes, 0, 4 * sizeof(mem_pestat_t));
|
||||
|
|
@ -377,7 +379,7 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
aux.opt = opt = mem_opt_init();
|
||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||
while ((c = getopt(argc, argv, "512qpaMCSPVYjuk:c:v:s:r:t:b:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "512qpaMCSPVYjuk:c:v:s:r:t:b:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:Z")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||
else if (c == '1') no_mt_io = 1;
|
||||
else if (c == '2') no_mt_io = 2;
|
||||
|
|
@ -478,6 +480,9 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
|
||||
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
|
||||
}
|
||||
else if (c == 'Z') {
|
||||
useERT = 1;
|
||||
}
|
||||
else return 1;
|
||||
}
|
||||
|
||||
|
|
@ -548,6 +553,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n");
|
||||
fprintf(stderr, " FR orientation only. [inferred]\n");
|
||||
fprintf(stderr, " -u output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n");
|
||||
fprintf(stderr, " -Z Use ERT index for seeding\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
|
@ -589,9 +595,16 @@ int main_mem(int argc, char *argv[])
|
|||
PROF_END(gprof[G_PREPARE], prepare);
|
||||
|
||||
PROF_START(idx);
|
||||
aux.idx = bwa_idx_load_from_shm(argv[optind]);
|
||||
if (useERT) aux.idx = bwa_ertidx_load_from_shm(argv[optind]);
|
||||
else aux.idx = bwa_fmtidx_load_from_shm(argv[optind]);
|
||||
if (aux.idx == 0) {
|
||||
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
||||
if (!useERT) {
|
||||
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_BNS | BWA_IDX_PAC | BWA_IDX_FMT)) == 0) return 1; // FIXME: memory leak
|
||||
}
|
||||
else {
|
||||
if ((aux.idx = bwa_ertidx_load_from_disk(argv[optind])) == 0) return 1; // FIXME: memory leak
|
||||
}
|
||||
|
||||
} else if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
|
||||
if (ignore_alt)
|
||||
|
|
@ -623,7 +636,7 @@ int main_mem(int argc, char *argv[])
|
|||
}
|
||||
|
||||
//fprintf(stderr, "%ld %ld %ld %ld %ld\n", aux.idx->fmt->L2[0], aux.idx->fmt->L2[1], aux.idx->fmt->L2[2], aux.idx->fmt->L2[3], aux.idx->fmt->L2[4]);
|
||||
//exit(0);
|
||||
aux.useERT = useERT;
|
||||
aux.w = init_mem_worker(opt, aux.idx->fmt, aux.idx->bns, aux.idx->pac);
|
||||
aux.data = calloc(2, sizeof(ktp_data_t));
|
||||
|
||||
|
|
@ -635,8 +648,10 @@ int main_mem(int argc, char *argv[])
|
|||
aux.wbuf = malloc(aux.wbuf_size);
|
||||
|
||||
PROF_START(pipeline);
|
||||
|
||||
if (no_mt_io == 2) new_pipeline(&aux);
|
||||
else kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
|
||||
|
||||
PROF_END(gprof[G_PIPELINE], pipeline);
|
||||
|
||||
// free(hdr_line);
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@
|
|||
#include "utils.h"
|
||||
#include "ksw.h"
|
||||
|
||||
static const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
|
||||
// static const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
|
||||
|
||||
typedef struct _kswq_t {
|
||||
int qlen, slen;
|
||||
|
|
|
|||
3
main.c
3
main.c
|
|
@ -40,6 +40,7 @@ int bwa_bwt2sa(int argc, char *argv[]);
|
|||
int bwa_bwt2bytesa(int argc, char *argv[]);
|
||||
int bwa_bwt2fmt(int argc, char *argv[]);
|
||||
int bwa_build_kmer(int argc, char *argv[]);
|
||||
int bwa_bwt2ert(int argc, char *argv[]);
|
||||
int bwa_index(int argc, char *argv[]);
|
||||
int bwt_bwtgen_main(int argc, char *argv[]);
|
||||
|
||||
|
|
@ -82,6 +83,7 @@ static int usage()
|
|||
fprintf(stderr, " bwt2bytesa generate SA(using byte array) from BWT and Occ\n");
|
||||
fprintf(stderr, " bwt2fmt generate FMT-Index from BWT\n");
|
||||
fprintf(stderr, " buildkmer generate kmer index from FMT\n");
|
||||
fprintf(stderr, " bwt2ert generate ert index from BWT\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr,
|
||||
"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
|
||||
|
|
@ -110,6 +112,7 @@ int main(int argc, char *argv[])
|
|||
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2fmt") == 0) ret = bwa_bwt2fmt(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "buildkmer") == 0) ret = bwa_build_kmer(argc - 1, argv + 1);
|
||||
else if (strcmp(argv[1], "bwt2ert") == 0) ret = bwa_bwt2ert(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
|
||||
|
|
|
|||
2
run.sh
2
run.sh
|
|
@ -44,7 +44,7 @@ out=~/data1/fmt-out.sam
|
|||
#out=/dev/null
|
||||
|
||||
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
$reference \
|
||||
-Z $reference \
|
||||
$n_r1 \
|
||||
$n_r2 \
|
||||
-o $out -2
|
||||
|
|
|
|||
18
utils.c
18
utils.c
|
|
@ -327,3 +327,21 @@ long peakrss(void)
|
|||
return r.ru_maxrss;
|
||||
#endif
|
||||
}
|
||||
|
||||
int memcpy_bwamem(void *dest, size_t dmax, const void *src, size_t smax, char *file_name, int line_num)
|
||||
{
|
||||
#define RSIZE_MAX_MEM (256UL << 20) /* 256MB */
|
||||
if (dmax < smax)
|
||||
{
|
||||
fprintf(stderr, "[%s: %d] src size is lager than dest size.(src: %ld; dest: %ld)\n", file_name, line_num, smax, dmax);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
int64_t bytes_copied;
|
||||
for (bytes_copied = 0; bytes_copied < smax; bytes_copied += RSIZE_MAX_MEM)
|
||||
{
|
||||
int64_t bytes_remaining = smax - bytes_copied;
|
||||
int64_t bytes_to_copy = (bytes_remaining > RSIZE_MAX_MEM) ? RSIZE_MAX_MEM : bytes_remaining;
|
||||
memcpy((char *)dest + bytes_copied, (const char *)src + bytes_copied, bytes_to_copy);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
6
utils.h
6
utils.h
|
|
@ -80,6 +80,10 @@ static inline unsigned long long __rdtsc(void)
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#define log_file(fd, M, ...) \
|
||||
fprintf(fd, M "\n", ##__VA_ARGS__); \
|
||||
fflush(fd)
|
||||
|
||||
typedef struct {
|
||||
uint64_t x, y;
|
||||
} pair64_t;
|
||||
|
|
@ -129,6 +133,8 @@ extern "C" {
|
|||
void ks_introsort_64 (size_t n, uint64_t *a);
|
||||
void ks_introsort_128(size_t n, pair64_t *a);
|
||||
|
||||
int memcpy_bwamem(void *dest, size_t dmax, const void *src, size_t smax, char *file_name, int line_num);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in New Issue