添加了共享内存支持,成功构建ert索引

This commit is contained in:
zzh 2024-08-03 17:26:32 +08:00
parent 9c9502d584
commit 364ee9756e
19 changed files with 5170 additions and 38 deletions

12
.vscode/launch.json vendored
View File

@ -49,6 +49,18 @@
"~/data/reference/human_g1k_v37_decoy.fasta.kmer"
],
"cwd": "${workspaceFolder}", //
},
{
"name": "share mem",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/fastbwa",
"args": [
"shm",
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta"
],
"cwd": "${workspaceFolder}", //
}
]
}

View File

@ -35,6 +35,11 @@
"bitset": "c",
"iterator": "c",
"memory": "c",
"__locale": "c"
"__locale": "c",
"stdint.h": "c",
"bntseq.h": "c",
"inttypes.h": "c",
"ertindex.h": "c",
"ertseeding.h": "c"
}
}

View File

@ -1,6 +1,4 @@
CC= gcc
#CC= clang --analyze
# CFLAGS= -g -Wall -Wno-unused-function -O2
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
@ -18,7 +16,7 @@ AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o profiling.o \
fmt_idx.o ksw_extend2_avx2.o ksw_extend2_avx2_u8.o \
debug.o
debug.o ertindex.o ertseeding.o
PROG= fastbwa
INCLUDES=
LIBS= -lm -lz -lpthread -ldl

3
build_ert.sh 100755
View File

@ -0,0 +1,3 @@
#!/bin/bash
time ./fastbwa bwt2ert ~/data1/fmt_ref/human_g1k_v37_decoy.fasta -t 64
#time ./fastbwa bwt2ert ~/reference/human_g1k_v37_decoy.fasta -t 64

274
bwa.c
View File

@ -30,18 +30,20 @@
#include <assert.h>
#include <unistd.h>
#include <pthread.h>
#include <limits.h>
#include "bntseq.h"
#include "bwa.h"
#include "ksw.h"
#include "utils.h"
#include "kstring.h"
#include "kvec.h"
#include "ertindex.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
int bwa_verbose = 3;
int bwa_verbose = 4;
int bwa_dbg = 0;
char bwa_rg_id[256];
char *bwa_pg;
@ -486,6 +488,87 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
return fmt;
}
/**
 * Load the ERT seeding index together with the reference metadata from disk.
 *
 * Reads the reference annotations through bns_restore(), the packed
 * reference from its fp_pac stream, then "<hint>.ert.kmer.table"
 * (numKmers 64-bit entries) and "<hint>.ert.mlt.table" (the whole file)
 * into heap buffers owned by the returned index.
 *
 * @param hint  index prefix; NULL is rejected.
 * @return a newly allocated index, or NULL when hint is NULL, when a table
 *         path would not fit in PATH_MAX, or when the reference metadata
 *         cannot be restored. Failing to open, size or allocate the ERT
 *         tables terminates the process with exit(1).
 */
bwaidx_t *bwa_ertidx_load_from_disk(const char *hint)
{
    bwaidx_t *idx = 0;
    int i, c, n;
    const char *prefix = hint;
    char kmer_tbl_file_name[PATH_MAX];
    char ml_tbl_file_name[PATH_MAX];
    FILE *kmer_tbl_fd, *ml_tbl_fd;
    double ctime, rtime;
    long size;

    if (prefix == 0) {
        if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
        return 0;
    }

    // Build both table paths up front so an over-long prefix is rejected
    // before anything is allocated (the old sprintf could overflow).
    n = snprintf(kmer_tbl_file_name, sizeof(kmer_tbl_file_name), "%s.ert.kmer.table", prefix);
    if (n < 0 || (size_t)n >= sizeof(kmer_tbl_file_name)) {
        if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] index prefix is too long\n", __func__);
        return 0;
    }
    n = snprintf(ml_tbl_file_name, sizeof(ml_tbl_file_name), "%s.ert.mlt.table", prefix);
    if (n < 0 || (size_t)n >= sizeof(ml_tbl_file_name)) {
        if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] index prefix is too long\n", __func__);
        return 0;
    }

    idx = calloc(1, sizeof(bwaidx_t));
    if (idx == 0) return 0;
    idx->bns = bns_restore(prefix);
    if (idx->bns == 0) { // reference metadata missing: nothing else can be loaded
        free(idx);
        return 0;
    }
    for (i = c = 0; i < idx->bns->n_seqs; ++i)
        if (idx->bns->anns[i].is_alt) ++c;
    if (bwa_verbose >= 3)
        fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);

    idx->pac = calloc(idx->bns->l_pac/4+1, 1);
    err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
    err_fclose(idx->bns->fp_pac);
    idx->bns->fp_pac = 0;

    // load ert
    kmer_tbl_fd = fopen(kmer_tbl_file_name, "rb");
    if (kmer_tbl_fd == NULL) {
        fprintf(stderr, "[E::%s::ERT] Can't open k-mer index.\n", __func__);
        exit(1);
    }
    ml_tbl_fd = fopen(ml_tbl_file_name, "rb");
    if (ml_tbl_fd == NULL) {
        fprintf(stderr, "[E::%s::ERT] Can't open multi-level tree index.\n", __func__);
        exit(1);
    }
    ctime = cputime();
    rtime = realtime();

    //
    // Read k-mer index
    //
    idx->ert = (ERT *)calloc(1, sizeof(ERT));
    if (idx->ert == NULL) {
        fprintf(stderr, "[E::%s::ERT] Out of memory.\n", __func__);
        exit(1);
    }
    idx->ert->kmer_size = numKmers * sizeof(uint64_t);
    idx->ert->kmer_offsets = (uint64_t *)malloc(idx->ert->kmer_size);
    if (idx->ert->kmer_offsets == NULL) {
        fprintf(stderr, "[E::%s::ERT] Out of memory for the k-mer index.\n", __func__);
        exit(1);
    }
    if (bwa_verbose >= 3) {
        fprintf(stderr, "[M::%s::ERT] Reading kmer index to memory\n", __func__);
    }
    err_fread_noeof(idx->ert->kmer_offsets, sizeof(uint64_t), numKmers, kmer_tbl_fd);

    //
    // Read multi-level tree index
    //
    // The table size is the file size; ftell() reports -1 on failure, which
    // must not be used as an allocation size.
    if (fseek(ml_tbl_fd, 0L, SEEK_END) != 0 || (size = ftell(ml_tbl_fd)) < 0
        || fseek(ml_tbl_fd, 0L, SEEK_SET) != 0) {
        fprintf(stderr, "[E::%s::ERT] Can't determine the size of the multi-level tree index.\n", __func__);
        exit(1);
    }
    idx->ert->mlt_size = size;
    // Request at least one byte so a NULL result always means failure.
    idx->ert->mlt_table = (uint8_t *)malloc(size > 0 ? (size_t)size : 1);
    if (idx->ert->mlt_table == NULL) {
        fprintf(stderr, "[E::%s::ERT] Out of memory for the multi-level tree index.\n", __func__);
        exit(1);
    }
    if (bwa_verbose >= 3) {
        fprintf(stderr, "[M::%s::ERT] Reading multi-level tree index to memory\n", __func__);
    }
    err_fread_noeof(idx->ert->mlt_table, sizeof(uint8_t), size, ml_tbl_fd);

    fclose(kmer_tbl_fd);
    fclose(ml_tbl_fd);
    if (bwa_verbose >= 3) {
        fprintf(stderr, "[M::%s::ERT] Index tables loaded in %.3f CPU sec, %.3f real sec...\n", __func__, cputime() - ctime, realtime() - rtime);
    }
    return idx;
}
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
{
bwaidx_t *idx;
@ -497,9 +580,14 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
}
idx = calloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) idx->fmt = bwa_idx_load_fmt(hint);
//if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
//idx->bwt->kmer_hash = idx->fmt->kmer_hash;
if (which & BWA_IDX_FMT) idx->fmt = bwa_idx_load_fmt(hint);
if (which & BWA_IDX_BWT)
{
idx->bwt = bwa_idx_load_bwt(hint);
if (which & BWA_IDX_FMT)
idx->bwt->kmer_hash = idx->fmt->kmer_hash;
}
if (which & BWA_IDX_BNS)
{
@ -515,10 +603,12 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
err_fclose(idx->bns->fp_pac);
idx->bns->fp_pac = 0;
// 赋值到fmt中对应的pac
if (which & BWA_IDX_FMT) {
idx->fmt->l_pac = idx->bns->l_pac;
idx->fmt->pac = idx->pac;
}
}
}
free(prefix);
return idx;
}
@ -550,7 +640,7 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
// generate idx->bwt
x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
x = SA_BYTES(idx->bwt->n_sa); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
// generate idx->bns and idx->pac
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
@ -567,6 +657,176 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
return 0;
}
/*
 * Rebuild idx->bns and idx->pac from the packed image at *mem_ + *k_.
 *
 * The bntseq_t header and the annotation array are copied onto the heap;
 * the ambiguity array, every name/anno string and the pac bytes are left
 * in place and referenced by pointers into the image. On return *k_ has
 * been advanced past the pac bytes.
 */
static void mem_to_bnspac(bwaidx_t *idx, uint8_t **mem_, int64_t *k_)
{
    uint8_t *image = *mem_;
    int64_t pos = *k_;
    int64_t len;
    int s;

    /* header: private heap copy */
    len = sizeof(bntseq_t);
    idx->bns = malloc(len);
    memcpy(idx->bns, image + pos, len);
    pos += len;

    /* ambiguity records: used in place */
    len = idx->bns->n_holes * sizeof(bntamb1_t);
    idx->bns->ambs = (bntamb1_t*)(image + pos);
    pos += len;

    /* annotation records: private heap copy, string pointers fixed below */
    len = idx->bns->n_seqs * sizeof(bntann1_t);
    idx->bns->anns = malloc(len);
    memcpy(idx->bns->anns, image + pos, len);
    pos += len;

    for (s = 0; s < idx->bns->n_seqs; ++s) {
        bntann1_t *ann = &idx->bns->anns[s];
        ann->name = (char*)(image + pos);
        pos += strlen(ann->name) + 1;
        ann->anno = (char*)(image + pos);
        pos += strlen(ann->anno) + 1;
    }

    /* packed reference: used in place */
    idx->pac = (uint8_t*)(image + pos);
    pos += idx->bns->l_pac/4+1;

    *k_ = pos;
    *mem_ = image;
}
/*
 * Point idx at an FMT index stored in one contiguous image of l_mem bytes.
 *
 * Image order: FMTIndex header, bwt words, sampled SA, the k-mer hash
 * tables (10, 11, 12 and, depending on HASH_KMER_LEN, 13 and 14), then the
 * bns/pac section. Only the FMTIndex header is copied to the heap; the
 * tables are referenced in place. Asserts that exactly l_mem bytes were
 * consumed. Always returns 0.
 */
int bwa_mem2fmtidx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
{
    int64_t pos = 0;
    int64_t len;

    /* FMTIndex header */
    len = sizeof(FMTIndex);
    idx->fmt = malloc(len);
    memcpy(idx->fmt, mem + pos, len);
    pos += len;

    /* bwt words */
    len = idx->fmt->bwt_size * 4;
    idx->fmt->bwt = (uint32_t*)(mem + pos);
    pos += len;

    /* sampled suffix array */
    len = SA_BYTES(idx->fmt->n_sa);
    idx->fmt->sa = (uint8_t*)(mem + pos);
    pos += len;

    /* kmer hash: one table per k, 4^k entries each */
    len = (1 << (10 << 1)) * sizeof(KmerEntryArr);
    idx->fmt->kmer_hash.ke10 = (KmerEntryArr*)(mem + pos);
    pos += len;

    len = (1 << (11 << 1)) * sizeof(KmerEntry);
    idx->fmt->kmer_hash.ke11 = (KmerEntry*)(mem + pos);
    pos += len;

    len = (1 << (12 << 1)) * sizeof(KmerEntry);
    idx->fmt->kmer_hash.ke12 = (KmerEntry*)(mem + pos);
    pos += len;
#if HASH_KMER_LEN > 12
    len = (1 << (13 << 1)) * sizeof(KmerEntry);
    idx->fmt->kmer_hash.ke13 = (KmerEntry*)(mem + pos);
    pos += len;
#endif
#if HASH_KMER_LEN > 13
    len = (1 << (14 << 1)) * sizeof(KmerEntry);
    idx->fmt->kmer_hash.ke14 = (KmerEntry*)(mem + pos);
    pos += len;
#endif

    /* bns and pac */
    mem_to_bnspac(idx, &mem, &pos);
    assert(pos == l_mem);

    idx->l_mem = pos;
    idx->mem = mem;
    idx->fmt->pac = idx->pac;
    return 0;
}
/*
 * Point idx at an ERT index stored in one contiguous image of l_mem bytes.
 *
 * Image order: ERT header, multi-level tree table (mlt_size bytes), k-mer
 * table (kmer_size bytes), then the bns/pac section. Only the ERT header is
 * copied to the heap; both tables are referenced in place. Asserts that
 * exactly l_mem bytes were consumed. Always returns 0.
 */
int bwa_mem2ertidx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
{
    int64_t pos = 0;
    int64_t len;

    /* ERT header */
    len = sizeof(ERT);
    idx->ert = malloc(len);
    memcpy(idx->ert, mem + pos, len);
    pos += len;

    /* multi-level tree table */
    len = idx->ert->mlt_size;
    idx->ert->mlt_table = (uint8_t *)(mem + pos);
    pos += len;

    /* k-mer table */
    len = idx->ert->kmer_size;
    idx->ert->kmer_offsets = (uint64_t *)(mem + pos);
    pos += len;

    /* bns and pac */
    mem_to_bnspac(idx, &mem, &pos);
    assert(pos == l_mem);

    idx->l_mem = pos;
    idx->mem = mem;
    return 0;
}
/*
 * Append idx->bns (header, ambiguity records, annotation records and their
 * name/anno strings) to the image at *mem_ + *k_, growing the image with
 * realloc. The ambs array, every name/anno string and the anns array are
 * freed once copied; the bntseq_t header itself is left allocated.
 * *mem_ and *k_ are updated to the new buffer and write position.
 */
static void move_bns_to_mem(bwaidx_t *idx, uint8_t **mem_, int64_t *k_)
{
    uint8_t *image = *mem_;
    int64_t pos = *k_;
    int64_t len, extra;
    int s;

    /* size of everything that hangs off the header */
    extra = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
    for (s = 0; s < idx->bns->n_seqs; ++s) // compute the size of heap-allocated memory
        extra += strlen(idx->bns->anns[s].name) + strlen(idx->bns->anns[s].anno) + 2;
    image = realloc(image, pos + sizeof(bntseq_t) + extra);

    /* header */
    len = sizeof(bntseq_t);
    memcpy(image + pos, idx->bns, len);
    pos += len;

    /* ambiguity records */
    len = idx->bns->n_holes * sizeof(bntamb1_t);
    memcpy(image + pos, idx->bns->ambs, len);
    pos += len;
    free(idx->bns->ambs);

    /* annotation records, followed by each record's two strings */
    len = idx->bns->n_seqs * sizeof(bntann1_t);
    memcpy(image + pos, idx->bns->anns, len);
    pos += len;
    for (s = 0; s < idx->bns->n_seqs; ++s) {
        bntann1_t *ann = &idx->bns->anns[s];

        len = strlen(ann->name) + 1;
        memcpy(image + pos, ann->name, len);
        pos += len;

        len = strlen(ann->anno) + 1;
        memcpy(image + pos, ann->anno, len);
        pos += len;

        free(ann->name);
        free(ann->anno);
    }
    free(idx->bns->anns);

    *k_ = pos;
    *mem_ = image;
}
/*
 * Append the packed reference (l_pac/4+1 bytes) to the image at
 * *mem_ + *k_, growing the image with realloc, then release both idx->bns
 * and idx->pac and clear the two pointers. *mem_ and *k_ are updated to the
 * new buffer and write position.
 */
static void move_pac_to_mem(bwaidx_t *idx, uint8_t **mem_, int64_t *k_)
{
    uint8_t *image = *mem_;
    int64_t pos = *k_;
    int64_t pac_bytes = idx->bns->l_pac/4+1;

    // copy idx->pac
    image = realloc(image, pos + pac_bytes);
    memcpy(image + pos, idx->pac, pac_bytes);
    pos += pac_bytes;

    free(idx->bns);
    idx->bns = 0;
    free(idx->pac);
    idx->pac = 0;

    *k_ = pos;
    *mem_ = image;
}
// Pack an ERT index (idx->ert, idx->bns, idx->pac) into one contiguous
// heap buffer and re-point idx at it through bwa_mem2ertidx().
//
// Resulting layout: ERT header | mlt_table (mlt_size bytes) |
// kmer_offsets (kmer_size bytes) | bns section | pac.
// The individual heap blocks are freed as they are copied.
// Returns the result of bwa_mem2ertidx().
//
// NOTE(review): kmer_offsets is later read as uint64_t at offset
// sizeof(ERT) + mlt_size, so its alignment depends on mlt_size - confirm
// this is acceptable on the target platforms.
int bwa_ertidx2mem(bwaidx_t *idx)
{
int64_t x, k;
uint8_t *mem;
// copy idx->ert
x = idx->ert->mlt_size;
// Grow the mlt_table block itself so it becomes the start of the image,
// then shift the table up to make room for the ERT header in front of it.
mem = realloc(idx->ert->mlt_table, sizeof(ERT) + x); idx->ert->mlt_table = 0;
memmove(mem + sizeof(ERT), mem, x);
memcpy(mem, idx->ert, sizeof(ERT)); k = sizeof(ERT) + x;
// Append the k-mer table after the multi-level tree table.
x = idx->ert->kmer_size;
mem = realloc(mem, k + x); memcpy(mem + k, idx->ert->kmer_offsets, x); k += x;
free(idx->ert->kmer_offsets); idx->ert->kmer_offsets = 0;
free(idx->ert);
idx->ert = 0;
// copy idx->bns
move_bns_to_mem(idx, &mem, &k);
// copy idx->pac
move_pac_to_mem(idx, &mem, &k);
// Rebuild idx->ert/bns/pac as views into the packed image.
return bwa_mem2ertidx(k, mem, idx);
}
// Pack an FMT index (idx->fmt with its k-mer hash tables, idx->bns,
// idx->pac) into one contiguous heap buffer and re-point idx at it through
// bwa_mem2fmtidx().
//
// Resulting layout: FMTIndex header | bwt words | sampled SA |
// ke10 | ke11 | ke12 | [ke13] | [ke14] | bns section | pac.
// The individual heap blocks are freed as they are copied.
// Returns the result of bwa_mem2fmtidx().
int bwa_fmtidx2mem(bwaidx_t *idx)
{
int64_t k, x;
uint8_t *mem;
// copy idx->fmt
x = idx->fmt->bwt_size * 4;
// Grow the bwt block itself so it becomes the start of the image, then
// shift the bwt words up to make room for the FMTIndex header.
mem = realloc(idx->fmt->bwt, sizeof(FMTIndex) + x); idx->fmt->bwt = 0;
memmove(mem + sizeof(FMTIndex), mem, x);
memcpy(mem, idx->fmt, sizeof(FMTIndex)); k = sizeof(FMTIndex) + x;
x = SA_BYTES(idx->fmt->n_sa); mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->sa, x); k += x;
// kmer hash
// Each table holds 4^k entries (1 << 2k); sizes must match bwa_mem2fmtidx().
x = (1 << (10 << 1)) * sizeof(KmerEntryArr);
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke10, x); k += x;
x = (1 << (11 << 1)) * sizeof(KmerEntry);
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke11, x); k += x;
x = (1 << (12 << 1)) * sizeof(KmerEntry);
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke12, x); k += x;
#if HASH_KMER_LEN > 12
x = (1 << (13 << 1)) * sizeof(KmerEntry);
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke13, x); k += x;
#endif
#if HASH_KMER_LEN > 13
x = (1 << (14 << 1)) * sizeof(KmerEntry);
mem = realloc(mem, k + x); memcpy(mem + k, idx->fmt->kmer_hash.ke14, x); k += x;
#endif
free(idx->fmt->kmer_hash.ke10);
free(idx->fmt->kmer_hash.ke11);
free(idx->fmt->kmer_hash.ke12);
// NOTE(review): ke13/ke14 are copied only under the HASH_KMER_LEN guards
// above but freed unconditionally; this presumably relies on the loader
// leaving them NULL when unused - confirm against bwa_idx_load_fmt().
free(idx->fmt->kmer_hash.ke13);
free(idx->fmt->kmer_hash.ke14);
free(idx->fmt->sa);
free(idx->fmt); idx->fmt = 0;
// copy idx->bns
move_bns_to_mem(idx, &mem, &k);
// copy idx->pac
move_pac_to_mem(idx, &mem, &k);
// Rebuild idx->fmt/bns/pac as views into the packed image.
return bwa_mem2fmtidx(k, mem, idx);
}
int bwa_idx2mem(bwaidx_t *idx)
{
int i;
@ -574,14 +834,16 @@ int bwa_idx2mem(bwaidx_t *idx)
uint8_t *mem;
// copy idx->bwt
x = idx->bwt->bwt_size * 4;
mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
memmove(mem + sizeof(bwt_t), mem, x);
memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
x = SA_BYTES(idx->bwt->n_sa); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
free(idx->bwt->sa);
free(idx->bwt); idx->bwt = 0;
// copy idx->bns
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory

18
bwa.h
View File

@ -36,7 +36,8 @@
#define BWA_IDX_BWT 0x1
#define BWA_IDX_BNS 0x2
#define BWA_IDX_PAC 0x4
#define BWA_IDX_ALL 0x7
#define BWA_IDX_FMT 0x8
#define BWA_IDX_ALL 0xF
#define BWA_CTL_SIZE 0x10000
@ -47,11 +48,19 @@
#define BWA_DBG_QNAME 0x1
// In-memory ERT seeding index. The struct is copied byte-for-byte into the
// packed index image, so field order and sizes are part of that layout.
typedef struct {
uint64_t *kmer_offsets; // k-mer table: 64-bit entries, kmer_size bytes in total
uint8_t *mlt_table; // multi-level tree table, mlt_size bytes
uint64_t kmer_size; // size of kmer_offsets in bytes
uint64_t mlt_size; // size of mlt_table in bytes
} ERT;
typedef struct {
bwt_t *bwt; // FM-index
FMTIndex *fmt;// FMT-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
ERT *ert;
int is_shm;
int64_t l_mem;
@ -91,10 +100,17 @@ extern "C" {
bwaidx_t *bwa_idx_load_from_shm(const char *hint);
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
bwaidx_t *bwa_fmtidx_load_from_shm(const char *hint);
bwaidx_t *bwa_ertidx_load_from_shm(const char *hint);
bwaidx_t *bwa_ertidx_load_from_disk(const char *hint);
bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);
int bwa_idx2mem(bwaidx_t *idx);
int bwa_fmtidx2mem(bwaidx_t *idx);
int bwa_ertidx2mem(bwaidx_t *idx);
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
int bwa_mem2fmtidx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
int bwa_mem2ertidx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line);
char *bwa_set_rg(const char *s);

View File

@ -9,8 +9,9 @@
#include <stdio.h>
#include "bwa.h"
int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
int bwa_shm_stage(bwaidx_t *idx, const char *hint, int useERT, const char *_tmpfn)
{
const char *name;
uint8_t *shm, *shm_idx;
uint16_t *cnt;
@ -26,7 +27,7 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
to_init = 1;
}
if (shmid < 0) return -1;
ftruncate(shmid, BWA_CTL_SIZE);
if (ftruncate(shmid, BWA_CTL_SIZE) < 0) return -1;
shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
cnt = (uint16_t*)shm;
if (to_init) {
@ -34,7 +35,11 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
cnt[1] = 4;
}
if (idx->mem == 0) bwa_idx2mem(idx);
// 将所有索引数据放在连续的内存空间里
if (idx->mem == 0) {
if (useERT) bwa_ertidx2mem(idx);
else bwa_fmtidx2mem(idx);
}
if (tmpfn) {
FILE *fp;
@ -63,7 +68,7 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
memcpy(shm + cnt[1], &idx->l_mem, 8);
memcpy(shm + cnt[1] + 8, name, l - 8);
cnt[1] += l; ++cnt[0];
ftruncate(shmid, idx->l_mem);
if (ftruncate(shmid, idx->l_mem) < 0) return -1;
shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
if (tmpfn) {
FILE *fp;
@ -79,11 +84,58 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
memcpy(shm_idx, idx->mem, idx->l_mem);
free(idx->mem);
}
bwa_mem2idx(idx->l_mem, shm_idx, idx);
if (useERT)
bwa_mem2ertidx(idx->l_mem, shm_idx, idx);
else
bwa_mem2fmtidx(idx->l_mem, shm_idx, idx);
idx->is_shm = 1;
return 0;
}
/*
 * Shared prologue for the *_load_from_shm() functions. It expands inside a
 * function that has a string named `hint` in scope and returns bwaidx_t *.
 *
 * It declares the locals name, shm, shm_idx, cnt, i, p, path, shmid, l_mem
 * and idx, then:
 *   1. returns 0 if hint is NULL or empty;
 *   2. takes `name` as the part of hint after the last '/';
 *   3. maps the read-only control segment "/bwactl" (BWA_CTL_SIZE bytes);
 *      cnt[0] is the number of entries, which start at byte 4 and each
 *      consist of an 8-byte length followed by a NUL-terminated name;
 *   4. returns 0 if there are no entries or none matches `name`;
 *   5. maps "/bwaidx-<name>" read-only for l_mem bytes into shm_idx;
 *   6. allocates a zeroed bwaidx_t into idx.
 *
 * Neither mmap() result is checked, and the macro never closes the
 * descriptors it opens nor unmaps the control segment.
 */
#define INIT_SHM_LOAD \
const char *name; \
uint8_t *shm, *shm_idx; \
uint16_t *cnt, i; \
char *p, path[PATH_MAX + 1]; \
int shmid; \
int64_t l_mem; \
bwaidx_t *idx; \
if (hint == 0 || hint[0] == 0) return 0; \
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); \
++name; \
if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0; \
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); \
cnt = (uint16_t*)shm; \
if (cnt[0] == 0) return 0; \
for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) { \
memcpy(&l_mem, p, 8); p += 8; \
if (strcmp(p, name) == 0) break; \
p += strlen(p) + 1; \
} \
if (i == cnt[0]) return 0; \
strcat(strcpy(path, "/bwaidx-"), name); \
if ((shmid = shm_open(path, O_RDONLY, 0)) < 0) return 0; \
shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0); \
idx = calloc(1, sizeof(bwaidx_t));
/**
 * Attach to an ERT index previously staged in shared memory.
 *
 * ERT images are staged under the name "<basename of hint_>.ert", so the
 * suffix is appended before the lookup done by INIT_SHM_LOAD.
 *
 * @param hint_  index prefix as given on the command line.
 * @return an index whose tables point into the shared segment
 *         (is_shm == 1), or NULL when hint_ is NULL/empty, when the
 *         suffixed name does not fit in PATH_MAX, or when no matching
 *         segment exists.
 */
bwaidx_t *bwa_ertidx_load_from_shm(const char *hint_)
{
    char hint[PATH_MAX];
    int n;

    // INIT_SHM_LOAD only sees the local array `hint`, which is never NULL,
    // so the caller's pointer has to be validated here before formatting.
    if (hint_ == 0 || hint_[0] == 0) return 0;
    n = snprintf(hint, sizeof(hint), "%s.ert", hint_);
    if (n < 0 || (size_t)n >= sizeof(hint)) return 0; // would have overflowed hint[]

    INIT_SHM_LOAD;
    bwa_mem2ertidx(l_mem, shm_idx, idx);
    idx->is_shm = 1;
    return idx;
}
// Attach to an FMT index previously staged in shared memory under the
// basename of hint. INIT_SHM_LOAD declares the locals used below (l_mem,
// shm_idx, idx) and returns 0 from this function when hint is NULL/empty or
// no matching segment is found. On success the returned index points into
// the shared mapping and has is_shm set.
bwaidx_t *bwa_fmtidx_load_from_shm(const char *hint)
{
INIT_SHM_LOAD;
bwa_mem2fmtidx(l_mem, shm_idx, idx);
idx->is_shm = 1;
return idx;
}
bwaidx_t *bwa_idx_load_from_shm(const char *hint)
{
const char *name;
@ -178,18 +230,22 @@ int bwa_shm_destroy(void)
int main_shm(int argc, char *argv[])
{
int c, to_list = 0, to_drop = 0, ret = 0;
int c, to_list = 0, to_drop = 0, ret = 0, useERT = 0;
char *tmpfn = 0;
while ((c = getopt(argc, argv, "ldf:")) >= 0) {
char shm_prefix[PATH_MAX];
while ((c = getopt(argc, argv, "ldf:Z")) >= 0)
{
if (c == 'l') to_list = 1;
else if (c == 'd') to_drop = 1;
else if (c == 'f') tmpfn = optarg;
else if (c == 'Z') useERT = 1;
}
if (optind == argc && !to_list && !to_drop) {
fprintf(stderr, "\nUsage: fastbwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
fprintf(stderr, " -l list names of indices in shared memory\n");
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n");
fprintf(stderr, " -Z use Ert as seeding index\n\n");
return 1;
}
if (optind < argc && (to_list || to_drop)) {
@ -197,15 +253,25 @@ int main_shm(int argc, char *argv[])
return 1;
}
if (optind < argc) {
if (bwa_shm_test(argv[optind]) == 0) {
if (useERT)
sprintf(shm_prefix, "%s.ert", argv[optind]);
else
sprintf(shm_prefix, "%s", argv[optind]);
if (bwa_shm_test(shm_prefix) == 0)
{
bwaidx_t *idx;
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
if (useERT)
idx = bwa_ertidx_load_from_disk(argv[optind]);
else
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_BNS | BWA_IDX_PAC | BWA_IDX_FMT);
if (bwa_shm_stage(idx, shm_prefix, useERT, tmpfn) < 0) {
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
ret = 1;
}
bwa_idx_destroy(idx);
} else fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
}
else
fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
}
if (to_list) bwa_shm_list();
if (to_drop) bwa_shm_destroy();

View File

@ -876,7 +876,7 @@ static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __r
bgint_t i, c;
bgint_t s, r;
bgint_t lastRank, lastIndex;
bgint_t oldInverseSa0RelativeRank = 0;
// bgint_t oldInverseSa0RelativeRank = 0;
bgint_t freq;
lastIndex = numItem;
@ -887,7 +887,7 @@ static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __r
s = seq[numItem];
relativeRank[s] = numItem;
if (lastRank == oldInverseSa0) {
oldInverseSa0RelativeRank = numItem;
// oldInverseSa0RelativeRank = numItem;
oldInverseSa0++; // so that this segment of code is not run again
lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
}
@ -920,7 +920,7 @@ static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __r
lastRank = r;
relativeRank[s] = i;
if (r == oldInverseSa0) {
oldInverseSa0RelativeRank = i;
// oldInverseSa0RelativeRank = i;
oldInverseSa0++; // so that this segment of code is not run again
lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
}
@ -950,14 +950,14 @@ static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank,
static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt)
{
unsigned int bitsInWordMinusBitPerChar;
// unsigned int bitsInWordMinusBitPerChar;
bgint_t leftShift, rightShift;
bgint_t o;
bgint_t oIndex, iIndex, mIndex;
bgint_t mWord, mChar, oWord, oChar;
bgint_t numInsert;
bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
// bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
oIndex = 0;
iIndex = 0;

View File

@ -36,6 +36,7 @@
#include "utils.h"
#include "rle.h"
#include "rope.h"
#include "ertindex.h"
#ifdef _DIVBWT
#include "divsufsort.h"
@ -258,8 +259,41 @@ int bwa_build_kmer(int argc, char *argv[])
return 0;
}
/**
 * "bwt2ert" command: build the ERT tables from an existing BWT index.
 *
 * Usage: fastbwa bwt2ert [-t INT] <index_prefix>
 *
 * Loads the BNS, BWT and PAC parts of the index at <index_prefix> and hands
 * them to buildERTKmerTrees(), which is given "<index_prefix>.ert.kmer.table"
 * as the k-mer table file name.
 *
 * @return 0 on success; 1 on a bad option, an invalid thread count, a
 *         missing or over-long prefix, or when the index cannot be loaded.
 */
int bwa_bwt2ert(int argc, char *argv[])
{
    char prefix[1024];
    char ert_kmer_file[1124];
    int c, n, num_threads = 1;
    bwaidx_t *bid;

    while ((c = getopt(argc, argv, "t:")) >= 0)
    {
        switch (c)
        {
        case 't': {
            // strtol instead of atoi so that "abc", "4x", 0 and negative
            // values are rejected rather than silently used.
            char *end;
            long v = strtol(optarg, &end, 10);
            if (end == optarg || *end != '\0' || v < 1 || (long)(int)v != v) {
                fprintf(stderr, "[E::%s] invalid number of threads '%s'\n", __func__, optarg);
                return 1;
            }
            num_threads = (int)v;
            break;
        }
        default:
            return 1;
        }
    }
    if (optind + 1 > argc)
    {
        fprintf(stderr, "Usage: fastbwa bwt2ert [-t INT] <index_prefix>\n");
        return 1;
    }

    // The prefix comes straight from argv; bound both copies instead of
    // trusting it to fit the fixed-size buffers.
    n = snprintf(prefix, sizeof(prefix), "%s", argv[optind]);
    if (n < 0 || (size_t)n >= sizeof(prefix)) {
        fprintf(stderr, "[E::%s] index prefix is too long\n", __func__);
        return 1;
    }
    n = snprintf(ert_kmer_file, sizeof(ert_kmer_file), "%s.%s", prefix, "ert.kmer.table");
    if (n < 0 || (size_t)n >= sizeof(ert_kmer_file)) {
        fprintf(stderr, "[E::%s] index prefix is too long\n", __func__);
        return 1;
    }
    fprintf(stderr, "%s\n", ert_kmer_file);

    // Load BWT index
    bid = bwa_idx_load_from_disk(prefix, BWA_IDX_BNS | BWA_IDX_BWT | BWA_IDX_PAC);
    if (bid == 0) {
        fprintf(stderr, "[E::%s] failed to load the index '%s'\n", __func__, prefix);
        return 1;
    }

    // Build ERT
    buildERTKmerTrees(ert_kmer_file, bid, prefix, num_threads, ERT_MAX_READ_LEN);
    return 0;
}
int bwa_index(int argc, char *argv[]) // the "index" command
{
int num_threads = 1;
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
char *prefix = 0, *str;
while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
@ -272,6 +306,10 @@ int bwa_index(int argc, char *argv[]) // the "index" command
break;
case 'p': prefix = strdup(optarg); break;
case '6': is_64 = 1; break;
case 't':
num_threads = atoi(optarg);
assert(num_threads > 0 && num_threads < 256);
break;
case 'b':
block_size = strtol(optarg, &str, 10);
if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024;
@ -287,6 +325,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
fprintf(stderr, "Usage: fastbwa index [options] <in.fasta>\n\n");
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
fprintf(stderr, " -t INT number of threads for ERT index building [%d]\n", num_threads);
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
fprintf(stderr, "\n");

946
ertindex.c 100644
View File

@ -0,0 +1,946 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdint.h>
#include <limits.h>
#include <pthread.h>
#include <errno.h>
#include <math.h>
#include "utils.h"
#include "ertindex.h"
// OR the 2-bit code c into packed array pac at base index l. Base l lives in
// byte l >> 2 at bit offset (l & 3) * 2, i.e. the first base of each byte
// takes the two lowest bits. The macro only ORs, so it does not clear
// whatever was stored there before.
#define _set_pac(pac, l, c) ((pac)[(l) >> 2] |= (c) << (((l) & 3) << 1))
// Same as _set_pac but with the opposite order inside a byte: bit offset is
// (~l & 3) * 2, so the first base of each byte takes the two highest bits.
#define _set_pac_orig(pac, l, c) ((pac)[(l) >> 2] |= (c) << ((~(l) & 3) << 1))
// Field widths in bits; addBytesForEntry() turns them into byte counts.
const uint8_t char_count_size_in_bits = 8;
const uint8_t hits_count_size_in_bits = 8;
const uint8_t ref_ptr_size_in_bits = 40;
const uint8_t leaf_offset_ptr_size_in_bits = 8;
const uint8_t other_offset_ptr_size_in_bits = 32;
// int test_max_intv = 0;
/*
 * Count how many of the four extension intervals in ok[] are non-empty
 * (x[2] > 0). The count is ADDED to *numBranches, so the caller must reset
 * it first; *uniform_bp is set to the highest base with a non-empty
 * interval and left untouched when there is none.
 */
static inline void getNumBranchesForKmer(bwtintv_t ok[4], int* numBranches, uint8_t* uniform_bp) {
    uint8_t base = 0;

    while (base < 4) {
        if (ok[base].x[2] > 0) {
            *numBranches += 1;
            *uniform_bp = base;
        }
        ++base;
    }
}
// Unpack a 2-bit-per-base k-mer into a query array, starting from the least
// significant bits: a[0] receives bits 0-1 of x, a[1] bits 2-3, and so on
// for l bases.
static inline void kmertoquery(uint64_t x, uint8_t *a, int l)
{
    uint8_t *out = a;
    uint8_t *const end = a + (l > 0 ? l : 0);

    for (; out != end; ++out) {
        *out = (uint8_t)(x & 0x3);
        x >>= 2;
    }
}
/*
 * Number of whole bytes needed to store one tree entry of the given type.
 *
 * count is only used for UNIFORM_BP (2 bits per base) and numHits only for
 * LEAF_HITS (one reference pointer per hit). Unknown types and EMPTY_NODE
 * need 0 bytes. The bit width is rounded up to a full byte.
 */
static inline uint64_t addBytesForEntry(byte_type_t type, int count, int numHits)
{
    uint64_t bits;

    if (type == CODE) {
        bits = 8;
    } else if (type == LEAF_COUNT) {
        bits = (hits_count_size_in_bits);
    } else if (type == LEAF_HITS) {
        bits = (ref_ptr_size_in_bits * numHits);
    } else if (type == UNIFORM_COUNT) { // "Uniform"
        bits = (char_count_size_in_bits);
    } else if (type == UNIFORM_BP) {
        bits = (count << 1);
    } else if (type == LEAF_PTR) { // "Leaf Offset Pointer"
        bits = leaf_offset_ptr_size_in_bits;
    } else if (type == OTHER_PTR) {
        bits = other_offset_ptr_size_in_bits;
    } else { // EMPTY_NODE and anything unrecognised
        bits = 0;
    }

    if (bits % 8)
        return bits / 8 + 1;
    return bits / 8;
}
/*
 * Append c to p's child list and set c's parent link. A node holds at most
 * four children; the assert rejects a parent that already has more than
 * three.
 */
void addChildNode(node_t* p, node_t* c) {
    assert(p->numChildren <= 3);
    c->parent_node = p;
    p->child_nodes[p->numChildren] = c;
    p->numChildren++;
}
/*
 * Turn n into a LEAF for the suffix-array interval ik.
 *
 * numHits is taken from ik.x[2] and a zeroed hits array of that length is
 * allocated. The reference positions are looked up through fmt_sa() only
 * when step == 1; for any other step the array stays zero-filled.
 */
void handleLeaf(const bwaidx_t *bid, bwtintv_t ik, node_t *n, int step)
{
    int h;

    n->type = LEAF;
    n->numHits = ik.x[2];
    n->hits = (uint64_t *)calloc(n->numHits, sizeof(uint64_t));
    assert(n->hits != NULL);

    if (step != 1)
        return;

    for (h = 0; h < n->numHits; ++h) {
        uint64_t where = fmt_sa(bid->fmt, ik.x[0] + h);
        n->hits[h] = where;
    }
}
// Create the four children of parent_node, one per base, from the extension
// intervals in ok[] (visited from base 3 down to base 0):
//   - interval empty (x[2] == 0)              -> EMPTY child
//   - more than one hit and depth != max_depth -> DIVERGE child, expanded
//                                                 recursively
//   - otherwise (single hit, or max depth)    -> leaf via handleLeaf()
// depth is the index in seq[] where the child's base is stored.
// step is forwarded to handleLeaf()/ert_build_kmertree().
void handleDivergence(const bwaidx_t *bid, bwtintv_t ok[4], int depth, node_t *parent_node, int step, int max_depth)
{
int i;
bwtintv_t ok_copy[4];
bwtintv_t ik_new;
// Work on a private copy: ok[] is handed to the recursive call below as
// scratch space, where bwt_extend() is called on it again.
memcpy_bwamem(ok_copy, 4 * sizeof(bwtintv_t), ok, 4 * sizeof(bwtintv_t), __FILE__, __LINE__);
for (i = 3; i >= 0; --i) {
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->numChildren = 0;
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
n->pos = 0;
n->num_bp = 0;
if (ok_copy[i].x[2] == 0) { //!< Empty node
n->type = EMPTY;
n->numHits = 0;
// NOTE(review): this branch copies depth - 1 bytes while the other two
// branches copy parent_node->l_seq bytes - confirm the difference is
// intended.
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, (depth - 1) * sizeof(uint8_t), __FILE__, __LINE__);
n->l_seq = depth;
addChildNode(parent_node, n);
} else if (ok_copy[i].x[2] > 1 && depth != max_depth) { // maximum length not reached yet, keep extending
ik_new = ok_copy[i]; ik_new.info = depth + 1;
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
assert(depth >= 0);
n->seq[depth] = i;
n->pos = depth;
n->num_bp = 1;
n->l_seq = depth + 1;
n->numHits = ok_copy[i].x[2];
n->type = DIVERGE;
addChildNode(parent_node, n);
ert_build_kmertree(bid, ik_new, ok, depth + 1, n, step, max_depth);
} else { // leaf intv.x[2] == 1 or depth = max_depth
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
n->seq[depth] = i;
n->pos = depth;
n->num_bp = 1;
n->l_seq = depth + 1;
handleLeaf(bid, ok_copy[i], n, step);
addChildNode(parent_node, n);
}
}
}
// Grow the tree below parent_node for the interval ik, starting at
// seq[] index curDepth.
//
// ik is extended by one base. If exactly one base has hits, a single child
// is created and extended base by base until the path either diverges (the
// child becomes a UNIFORM node and handleDivergence() adds its children) or
// reaches max_depth (the child becomes a leaf). If the first extension
// already yields zero or several branches, handleDivergence() is applied to
// parent_node directly.
//
// ok[] is scratch storage for the extension results and is overwritten.
// step is forwarded to handleLeaf()/handleDivergence().
void ert_build_kmertree(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], int curDepth, node_t *parent_node, int step, int max_depth)
{
uint8_t uniform_bp = 0;
int numBranches = 0, depth = curDepth;
bwt_extend(bid->bwt, &ik, ok, 0); //!< Extend right by 1bp
/// Check if we need to make a uniform entry
getNumBranchesForKmer(ok, &numBranches, &uniform_bp);
bwtintv_t ik_new;
/// If we find a uniform entry, extend further till we diverge or hit a leaf node
if (numBranches == 1) {
uint8_t uniformExtend = 1;
ik_new = ok[uniform_bp]; ik_new.info = depth+1;
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->numChildren = 0;
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
assert(depth >= 0);
n->seq[depth] = uniform_bp; // (original author's open question) stored at the last position?
n->numHits = ok[uniform_bp].x[2];
n->l_seq = depth + 1; // depth=16
n->pos = depth;
n->num_bp = 1;
addChildNode(parent_node, n);
if (depth < max_depth) {
// Remember the interval of the first extension: it is the one passed to
// handleLeaf() if the uniform run reaches max_depth.
bwtintv_t ok_init;
memcpy_bwamem(&ok_init, sizeof(bwtintv_t), &ok[uniform_bp], sizeof(bwtintv_t), __FILE__, __LINE__);
while (uniformExtend) { // extend one bp at a time
numBranches = 0; uniform_bp = 0;
depth += 1;
bwt_extend(bid->bwt, &ik_new, ok, 0); //!< Extend right by 1bp
getNumBranchesForKmer(ok, &numBranches, &uniform_bp);
assert(numBranches != 0);
if (numBranches == 1) { //<! Uniform
ik_new = ok[uniform_bp]; ik_new.info = depth+1;
n->seq[depth] = uniform_bp;
n->l_seq = depth + 1;
n->num_bp++;
/// Hit a multi-hit leaf node
if (depth == max_depth) {
// if (test_max_intv < ik_new.x[2]) test_max_intv = ik_new.x[2];
// fprintf(stderr, "depth: %d, num hit: %ld, max: %d\n", depth, ik_new.x[2], test_max_intv);
uniformExtend = 0;
handleLeaf(bid, ok_init, n, step);
}
} else { //!< Diverge
uniformExtend = 0;
n->type = UNIFORM;
handleDivergence(bid, ok, depth, n, step, max_depth);
}
}
} else { //<! Uniform, depth == max_depth, multi-hit leaf node
uniformExtend = 0;
handleLeaf(bid, ok[uniform_bp], n, step);
}
} //!< End uniform entry
else { //!< Diverge, empty or leaf, same as above
handleDivergence(bid, ok, depth, parent_node, step, max_depth);
} //!< End diverge
}
// Emit the x-mer table and the trees below it for one k-mer.
//
// Starting at *size, the output holds numXmers 8-byte x-mer entries
// followed by the per-x-mer data (single-hit leaves or serialized trees).
// For each x-mer the interval ik of the k-mer (whose bases are in aq) is
// extended by up to xmerSize bases, then by number of hits:
//   0  -> entry marked INVALID, no data;
//   1  -> entry marked SINGLE_HIT_LEAF, 1 flag byte + 5-byte position;
//   >1 -> entry marked INFREQUENT, a tree is built with
//         ert_build_kmertree(), serialized with ert_traverse_kmertree()
//         and destroyed.
// Bytes are written only when step == 1; for any other step the function
// only advances the offsets, so it can be used to size the output.
// On return *size is the end of the emitted data and *mh_size the updated
// multi-hit offset; *numHits is incremented for single hits and passed on
// to the tree traversal.
void ert_build_table(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], uint8_t *mlt_data, uint8_t *mh_data,
uint64_t *size, uint64_t *mh_size, uint8_t *aq, uint64_t *numHits, uint64_t *max_next_ptr,
uint64_t next_ptr_width, int step, int max_depth)
{
uint64_t byte_idx = *size;
uint64_t mh_byte_idx = *mh_size;
int i, j;
uint8_t aq1[xmerSize];
assert(xmerSize <= 15);
// NOTE(review): lep1 is initialised once and only ever OR-ed into, so bits
// set for one x-mer carry over to all following x-mers - confirm whether it
// should be reset at the top of the loop.
uint64_t lep1 = 0;
// NOTE(review): c is read after the inner loop; it is only assigned inside
// that loop, so this relies on xmerSize >= 1.
uint8_t c;
uint64_t prevHits = ik.x[2];
bwtintv_t ik_copy = ik;
// Data area starts right after the numXmers 8-byte entries.
uint64_t mlt_byte_idx = byte_idx + (numXmers << 3);
uint64_t xmer_entry = 0;
uint16_t xmer_data = 0;
uint64_t mlt_offset = mlt_byte_idx;
for (i = 0; i < numXmers; ++i) {
kmertoquery(i, aq1, xmerSize);
for (j = 0; j < xmerSize; ++j) {
c = 3 - aq1[j];
bwt_extend(bid->bwt, &ik, ok, 0); //!< ok contains the result of BWT extension
if (ok[c].x[2] != prevHits) { //!< hit set changes
lep1 |= (1ULL << j);
}
/// Extend right till k-mer has zero hits
if (ok[c].x[2] >= 1) { prevHits = ok[c].x[2]; ik = ok[c]; ik.info = kmerSize + j + 1; }
else { break; }
}
uint64_t num_hits = ok[c].x[2];
if (num_hits == 0) {
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | INVALID;
}
else if (num_hits == 1) {
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | (SINGLE_HIT_LEAF);
if (step == 1) {
mlt_data[mlt_byte_idx] = 0; //!< Not a multi-hit
}
mlt_byte_idx++;
if (step == 1) {
uint64_t ref_pos = 0;
ref_pos = fmt_sa(bid->fmt, ok[c].x[0]);
// ref_pos = bwt_sa(bid->bwt, ok[c].x[0]);
// Position is stored shifted left by one (low bit clear), in 5 bytes.
uint64_t leaf_data = ref_pos << 1;
memcpy_bwamem(&mlt_data[mlt_byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
mlt_byte_idx += 5;
*numHits += 1;
}
else {
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | (INFREQUENT);
// Root of the tree for this x-mer: its sequence is the k-mer bases
// followed by the x-mer bases.
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->type = DIVERGE;
n->pos = 0;
n->num_bp = 0;
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), aq, kmerSize, __FILE__, __LINE__);
n->l_seq = kmerSize;
memcpy_bwamem(&n->seq[n->l_seq], xmerSize * sizeof(uint8_t), aq1, xmerSize * sizeof(uint8_t), __FILE__, __LINE__);
n->l_seq += xmerSize;
n->parent_node = 0;
n->numChildren = 0;
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
n->start_addr = mlt_byte_idx;
ert_build_kmertree(bid, ik, ok, kmerSize + j, n, step, max_depth);
ert_traverse_kmertree(n, mlt_data, mh_data, &mlt_byte_idx, &mh_byte_idx, kmerSize + j, numHits,
max_next_ptr, next_ptr_width, step);
ert_destroy_kmertree(n);
}
// Entry: data offset in the high bits, xmer_data in the low bits, the hit
// count at bit 17 (only when below 20) and the pointer width at bit 22.
if (num_hits < 20) {
xmer_entry = (mlt_offset << KMER_DATA_BITWIDTH) | (num_hits << 17) | xmer_data;
}
else {
xmer_entry = (mlt_offset << KMER_DATA_BITWIDTH) | xmer_data;
}
uint64_t ptr_width = (next_ptr_width < 4) ? next_ptr_width : 0;
xmer_entry |= (ptr_width << 22);
if (step == 1) {
memcpy_bwamem(&mlt_data[byte_idx], 8 * sizeof(uint8_t), &xmer_entry, 8 * sizeof(uint8_t), __FILE__, __LINE__);
}
byte_idx += 8;
mlt_offset = mlt_byte_idx;
// Restart from the k-mer's own interval for the next x-mer.
ik = ik_copy;
prevHits = ik_copy.x[2];
}
*size = mlt_byte_idx;
*mh_size = mh_byte_idx;
}
/*
 * Emit the one-byte node code at *byte_idx and advance the offset by 1.
 * The byte is written only when step == 1; otherwise only the offset moves.
 */
void addCode(uint8_t *mlt_data, uint64_t *byte_idx, uint8_t code, int step)
{
    const uint64_t at = *byte_idx;

    *byte_idx = at + 1;
    if (step != 1)
        return;
    memcpy_bwamem(&mlt_data[at], sizeof(uint8_t), &code, sizeof(uint8_t), __FILE__, __LINE__);
}
/*
 * Emit a uniform-run record at *byte_idx: a 2-byte base count followed by
 * the bases packed 2 bits each (first base in the highest bits of a byte),
 * then advance the offset past both. Bytes are written only when
 * step == 1. hitCount is accepted but not used.
 */
void addUniformNode(uint8_t *mlt_data, uint64_t *byte_idx, int count, uint8_t *uniformBases, uint64_t hitCount, int step)
{
    int numBytesForBP = addBytesForEntry(UNIFORM_BP, count, 0);
    assert(count < ERT_MAX_READ_LEN);

    if (step == 1) {
        uint8_t packUniformBases[numBytesForBP];
        uint8_t *record = &mlt_data[*byte_idx];
        int b;

        /* count: the first two bytes of the int, as stored in memory */
        memcpy_bwamem(record, sizeof(uint16_t), &count, sizeof(uint16_t), __FILE__, __LINE__);

        /* bases */
        memset(packUniformBases, 0, numBytesForBP * sizeof(uint8_t));
        for (b = 0; b < count; ++b) {
            _set_pac_orig(packUniformBases, b, uniformBases[b]);
        }
        memcpy_bwamem(record + 2, numBytesForBP * sizeof(uint8_t), packUniformBases, numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
    }
    *byte_idx += 2;
    *byte_idx += numBytesForBP;
}
//
// Append a 5-byte single-hit leaf entry: the reference position shifted left
// by one, leaving bit 0 clear. Stored only when step == 1; the write position
// advances by 5 for every step value.
//
void addLeafNode(uint8_t *mlt_data, uint64_t *byte_idx, uint64_t ref_pos, int step) {
    const uint64_t at = *byte_idx;

    *byte_idx = at + 5;
    if (step != 1) {
        return;
    }
    uint64_t entry = ref_pos << 1;
    memcpy_bwamem(mlt_data + at, 5 * sizeof(uint8_t), &entry, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
//
// Append `count` 5-byte multi-hit leaf entries, one per element of `hits`.
// Each entry is the hit position shifted left by one with bit 0 set.
// Entries are stored only when step == 1; the write position advances by
// 5 bytes per hit for every step value.
//
void addMultiHitLeafNode(uint8_t* mlt_data, uint64_t* byte_idx, uint64_t count, uint64_t* hits, int step)
{
    // The counter must be as wide as `count`: a 16-bit counter wraps at 65535
    // and would never reach a larger bound, looping forever past the buffer.
    uint64_t k;
    for (k = 0; k < count; ++k) {
        if (step == 1) {
            uint64_t leaf_data = (hits[k] << 1) | 1ULL;
            memcpy_bwamem(&mlt_data[*byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
        }
        *byte_idx += 5;
    }
}
//
// Append the hit count of a multi-hit leaf as a 2-byte field (the first two
// bytes of `count` as laid out in memory). Stored only when step == 1; the
// write position advances by 2 for every step value.
//
void addMultiHitLeafCount(uint8_t* mlt_data, uint64_t* byte_idx, uint64_t count, int step)
{
    const uint64_t at = *byte_idx;

    *byte_idx = at + 2;
    if (step != 1) {
        return;
    }
    memcpy_bwamem(mlt_data + at, 2 * sizeof(uint8_t), &count, 2 * sizeof(uint8_t), __FILE__, __LINE__);
}
//
// Append a 5-byte pointer to a multi-hit list: the multi-hit byte offset
// shifted left by one with bit 0 set. Stored only when step == 1; the write
// position advances by 5 for every step value.
//
void addMultiHitLeafPtr(uint8_t *mlt_data, uint64_t *byte_idx, uint64_t mh_byte_idx, int step)
{
    const uint64_t at = *byte_idx;

    *byte_idx = at + 5;
    if (step != 1) {
        return;
    }
    uint64_t tagged = (mh_byte_idx << 1) | 1ULL;
    memcpy_bwamem(mlt_data + at, 5 * sizeof(uint8_t), &tagged, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
//
// Serialize the subtree below node n.
//
// Tree bytes go to mlt_data starting at *size, multi-hit lists go to mh_data
// starting at *mh_size; both positions are updated on return. The add* helpers
// store bytes only when step == 1, so for other step values this function only
// advances the two positions (i.e. it measures the encoded size).
//
//   n              node whose children are encoded; must have at least one child
//   depth          forwarded (adjusted) to recursive calls; not otherwise used here
//   numHits        incremented by the hit count of a single-child LEAF
//   max_ptr        raised to the largest child pointer (offset relative to the
//                  node's code byte) computed in this subtree
//   next_ptr_width number of bytes reserved/written per child pointer
//
void ert_traverse_kmertree(node_t *n, uint8_t *mlt_data, uint8_t *mh_data, uint64_t *size, uint64_t *mh_size, int depth, uint64_t *numHits, uint64_t *max_ptr, uint64_t next_ptr_width, int step)
{
int j = 0;
int cur_depth = depth;
uint64_t byte_idx = *size;
uint64_t mh_byte_idx = *mh_size; // index at which the hit positions start being recorded
// One code byte per node: 2 bits per base, at bit offset 2*base, holding the child's type
uint8_t code = 0;
assert(n->numChildren != 0);
// Single child: either a leaf holding the hits or a uniform run of bases
if (n->numChildren == 1) {
node_t *child = n->child_nodes[0];
uint8_t c = child->seq[child->pos];
if (child->type == LEAF) {
//
// FIXME: In rare cases, when one of the occurrences of the k-mer is at the end of the reference,
// # hits for parent node is not equal to the sum of #hits of children nodes, and we trigger the assertion below
// This should not affect results as long as readLength > kmerSize
// assert(child->numHits > 1);
code |= (LEAF << (c << 1));
addCode(mlt_data, &byte_idx, code, step);
// The tree stores a pointer; the count and the hits themselves go to the multi-hit area
addMultiHitLeafPtr(mlt_data, &byte_idx, mh_byte_idx, step);
addMultiHitLeafCount(mh_data, &mh_byte_idx, child->numHits, step);
addMultiHitLeafNode(mh_data, &mh_byte_idx, child->numHits, child->hits, step);
*numHits += child->numHits;
}
else {
assert(child->type == UNIFORM);
code |= (UNIFORM << (c << 1));
addCode(mlt_data, &byte_idx, code, step);
addUniformNode(mlt_data, &byte_idx, child->num_bp, &child->seq[child->pos], child->numHits, step);
// Continue below the uniform run; depth grows by the number of bases consumed
ert_traverse_kmertree(child, mlt_data, mh_data, &byte_idx, &mh_byte_idx, cur_depth + child->num_bp,
numHits, max_ptr, next_ptr_width, step);
}
}
else {
// Several children. Layout written below: code byte, numPointers pointer slots,
// leaf entries, then the encodings of the DIVERGE children.
uint8_t numEmpty = 0, numLeaves = 0;
for (j = 0; j < n->numChildren; ++j) {
node_t *child = n->child_nodes[j];
uint8_t c = child->seq[child->pos];
if (child->type == EMPTY) { numEmpty++; }
else if (child->type == LEAF) { numLeaves++; code |= (LEAF << (c << 1)); }
else { code |= (DIVERGE << (c << 1)); }
}
// NOTE(review): the count is derived from 4, i.e. this presumably assumes such nodes always carry 4 child slots — confirm
uint8_t numPointers = ((4 - numEmpty - numLeaves) > 0) ? (4 - numEmpty - numLeaves) : 0;
uint64_t start_byte_idx = byte_idx;
addCode(mlt_data, &byte_idx, code, step);
uint64_t ptr_byte_idx = byte_idx;
uint64_t ptrToOtherNodes[numPointers + 1]; //!< These point to children. We have one more child than number of pointers
memset(ptrToOtherNodes, 0, (numPointers + 1) * sizeof(uint64_t));
uint64_t numHitsForChildren[numPointers + 1];
memset(numHitsForChildren, 0, (numPointers + 1) * sizeof(uint64_t));
uint64_t other_idx = 0;
// Reserve the pointer slots now; they are filled in at the end once child sizes are known
if (numPointers > 0) { byte_idx += (numPointers * next_ptr_width); }
// Leaves come first: a single hit is stored inline, several hits go through the multi-hit area
for (j = 0; j < n->numChildren; ++j) {
node_t *child = n->child_nodes[j];
if (child->type == LEAF) {
if (child->numHits == 1) {
addLeafNode(mlt_data, &byte_idx, child->hits[0], step);
}
else {
addMultiHitLeafPtr(mlt_data, &byte_idx, mh_byte_idx, step);
addMultiHitLeafCount(mh_data, &mh_byte_idx, child->numHits, step);
addMultiHitLeafNode(mh_data, &mh_byte_idx, child->numHits, child->hits, step);
}
}
}
// The first DIVERGE child starts right after the leaf entries
if (numPointers > 0) { ptrToOtherNodes[other_idx] = byte_idx; }
for (j = 0; j < n->numChildren; ++j) {
node_t *child = n->child_nodes[j];
assert(child->type != UNIFORM); // why?
if (child->type == DIVERGE) {
ert_traverse_kmertree(child, mlt_data, mh_data, &byte_idx, &mh_byte_idx,
cur_depth+1, numHits, max_ptr, next_ptr_width, step);
numHitsForChildren[other_idx] = child->numHits;
other_idx++;
// The next DIVERGE child (if any) starts where this one ended
ptrToOtherNodes[other_idx] = byte_idx;
}
}
// Track the largest relative pointer; the caller reads *max_ptr after the traversal
for (j = 0; j < numPointers; ++j) {
uint64_t pointerToNextNode = (ptrToOtherNodes[j] - start_byte_idx);
if (pointerToNextNode > *max_ptr) {
*max_ptr = pointerToNextNode;
}
assert(pointerToNextNode < (1 << 26));
}
// Fill up pointers based on size of previous children
if (step == 1) {
for (j = 0; j < numPointers; ++j) {
uint64_t pointerToNextNode = (ptrToOtherNodes[j] - start_byte_idx);
assert(pointerToNextNode < (1 << 26));
// Low 6 bits: child's hit count when below 20, otherwise 0; the pointer sits above them
uint64_t reseed_data = 0;
if (numHitsForChildren[j] < 20) {
reseed_data = (pointerToNextNode << 6) | (numHitsForChildren[j]);
} else {
reseed_data = (pointerToNextNode << 6);
}
memcpy_bwamem(&mlt_data[ptr_byte_idx], next_ptr_width * sizeof(uint8_t), &reseed_data, next_ptr_width * sizeof(uint8_t), __FILE__, __LINE__);
ptr_byte_idx += next_ptr_width;
}
}
}
*size = byte_idx;
*mh_size = mh_byte_idx;
}
//
// Release a k-mer tree: the node's hit array, every child subtree listed in
// child_nodes[0 .. numChildren-1], and finally the node itself.
// A NULL node is ignored.
//
void ert_destroy_kmertree(node_t* n)
{
    if (!n) {
        return;
    }
    free(n->hits); // free(NULL) is a no-op, so no guard is needed
    int slot = 0;
    while (slot < n->numChildren) {
        ert_destroy_kmertree(n->child_nodes[slot]);
        ++slot;
    }
    free(n);
}
//
// Write `count` bytes from `buf` to `fd`; terminate the process on a short
// write so that a failed write cannot silently leave a truncated index file.
//
static void ert_write_or_die(const uint8_t *buf, uint64_t count, FILE *fd)
{
    if (fwrite(buf, sizeof(uint8_t), count, fd) != count) {
        fprintf(stderr, "[M::%s] error: short write to per-thread index file\n", __func__);
        exit(1);
    }
}

//
// Thread entry point that builds the ERT index for the k-mers in
// [data->startKmer, data->endKmer). `arg` is a thread_data_t *.
//
// The function is run twice per k-mer range:
//   step 0 - only compute sizes: fills data->kmer_table with per-thread relative
//            offsets, the chosen pointer width and the k-mer metadata;
//   step 1 - additionally encode every tree/table and write it to the per-thread
//            file "<filePrefix>.ert.mlt_table_<tid>", using data->byte_offsets
//            (the merged table from step 0) for buffer sizes and pointer widths.
// data->end_offset receives the total number of bytes produced by this thread.
//
// Note on pointers to child nodes: When building the radix tree for each k-mer,
// we try 3 values for pointers to child nodes, 2,3,4 B and choose the smallest
// one possible.
//
void *buildIndex(void *arg)
{
    thread_data_t *data = (thread_data_t *)arg;
    bwtintv_t ik, ok[4];
    uint64_t idx = 0;
    uint8_t aq[kmerSize];
    int i;
    uint8_t c;
    uint64_t lep, prevHits, numBytesPerKmer, numBytesForMh, ref_pos, total_hits = 0, ptr = 0, max_next_ptr = 0;
    uint64_t next_ptr_width = 0;
    uint64_t nKmerSmallPtr = 0, nKmerMedPtr = 0, nKmerLargePtr = 0;
    uint16_t kmer_data = 0;
    int name_len;
    // File to write the multi-level tree index
    char ml_tbl_file_name[PATH_MAX];
    name_len = snprintf(ml_tbl_file_name, sizeof(ml_tbl_file_name), "%s.ert.mlt_table_%d", data->filePrefix, data->tid);
    if (name_len < 0 || (size_t)name_len >= sizeof(ml_tbl_file_name)) {
        fprintf(stderr, "[M::%s] error: index file name is too long\n", __func__);
        pthread_exit(NULL);
    }
    // Log progress
    char log_file_name[PATH_MAX];
    name_len = snprintf(log_file_name, sizeof(log_file_name), "%s.log_%d", data->filePrefix, data->tid);
    if (name_len < 0 || (size_t)name_len >= sizeof(log_file_name)) {
        fprintf(stderr, "[M::%s] error: log file name is too long\n", __func__);
        pthread_exit(NULL);
    }
    FILE *ml_tbl_fd = 0, *log_fd = 0;
    ml_tbl_fd = fopen(ml_tbl_file_name, "wb");
    if (ml_tbl_fd == NULL)
    {
        printf("\nCan't open file or file doesn't exist. mlt_table errno = %d\n", errno);
        pthread_exit(NULL);
    }
    if (bwa_verbose >= 4)
    {
        log_fd = fopen(log_file_name, "w");
        if (log_fd == NULL)
        {
            printf("\nCan't open file or file doesn't exist. log errno = %d\n", errno);
            fclose(ml_tbl_fd); // do not leak the already opened table file
            pthread_exit(NULL);
        }
        log_file(log_fd, "Start: %lu End: %lu", data->startKmer, data->endKmer);
    }
    //
    // Loop for each k-mer and compute LEP when the hit set changes
    //
    double tmp_time = realtime(), elapsed_time;
    int p = 0;
    // Report progress roughly once per percent; a stride of 0 (fewer than 100
    // k-mers in the range) disables reporting instead of dividing by zero.
    const uint64_t progress_stride = (data->endKmer - data->startKmer) / 100;
    for (idx = data->startKmer; idx < data->endKmer; ++idx)
    {
        max_next_ptr = 0;
        next_ptr_width = 0;
        c = 0;
        lep = 0; // k-1-bit LEP
        prevHits = 0;
        numBytesPerKmer = 0;
        numBytesForMh = 0;
        kmertoquery(idx, aq, kmerSize); // represent k-mer as uint8_t*
        assert(aq[0] >= 0 && aq[0] <= 3);
        bwt_set_intv(data->bid->bwt, aq[0], ik); // the initial interval of a single base
        ik.info = 1;
        prevHits = ik.x[2];
        // Print progress information (thread 0 only)
        if (data->tid == 0 && progress_stride != 0 && (idx - data->startKmer) % progress_stride == 0) {
            if (p++ > 0) {
                elapsed_time = realtime() - tmp_time;
                fprintf(stderr, "[step: %d] %d%% percent complished. %f s elapsed.\n", data->step, p - 1, elapsed_time);
            }
        }
        //
        // Backward search k-mer
        //
        for (i = 1; i < kmerSize; ++i)
        {
            c = 3 - aq[i];
            bwt_extend(data->bid->bwt, &ik, ok, 0); // ok contains the result of BWT extension
            if (ok[c].x[2] != prevHits) { // hit set changes
                lep |= (1ULL << (i - 1));
            }
            //
            // Extend left till k-mer has zero hits
            //
            if (ok[c].x[2] >= 1) { prevHits = ok[c].x[2]; ik = ok[c]; ik.info = i + 1; }
            else { break; }
        }
        uint64_t num_hits = ok[c].x[2];
        if (num_hits == 0) { // "Empty" - k-mer absent in the reference genome
            kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | INVALID;
        } else if (num_hits == 1) { // "Leaf" - k-mer has a single hit in the reference genome
            kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (SINGLE_HIT_LEAF);
            numBytesPerKmer = 6; // all hit information under this k-mer fits in 6 bytes because there is only one hit
            uint8_t byte_idx = 0;
            uint8_t mlt_data[numBytesPerKmer];
            if (data->step == 1) { mlt_data[byte_idx] = 0; } // Mark that the hit is not a multi-hit
            byte_idx++;
            data->numHits[idx - data->startKmer] += num_hits;
            if (data->step == 1) {
                // Look up suffix array to identify the hit position
                ref_pos = fmt_sa(data->bid->fmt, ok[c].x[0]);
                uint64_t leaf_data = ref_pos << 1;
                memcpy_bwamem(&mlt_data[byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
                ert_write_or_die(mlt_data, numBytesPerKmer, ml_tbl_fd);
            }
            byte_idx += 5;
        }
        //
        // If the number of hits for the k-mer does not exceed the HIT_THRESHOLD,
        // prefer a radix-tree over a multi-level table as the radix tree for the
        // k-mer is likely to be sparse.
        //
        else if (num_hits <= HIT_THRESHOLD) {
            kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (INFREQUENT);
            node_t *n = (node_t *)calloc(1, sizeof(node_t));
            assert(n != NULL);
            n->type = DIVERGE;
            n->pos = 0;
            n->num_bp = 0;
            memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), aq, kmerSize * sizeof(uint8_t), __FILE__, __LINE__);
            n->l_seq = kmerSize;
            n->parent_node = 0;
            n->numChildren = 0;
            n->numHits = num_hits;
            n->child_nodes[0] = n->child_nodes[1] = n->child_nodes[2] = n->child_nodes[3] = 0;
            n->start_addr = 0;
            uint8_t *mlt_data = 0;
            next_ptr_width = 2;
            uint8_t *mh_data = 0;
            uint64_t size = 0;
            if (data->step == 1) {
                if (idx != (numKmers - 1)) { // not the last k-mer of the whole table
                    size = (data->byte_offsets[idx+1] >> KMER_DATA_BITWIDTH) - (data->byte_offsets[idx] >> KMER_DATA_BITWIDTH); // bytes occupied by this k-mer
                    assert(size < (1 << 26)); // less than 64 MB
                } else { // FIXME: This is a hack. We know the size of every k-mer tree except the last-kmer
                    size = 1 << 26;
                }
                // Pointer width chosen in step 0 is stored in bits 22-23; 0 encodes 4 bytes
                next_ptr_width = (((data->byte_offsets[idx] >> 22) & 3) == 0) ? 4 : ((data->byte_offsets[idx] >> 22) & 3);
                mlt_data = (uint8_t *)calloc(size, sizeof(uint8_t));
                assert(mlt_data != NULL);
                mh_data = (uint8_t *)calloc(size, sizeof(uint8_t));
                assert(mh_data != NULL);
            }
            ert_build_kmertree(data->bid, ik, ok, i, n, data->step, data->readLength - 1); // n acts as the parent node
            // Reserve space for pointer to start of multi-hit address space
            numBytesPerKmer = 4; // 4-byte pointer
            // Traverse tree and place data in memory space
            ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh,
                                  i, &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
            if (data->step == 0 || data->step == 1) {
                // Pointers did not fit: redo the traversal with a wider pointer. Every
                // accumulator is reset first, including the hit counter, which the
                // traversal adds to and which would otherwise be counted repeatedly.
                if (max_next_ptr >= 1024 && max_next_ptr < 262144) {
                    next_ptr_width = 3;
                    max_next_ptr = 0;
                    numBytesPerKmer = 4;
                    numBytesForMh = 0;
                    data->numHits[idx - data->startKmer] = 0;
                    ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh, i,
                                          &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
                }
                if (max_next_ptr >= 262144) {
                    next_ptr_width = 4;
                    max_next_ptr = 0;
                    numBytesPerKmer = 4;
                    numBytesForMh = 0;
                    data->numHits[idx - data->startKmer] = 0;
                    ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh, i,
                                          &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
                }
            }
            ert_destroy_kmertree(n);
            assert(numBytesPerKmer < (1 << 26));
            // assert(numBytesForMh < (1 << 24));
            if (data->step == 1) {
                if (idx != numKmers-1) assert((numBytesPerKmer+numBytesForMh) == size);
                // First 4 bytes of the tree hold its size, i.e. where the multi-hit area starts
                memcpy_bwamem(mlt_data, 4*sizeof(uint8_t), &numBytesPerKmer, 4*sizeof(uint8_t), __FILE__, __LINE__);
                ert_write_or_die(mlt_data, numBytesPerKmer, ml_tbl_fd);
                free(mlt_data);
                ert_write_or_die(mh_data, numBytesForMh, ml_tbl_fd);
                free(mh_data);
            }
        }
        //
        // If the number of hits for the k-mer exceeds the HIT_THRESHOLD,
        // prefer a multi-level table to encode the suffixes for the
        // k-mer
        //
        else {
            kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (FREQUENT);
            uint8_t *mlt_data = 0;
            uint8_t *mh_data = 0;
            next_ptr_width = 2;
            uint64_t size = 0;
            if (data->step == 1) {
                if (idx != (numKmers - 1)) {
                    size = (data->byte_offsets[idx+1] >> KMER_DATA_BITWIDTH) - (data->byte_offsets[idx] >> KMER_DATA_BITWIDTH);
                    assert(size < (1 << 26));
                }
                else { //!< FIXME: Hack. We do not store the size of the last-kmer
                    size = 1 << 26;
                }
                next_ptr_width = (((data->byte_offsets[idx] >> 22) & 3) == 0) ? 4 : ((data->byte_offsets[idx] >> 22) & 3);
                mlt_data = (uint8_t *)calloc(size, sizeof(uint8_t));
                assert(mlt_data != NULL);
                mh_data = (uint8_t *)calloc(size, sizeof(uint8_t));
                assert(mh_data != NULL);
            }
            numBytesPerKmer = 4;
            ert_build_table(data->bid, ik, ok, mlt_data, mh_data, &numBytesPerKmer,
                            &numBytesForMh, aq, &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step,
                            data->readLength - 1);
            if (data->step == 0 || data->step == 1) {
                // Same widening scheme as for the radix tree above
                if (max_next_ptr >= 1024 && max_next_ptr < 262144) {
                    next_ptr_width = 3;
                    max_next_ptr = 0;
                    numBytesPerKmer = 4;
                    numBytesForMh = 0;
                    data->numHits[idx - data->startKmer] = 0;
                    ert_build_table(data->bid, ik, ok, mlt_data, mh_data,
                                    &numBytesPerKmer, &numBytesForMh, aq, &data->numHits[idx-data->startKmer],
                                    &max_next_ptr, next_ptr_width, data->step, data->readLength - 1);
                }
                if (max_next_ptr >= 262144) {
                    next_ptr_width = 4;
                    max_next_ptr = 0;
                    numBytesPerKmer = 4;
                    numBytesForMh = 0;
                    data->numHits[idx - data->startKmer] = 0;
                    ert_build_table(data->bid, ik, ok, mlt_data, mh_data,
                                    &numBytesPerKmer, &numBytesForMh, aq, &data->numHits[idx-data->startKmer],
                                    &max_next_ptr, next_ptr_width, data->step, data->readLength - 1);
                }
            }
            //
            // Traverse tree and place data in memory
            //
            assert(numBytesPerKmer < (1 << 26));
            // assert(numBytesForMh < (1 << 24));
            if (data->step == 1) {
                if (idx != numKmers-1) assert((numBytesPerKmer+numBytesForMh) == size);
                memcpy_bwamem(mlt_data, 4*sizeof(uint8_t), &numBytesPerKmer, 4*sizeof(uint8_t), __FILE__, __LINE__);
                ert_write_or_die(mlt_data, numBytesPerKmer, ml_tbl_fd);
                free(mlt_data);
                ert_write_or_die(mh_data, numBytesForMh, ml_tbl_fd);
                free(mh_data);
            }
        }
        // k-mer table entry: offset above KMER_DATA_BITWIDTH, hit count (only when < 20)
        // at bit 17, pointer width at bit 22, LEP and k-mer class in the low bits
        if (num_hits < 20) {
            data->kmer_table[idx - data->startKmer] = (ptr << KMER_DATA_BITWIDTH) | (num_hits << 17) | kmer_data;
        }
        else {
            data->kmer_table[idx - data->startKmer] = (ptr << KMER_DATA_BITWIDTH) | kmer_data;
        }
        ptr += (numBytesPerKmer + numBytesForMh);
        if (next_ptr_width == 2) {
            nKmerSmallPtr++;
        }
        else if (next_ptr_width == 3) {
            nKmerMedPtr++;
        }
        else if (next_ptr_width == 4) {
            nKmerLargePtr++;
            next_ptr_width = 0; // a width of 4 does not fit in 2 bits and is stored as 0
        }
        data->kmer_table[idx - data->startKmer] |= (next_ptr_width << 22);
        if (bwa_verbose >= 4) {
            if (idx == data->endKmer-1) {
                log_file(log_fd, "TotalSize:%lu\n", ptr);
            }
            if ((idx-data->startKmer) % 10000000 == 0) {
                log_file(log_fd, "%lu,%lu,%lu", idx, numBytesPerKmer, ptr);
            }
        }
        total_hits += data->numHits[idx - data->startKmer];
    }
    data->end_offset = ptr;
    if (bwa_verbose >= 4) {
        log_file(log_fd, "Hits:%lu\n", total_hits);
        log_file(log_fd, "nKmersSmallPtrs:%lu", nKmerSmallPtr);
        log_file(log_fd, "nKmersMedPtrs:%lu", nKmerMedPtr);
        log_file(log_fd, "nKmersLargePtrs:%lu", nKmerLargePtr);
        fclose(log_fd);
    }
    if (fclose(ml_tbl_fd) != 0) {
        fprintf(stderr, "[M::%s] error: failed to flush per-thread index file\n", __func__);
        exit(1);
    }
    pthread_exit(NULL);
}
void buildERTKmerTrees(char *kmer_tbl_file_name, bwaidx_t *bid, char *prefix, int num_threads, int readLength)
{
FILE *kmer_tbl_fd;
pthread_t thr[num_threads];
int i, rc;
thread_data_t thr_data[num_threads];
uint64_t numKmersThread = (uint64_t)ceil(((double)(numKmers)) / num_threads);
if (bwa_verbose >= 3)
{
fprintf(stderr, "[M::%s] Computing tree sizes for each k-mer\n", __func__);
}
//
// STEP 1: Create threads. Each thread builds the index for a fraction of the k-mers
//
for (i = 0; i < num_threads; ++i)
{
thr_data[i].tid = i;
thr_data[i].step = 0;
thr_data[i].readLength = readLength;
thr_data[i].bid = bid;
thr_data[i].startKmer = i * numKmersThread;
thr_data[i].endKmer = ((i + 1) * numKmersThread > numKmers) ? numKmers : (i + 1) * numKmersThread;
thr_data[i].end_offset = 0;
thr_data[i].filePrefix = prefix;
uint64_t numKmersToProcess = thr_data[i].endKmer - thr_data[i].startKmer;
thr_data[i].kmer_table = (uint64_t *)calloc(numKmersToProcess, sizeof(uint64_t));
assert(thr_data[i].kmer_table != NULL);
thr_data[i].numHits = (uint64_t *)calloc(numKmersToProcess, sizeof(uint64_t));
assert(thr_data[i].numHits != NULL);
if ((rc = pthread_create(&thr[i], NULL, buildIndex, &thr_data[i])))
{
fprintf(stderr, "[M::%s] error: pthread_create, rc: %d\n", __func__, rc);
return;
}
}
//
// block until all threads complete
//
for (i = 0; i < num_threads; ++i)
{
pthread_join(thr[i], NULL);
}
//
// Compute absolute offsets for each kmer's tree from per-thread relative offsets
//
uint64_t *kmer_table = (uint64_t *)calloc(numKmers, sizeof(uint64_t));
assert(kmer_table != NULL);
int tidx;
uint64_t kidx;
uint64_t numProcessed = 0;
uint64_t offset = 0;
for (tidx = 0; tidx < num_threads; ++tidx)
{
uint64_t numKmersToProcess = thr_data[tidx].endKmer - thr_data[tidx].startKmer;
for (kidx = 0; kidx < numKmersToProcess; ++kidx)
{
uint64_t rel_offset = thr_data[tidx].kmer_table[kidx] >> KMER_DATA_BITWIDTH;
uint16_t kmer_data = thr_data[tidx].kmer_table[kidx] & KMER_DATA_MASK;
uint64_t ptr_width = (thr_data[tidx].kmer_table[kidx] >> 22) & 3;
uint64_t reseed_hits = (thr_data[tidx].kmer_table[kidx] >> 17) & 0x1F;
kmer_table[numProcessed + kidx] = ((offset + rel_offset) << KMER_DATA_BITWIDTH)
| (ptr_width << 22)
| (reseed_hits << 17)
| (kmer_data);
}
numProcessed += numKmersToProcess;
offset += thr_data[tidx].end_offset;
free(thr_data[tidx].kmer_table);
free(thr_data[tidx].numHits);
}
//
// STEP 2 : Using estimates of each k-mer's tree size from the previous step, write the index to file
//
uint64_t total_size = offset + (numKmers * 8UL);
if (bwa_verbose >= 3) {
fprintf(stderr, "[M::%s] Total size of ERT index = %lu B (Expected). (k-mer,tree) = (%lu,%lu)\n", __func__, total_size, numKmers * 8UL, offset);
}
// return;
for (i = 0; i < num_threads; ++i) {
thr_data[i].tid = i;
thr_data[i].step = 1;
thr_data[i].readLength = readLength;
thr_data[i].bid = bid;
thr_data[i].startKmer = i*numKmersThread;
thr_data[i].endKmer = ((i + 1)*numKmersThread > numKmers) ? numKmers : (i + 1)*numKmersThread;
thr_data[i].end_offset = 0;
thr_data[i].filePrefix = prefix;
uint64_t numKmersToProcess = thr_data[i].endKmer - thr_data[i].startKmer;
thr_data[i].kmer_table = (uint64_t*) calloc(numKmersToProcess, sizeof(uint64_t));
thr_data[i].numHits = (uint64_t*) calloc(numKmersToProcess, sizeof(uint64_t));
thr_data[i].byte_offsets = kmer_table;
if ((rc = pthread_create(&thr[i], NULL, buildIndex, &thr_data[i]))) {
fprintf(stderr, "[M::%s] error: pthread_create, rc: %d\n", __func__, rc);
return;
}
}
for (i = 0; i < num_threads; ++i) {
pthread_join(thr[i], NULL);
}
if (bwa_verbose >= 3) {
fprintf(stderr, "[M::%s] Merging per-thread tables ...\n", __func__);
}
//
// Compute absolute offsets for each k-mer tree's root node
//
numProcessed = 0;
offset = 0;
for (tidx = 0; tidx < num_threads; ++tidx) {
uint64_t numKmersToProcess = thr_data[tidx].endKmer - thr_data[tidx].startKmer;
for (kidx = 0; kidx < numKmersToProcess; ++kidx) {
uint64_t rel_offset = thr_data[tidx].kmer_table[kidx] >> KMER_DATA_BITWIDTH;
uint16_t kmer_data = thr_data[tidx].kmer_table[kidx] & KMER_DATA_MASK;
uint64_t ptr_width = (thr_data[tidx].kmer_table[kidx] >> 22) & 3;
uint64_t reseed_hits = (thr_data[tidx].kmer_table[kidx] >> 17) & 0x1F;
kmer_table[numProcessed + kidx] = ((offset + rel_offset) << KMER_DATA_BITWIDTH)
| (ptr_width << 22)
| (reseed_hits << 17)
| (kmer_data);
}
numProcessed += numKmersToProcess;
offset += thr_data[tidx].end_offset;
free(thr_data[tidx].kmer_table);
free(thr_data[tidx].numHits);
}
kmer_tbl_fd = fopen(kmer_tbl_file_name, "wb");
if (kmer_tbl_fd == NULL) {
fprintf(stderr, "[M::%s] Can't open file or file doesn't exist.\n", __func__);
exit(1);
}
fwrite(kmer_table, sizeof(uint64_t), numKmers, kmer_tbl_fd);
fclose(kmer_tbl_fd);
free(kmer_table);
//
// Merge all per-thread trees
//
const int file_buf_size = 64 * 1024 * 1024;
uint8_t *file_buf = (uint8_t *)malloc(file_buf_size); // 64MB
char ml_tbl_file_name[PATH_MAX] = {};
sprintf(ml_tbl_file_name, "%s.ert.mlt.table", prefix);
if (remove(ml_tbl_file_name) == 0) {
fprintf(stderr, "[M::%s] Overwriting existing index file (tree)\n", __func__);
}
FILE *o_mlt = fopen(ml_tbl_file_name, "wb");
if (o_mlt == NULL)
{
fprintf(stderr, "[M::%s] Can't open output index file for writing.\n", __func__);
exit(1);
}
for (uint64_t tidx = 0; tidx < num_threads; ++tidx) {
sprintf(ml_tbl_file_name, "%s.ert.mlt_table_%ld", prefix, tidx);
FILE *i_mlt = fopen(ml_tbl_file_name, "rb");
if (i_mlt == NULL) {
fprintf(stderr, "[M::%s] Can't open per-thread index file for thread %ld\n", __func__, tidx);
exit(1);
}
int fr = 0;
while((fr = fread(file_buf, 1, file_buf_size, i_mlt)) != 0) {
fwrite(file_buf, 1, fr, o_mlt);
}
if (remove(ml_tbl_file_name) != 0) {
fprintf(stderr, "[M::%s] Can't remove per-thread index file (tree) for thread %ld\n", __func__, tidx);
exit(1);
}
}
free(file_buf);
}

106
ertindex.h 100644
View File

@ -0,0 +1,106 @@
#ifndef BWA_ERT_H
#define BWA_ERT_H
#include <stdint.h>
#include "kvec.h"
#include "bwa.h"
// Capacity (in bases) of node_t.seq; uniform runs are asserted to be shorter than this
#define ERT_MAX_READ_LEN 301
// Length of the k-mers that index the top-level table; numKmers == 4^kmerSize
#define kmerSize 15
#define numKmers 1073741824
// Length of the extra prefix appended to a k-mer when a multi-level table is built; numXmers == 4^xmerSize
#define xmerSize 4
#define numXmers 256
#define TILE_SIZE 64
// #define PRINT_SMEM
#define PREFIX_LENGTH 3
// 14-bit mask (kmerSize - 1 bits) applied to the LEP bit-vector before it is stored
#define LEP_MASK 0x3FFF
// Layout of a 64-bit k-mer table entry as assembled by buildIndex:
//   bits 0-1   k-mer class (INVALID / SINGLE_HIT_LEAF / INFREQUENT / FREQUENT)
//   bits 2-15  LEP bits
//   bits 17-21 hit count, stored only when it is below 20
//   bits 22-23 child pointer width in bytes (0 stands for 4)
//   bits 24+   byte offset of the k-mer's tree
#define KMER_DATA_BITWIDTH 24
#define KMER_DATA_MASK 0xFFFF
#define METADATA_BITWIDTH 2
#define METADATA_MASK 0x3
// k-mer classes stored in the low METADATA_BITWIDTH bits
#define INVALID 0 // k-mer absent from the reference
#define SINGLE_HIT_LEAF 1 // exactly one hit
#define INFREQUENT 2 // 2..HIT_THRESHOLD hits: encoded as a radix tree
#define FREQUENT 3 // more than HIT_THRESHOLD hits: encoded as a multi-level table
#define HIT_THRESHOLD 256
#define DRAM_PAGE_SIZE 24576
#define LEAF_TBL_BASE_PTR_WIDTH 3
#define LEAF_TBL_HIT_COUNT_WIDTH 3
#define MAX_HITS_PER_READ 2000000
// Type of a tree node. The numeric value is what ert_traverse_kmertree stores
// in the 2-bit per-base slot of a node's code byte, so the order must not change.
typedef enum
{
EMPTY, // no child for this base
LEAF, // child carries hit positions
UNIFORM, // child is a non-branching run of bases
DIVERGE // child branches further
} note_type_t;
// In-memory node of the per-k-mer tree that is built, serialized by
// ert_traverse_kmertree and released by ert_destroy_kmertree.
typedef struct _node_t
{
note_type_t type; // kind of node (see note_type_t)
int pos; // index into seq of the base that selects this node
int num_bp; // number of bases in a UNIFORM run
int l_seq; // number of bases currently stored in seq
int numChildren; // number of valid entries in child_nodes
uint64_t numHits; // number of hits below this node
uint64_t start_addr; // NOTE(review): set by the builders; its use is not visible here — confirm
uint64_t *hits; // hit positions of a LEAF (numHits entries); freed together with the node
uint8_t seq[ERT_MAX_READ_LEN + 1]; // bases from the k-mer start down to this node, one base per byte
struct _node_t *parent_node; // parent, 0 for the root
struct _node_t *child_nodes[4]; // children; freed recursively with the node
} node_t;
typedef kvec_t(node_t) node_v;
typedef node_t *node_ptr_t;
// Per-thread argument block passed to buildIndex.
typedef struct
{
int tid; // thread index; also the suffix of the per-thread file names
int step; // 0: compute sizes only, 1: also encode and write the index
int readLength; // read length; readLength - 1 is passed on as the maximum tree depth
uint64_t *kmer_table; // output: one entry per k-mer of this thread, offsets relative to the thread
uint64_t startKmer; // first k-mer handled by this thread (inclusive)
uint64_t endKmer; // end of this thread's k-mer range (exclusive)
bwaidx_t *bid; // index used for the backward search and suffix-array lookups
uint64_t *numHits; // per-k-mer hit counters, summed up for logging
char *filePrefix; // prefix of the per-thread output and log files
uint64_t *byte_offsets; // merged k-mer table from step 0; read only when step == 1
uint64_t end_offset; // output: total number of tree bytes produced by this thread
} thread_data_t;
// FIXME : Add to options later
extern const uint8_t char_count_size_in_bits;
extern const uint8_t hits_count_size_in_bits;
extern const uint8_t ref_ptr_size_in_bits;
extern const uint8_t leaf_offset_ptr_size_in_bits;
extern const uint8_t other_offset_ptr_size_in_bits;
// Kinds of fields that make up an encoded tree. The value is passed to
// addBytesForEntry to obtain the number of bytes a field occupies
// (addUniformNode does so with UNIFORM_BP).
// NOTE(review): the remaining constants are not used in the code visible here — confirm their meaning in addBytesForEntry.
typedef enum
{
CODE,
EMPTY_NODE,
LEAF_COUNT,
LEAF_HITS,
UNIFORM_COUNT,
UNIFORM_BP,
LEAF_PTR,
OTHER_PTR
} byte_type_t;
void ert_build_kmertree(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], int curDepth, node_t *parent_node, int step, int max_depth);
void handleDivergence(const bwaidx_t *bid, bwtintv_t ok[4], int depth, node_t *parent_node, int step, int max_depth);
void handleLeaf(const bwaidx_t *bid, bwtintv_t ik, node_t *n, int step);
void ert_build_table(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], uint8_t *mlt_data, uint8_t *mh_data, uint64_t *size, uint64_t *mh_size, uint8_t *aq, uint64_t *numHits, uint64_t *max_next_ptr, uint64_t next_ptr_width, int step, int max_depth);
void ert_traverse_kmertree(node_t *n, uint8_t *mlt_data, uint8_t *mh_data, uint64_t *byte_idx, uint64_t *mh_byte_idx, int depth, uint64_t *numHits, uint64_t *max_ptr, uint64_t next_ptr_width, int step);
void ert_destroy_kmertree(node_t *n);
void buildERTKmerTrees(char *kmer_tbl_file_name, bwaidx_t *bid, char *prefix, int num_threads, int readLength);
#endif

3511
ertseeding.c 100644

File diff suppressed because it is too large Load Diff

126
ertseeding.h 100644
View File

@ -0,0 +1,126 @@
#ifndef ERTSEEDING_HPP
#define ERTSEEDING_HPP
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <limits.h>
#include <math.h>
#include "kstring.h"
#include "ksw.h"
#include "kvec.h"
#include "ksort.h"
#include "utils.h"
#include "bwt.h"
#include "ertindex.h"
#include "bntseq.h"
#include "bwa.h"
#include "profiling.h"
#include "utils.h"
/**
 * Node state to keep track of while traversing ERT.
 */
typedef struct
{
int num_hits; // NOTE(review): presumably the hit count recorded for the node — confirm in ertseeding.c
uint64_t byte_idx; // NOTE(review): presumably the node's byte position in the ERT — confirm in ertseeding.c
} node_info_t;
typedef kvec_t(uint64_t) u64v;
typedef kvec_t(uint8_t) u8v;
typedef kvec_t(int) intv;
typedef kvec_t(node_info_t) path_v;
/**
 * State to keep track of previous pivots for each new MEM search.
 * The three fields form a history: current, previous, and the one before that.
 */
typedef struct
{
int c_pivot; // Pivot used to generate the SMEM
int p_pivot; // Previous pivot
int pp_pivot; // Pivot before the previous pivot. Useful in reseeding
} pivot_t;
/**
 * State for each maximal-exact-match (MEM).
 * Read coordinates are half-open intervals: [start, end).
 */
typedef struct
{
uint8_t forward; // RMEM or LMEM. We need this to normalize hit positions
int start; // MEM start position in read
int end; // MEM end position in read. [start, end)
int rc_start; // MEM start position in reverse complemented (RC) read (used for backward search)
int rc_end; // MEM end position in reverse complemented (RC) read (used for backward search)
int skip_ref_fetch; // Skip reference fetch when leaf node need not be decompressed
int fetch_leaves; // Gather all leaves for MEM
int hitbeg; // Index into hit array
int hitcount; // Count of hits
int end_correction; // Amount by which MEM has extended beyond backward search start position in read
int is_multi_hit; // NOTE(review): presumably non-zero when the MEM ends in a multi-hit leaf — confirm in ertseeding.c
pivot_t pt; // Pivot history associated with this MEM (see pivot_t)
} mem_t;
typedef kvec_t(mem_t) mem_v;
/**
 * Index-related auxiliary data structures
 */
typedef struct
{
uint64_t *kmer_offsets; // K-mer table
uint8_t *mlt_table; // Multi-level ERT
const bwt_t *bwt; // FM-index
// NOTE(review): the comment below says "reads", but bntseq_t normally describes the reference sequences — confirm
const bntseq_t *bns; // Input reads sequences
const uint8_t *pac; // Reference genome (2-bit encoded)
uint8_t *ref_string; // NOTE(review): presumably the unpacked reference — confirm in ertseeding.c
} index_aux_t;
/**
 * 'Read' auxiliary data structures: per-read state used while seeding.
 */
typedef struct
{
int min_seed_len; // Minimum length of seed
int l_seq; // Read length
int ptr_width; // Size of pointers to child nodes in ERT
int num_hits; // Number of hits for each node in the ERT
int limit; // Number of hits after which extension must be stopped
uint64_t lep[5]; // FIXME: Can support up to 320bp (5 words x 64 bits)
uint64_t nextLEPBit; // Index into the LEP bit-vector
uint64_t mlt_start_addr; // Start address of multi-level ERT
uint64_t mh_start_addr; // Start address of multi-hits for each k-mer
char *read_name; // Read name
uint8_t *unpacked_queue_buf; // Read sequence (2-bit encoded)
uint8_t *unpacked_rc_queue_buf; // Reverse complemented read (2-bit encoded)
uint8_t *read_buf; // == queue_buf (forward) and == rc_queue_buf (backward)
} read_aux_t;
/**
 * SMEM helper data structure: bookkeeping carried from one search iteration to the next.
 */
typedef struct
{
int prevMemStart; // Start position of previous MEM in the read
int prevMemEnd; // End position of previous MEM in the read
int curr_pivot; // Pivot used for forward/backward search in iteration i
int prev_pivot; // Pivot used in the previous iteration (i-1)
int prev_prev_pivot; // Pivot used in iteration i-2 (useful for reseeding)
int stop_be; // Stop backward search early if no new SMEMs can be found for pivot
int mem_end_limit; // NOTE(review): presumably an upper bound on the MEM end position — confirm in ertseeding.c
} smem_helper_t;
void get_seeds(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, u64v *hits);
void get_seeds_prefix(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, u64v *hits);
void reseed(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, int start, int limit, pivot_t *pt, u64v *hits);
void reseed_prefix(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, int start, int limit, pivot_t *pt, u64v *hits);
void last(index_aux_t *iaux, read_aux_t *raux, mem_v *smems, int limit, u64v *hits);
#endif

View File

@ -78,6 +78,7 @@ typedef struct {
long read_idx;
long calc_idx;
long write_idx;
int useERT;
} ktp_aux_t;
// read
@ -370,6 +371,7 @@ int main_mem(int argc, char *argv[])
void *ko = 0, *ko2 = 0;
mem_pestat_t pes[4];
ktp_aux_t aux;
int useERT = 0;
memset(&aux, 0, sizeof(ktp_aux_t));
memset(pes, 0, 4 * sizeof(mem_pestat_t));
@ -377,7 +379,7 @@ int main_mem(int argc, char *argv[])
aux.opt = opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
while ((c = getopt(argc, argv, "512qpaMCSPVYjuk:c:v:s:r:t:b:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
while ((c = getopt(argc, argv, "512qpaMCSPVYjuk:c:v:s:r:t:b:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:Z")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == '1') no_mt_io = 1;
else if (c == '2') no_mt_io = 2;
@ -478,6 +480,9 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
}
else if (c == 'Z') {
useERT = 1;
}
else return 1;
}
@ -548,6 +553,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n");
fprintf(stderr, " FR orientation only. [inferred]\n");
fprintf(stderr, " -u output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n");
fprintf(stderr, " -Z Use ERT index for seeding\n");
fprintf(stderr, "\n");
fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
fprintf(stderr, "\n");
@ -589,9 +595,16 @@ int main_mem(int argc, char *argv[])
PROF_END(gprof[G_PREPARE], prepare);
PROF_START(idx);
aux.idx = bwa_idx_load_from_shm(argv[optind]);
if (useERT) aux.idx = bwa_ertidx_load_from_shm(argv[optind]);
else aux.idx = bwa_fmtidx_load_from_shm(argv[optind]);
if (aux.idx == 0) {
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
if (!useERT) {
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_BNS | BWA_IDX_PAC | BWA_IDX_FMT)) == 0) return 1; // FIXME: memory leak
}
else {
if ((aux.idx = bwa_ertidx_load_from_disk(argv[optind])) == 0) return 1; // FIXME: memory leak
}
} else if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
if (ignore_alt)
@ -623,7 +636,7 @@ int main_mem(int argc, char *argv[])
}
//fprintf(stderr, "%ld %ld %ld %ld %ld\n", aux.idx->fmt->L2[0], aux.idx->fmt->L2[1], aux.idx->fmt->L2[2], aux.idx->fmt->L2[3], aux.idx->fmt->L2[4]);
//exit(0);
aux.useERT = useERT;
aux.w = init_mem_worker(opt, aux.idx->fmt, aux.idx->bns, aux.idx->pac);
aux.data = calloc(2, sizeof(ktp_data_t));
@ -635,8 +648,10 @@ int main_mem(int argc, char *argv[])
aux.wbuf = malloc(aux.wbuf_size);
PROF_START(pipeline);
if (no_mt_io == 2) new_pipeline(&aux);
else kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
PROF_END(gprof[G_PIPELINE], pipeline);
// free(hdr_line);

View File

@ -8,7 +8,7 @@
#include "utils.h"
#include "ksw.h"
static const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
// static const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
typedef struct _kswq_t {
int qlen, slen;

3
main.c
View File

@ -40,6 +40,7 @@ int bwa_bwt2sa(int argc, char *argv[]);
int bwa_bwt2bytesa(int argc, char *argv[]);
int bwa_bwt2fmt(int argc, char *argv[]);
int bwa_build_kmer(int argc, char *argv[]);
int bwa_bwt2ert(int argc, char *argv[]);
int bwa_index(int argc, char *argv[]);
int bwt_bwtgen_main(int argc, char *argv[]);
@ -82,6 +83,7 @@ static int usage()
fprintf(stderr, " bwt2bytesa generate SA(using byte array) from BWT and Occ\n");
fprintf(stderr, " bwt2fmt generate FMT-Index from BWT\n");
fprintf(stderr, " buildkmer generate kmer index from FMT\n");
fprintf(stderr, " bwt2ert generate ert index from BWT\n");
fprintf(stderr, "\n");
fprintf(stderr,
"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
@ -110,6 +112,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2fmt") == 0) ret = bwa_bwt2fmt(argc-1, argv+1);
else if (strcmp(argv[1], "buildkmer") == 0) ret = bwa_build_kmer(argc - 1, argv + 1);
else if (strcmp(argv[1], "bwt2ert") == 0) ret = bwa_bwt2ert(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);

2
run.sh
View File

@ -44,7 +44,7 @@ out=~/data1/fmt-out.sam
#out=/dev/null
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
$reference \
-Z $reference \
$n_r1 \
$n_r2 \
-o $out -2

18
utils.c
View File

@ -327,3 +327,21 @@ long peakrss(void)
return r.ru_maxrss;
#endif
}
// Bounds-checked memcpy.
//
// Copies smax bytes from src to dest after checking that the destination
// capacity dmax is at least smax. If it is not, a diagnostic naming the call
// site (file_name, line_num) is printed to stderr and the process exits with
// EXIT_FAILURE. The copy is performed in chunks of at most RSIZE_MAX_MEM bytes.
// The regions must not overlap. Always returns 0.
int memcpy_bwamem(void *dest, size_t dmax, const void *src, size_t smax, char *file_name, int line_num)
{
#define RSIZE_MAX_MEM (256UL << 20) /* 256MB */
    if (dmax < smax)
    {
        // %zu is the conversion that matches size_t
        fprintf(stderr, "[%s: %d] src size is lager than dest size.(src: %zu; dest: %zu)\n", file_name, line_num, smax, dmax);
        exit(EXIT_FAILURE);
    }
    // size_t counters: no signed/unsigned comparison against smax
    size_t bytes_copied = 0;
    while (bytes_copied < smax)
    {
        size_t bytes_remaining = smax - bytes_copied;
        size_t bytes_to_copy = (bytes_remaining > RSIZE_MAX_MEM) ? RSIZE_MAX_MEM : bytes_remaining;
        memcpy((char *)dest + bytes_copied, (const char *)src + bytes_copied, bytes_to_copy);
        bytes_copied += bytes_to_copy;
    }
    return 0;
}

View File

@ -80,6 +80,10 @@ static inline unsigned long long __rdtsc(void)
#endif
#endif
// Print one formatted line (a newline is appended to the format M) to fd and
// flush it immediately. Wrapped in do { } while (0) so the macro expands to a
// single statement and is safe under an unbraced if/else.
#define log_file(fd, M, ...)                  \
    do {                                      \
        fprintf((fd), M "\n", ##__VA_ARGS__); \
        fflush(fd);                           \
    } while (0)
typedef struct {
uint64_t x, y;
} pair64_t;
@ -129,6 +133,8 @@ extern "C" {
void ks_introsort_64 (size_t n, uint64_t *a);
void ks_introsort_128(size_t n, pair64_t *a);
int memcpy_bwamem(void *dest, size_t dmax, const void *src, size_t smax, char *file_name, int line_num);
#ifdef __cplusplus
}
#endif