fast-bwa/ertindex.c

947 lines
38 KiB
C
Raw Normal View History

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdint.h>
#include <limits.h>
#include <pthread.h>
#include <errno.h>
#include <math.h>
#include "utils.h"
#include "ertindex.h"
#define _set_pac(pac, l, c) ((pac)[(l) >> 2] |= (c) << (((l) & 3) << 1))
#define _set_pac_orig(pac, l, c) ((pac)[(l) >> 2] |= (c) << ((~(l) & 3) << 1))
const uint8_t char_count_size_in_bits = 8;
const uint8_t hits_count_size_in_bits = 8;
const uint8_t ref_ptr_size_in_bits = 40;
const uint8_t leaf_offset_ptr_size_in_bits = 8;
const uint8_t other_offset_ptr_size_in_bits = 32;
// int test_max_intv = 0;
static inline void getNumBranchesForKmer(bwtintv_t ok[4], int* numBranches, uint8_t* uniform_bp) {
uint8_t i;
for (i = 0; i < 4; ++i) {
if (ok[i].x[2] > 0) { *numBranches += 1; *uniform_bp = i; }
}
}
// 从右向左的顺序将kmer转换成query
static inline void kmertoquery(uint64_t x, uint8_t *a, int l)
{
int i;
for (i = 0; i < l; ++i) {
a[i] = (uint8_t)((x >> (i << 1)) & 0x3);
}
}
static inline uint64_t addBytesForEntry(byte_type_t type, int count, int numHits)
{
uint64_t numBits = 0;
switch(type) {
case CODE:
numBits = 8;
break;
case LEAF_COUNT:
numBits = (hits_count_size_in_bits);
break;
case LEAF_HITS:
numBits = (ref_ptr_size_in_bits * numHits);
break;
case UNIFORM_COUNT: // "Uniform"
numBits = (char_count_size_in_bits);
break;
case UNIFORM_BP:
numBits = (count << 1);
break;
case LEAF_PTR: // "Leaf Offset Pointer"
numBits = leaf_offset_ptr_size_in_bits;
break;
case OTHER_PTR:
numBits = other_offset_ptr_size_in_bits;
break;
case EMPTY_NODE:
numBits = 0;
break;
default :
break;
}
return (numBits % 8) ? (numBits / 8 + 1) : numBits / 8;
}
void addChildNode(node_t* p, node_t* c) {
assert(p->numChildren <= 3);
p->child_nodes[p->numChildren++] = c;
c->parent_node = p;
}
void handleLeaf(const bwaidx_t *bid, bwtintv_t ik, node_t *n, int step)
{
n->type = LEAF;
n->numHits = ik.x[2];
n->hits = (uint64_t *)calloc(n->numHits, sizeof(uint64_t));
assert(n->hits != NULL);
if (step == 1) {
uint64_t ref_pos = 0;
int j = 0;
for (j = 0; j < n->numHits; ++j) {
ref_pos = fmt_sa(bid->fmt, ik.x[0] + j);
// ref_pos = bwt_sa(bid->bwt, ik.x[0] + j);
n->hits[j] = ref_pos;
}
}
}
void handleDivergence(const bwaidx_t *bid, bwtintv_t ok[4], int depth, node_t *parent_node, int step, int max_depth)
{
int i;
bwtintv_t ok_copy[4];
bwtintv_t ik_new;
memcpy_bwamem(ok_copy, 4 * sizeof(bwtintv_t), ok, 4 * sizeof(bwtintv_t), __FILE__, __LINE__);
for (i = 3; i >= 0; --i) {
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->numChildren = 0;
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
n->pos = 0;
n->num_bp = 0;
if (ok_copy[i].x[2] == 0) { //!< Empty node
n->type = EMPTY;
n->numHits = 0;
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, (depth - 1) * sizeof(uint8_t), __FILE__, __LINE__);
n->l_seq = depth;
addChildNode(parent_node, n);
} else if (ok_copy[i].x[2] > 1 && depth != max_depth) { // 没到最大长度,可以继续扩展
ik_new = ok_copy[i]; ik_new.info = depth + 1;
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
assert(depth >= 0);
n->seq[depth] = i;
n->pos = depth;
n->num_bp = 1;
n->l_seq = depth + 1;
n->numHits = ok_copy[i].x[2];
n->type = DIVERGE;
addChildNode(parent_node, n);
ert_build_kmertree(bid, ik_new, ok, depth + 1, n, step, max_depth);
} else { // leaf intv.x[2] == 1 or depth = max_depth
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
n->seq[depth] = i;
n->pos = depth;
n->num_bp = 1;
n->l_seq = depth + 1;
handleLeaf(bid, ok_copy[i], n, step);
addChildNode(parent_node, n);
}
}
}
void ert_build_kmertree(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], int curDepth, node_t *parent_node, int step, int max_depth)
{
uint8_t uniform_bp = 0;
int numBranches = 0, depth = curDepth;
bwt_extend(bid->bwt, &ik, ok, 0); //!< Extend right by 1bp
/// Check if we need to make a uniform entry
getNumBranchesForKmer(ok, &numBranches, &uniform_bp);
bwtintv_t ik_new;
/// If we find a uniform entry, extend further till we diverge or hit a leaf node
if (numBranches == 1) {
uint8_t uniformExtend = 1;
ik_new = ok[uniform_bp]; ik_new.info = depth+1;
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->numChildren = 0;
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
assert(depth >= 0);
n->seq[depth] = uniform_bp; // ?放在最后的位置上
n->numHits = ok[uniform_bp].x[2];
n->l_seq = depth + 1; // depth=16
n->pos = depth;
n->num_bp = 1;
addChildNode(parent_node, n);
if (depth < max_depth) {
bwtintv_t ok_init;
memcpy_bwamem(&ok_init, sizeof(bwtintv_t), &ok[uniform_bp], sizeof(bwtintv_t), __FILE__, __LINE__);
while (uniformExtend) { // 一个一个bp进行扩展
numBranches = 0; uniform_bp = 0;
depth += 1;
bwt_extend(bid->bwt, &ik_new, ok, 0); //!< Extend right by 1bp
getNumBranchesForKmer(ok, &numBranches, &uniform_bp);
assert(numBranches != 0);
if (numBranches == 1) { //<! Uniform
ik_new = ok[uniform_bp]; ik_new.info = depth+1;
n->seq[depth] = uniform_bp;
n->l_seq = depth + 1;
n->num_bp++;
/// Hit a multi-hit leaf node
if (depth == max_depth) {
// if (test_max_intv < ik_new.x[2]) test_max_intv = ik_new.x[2];
// fprintf(stderr, "depth: %d, num hit: %ld, max: %d\n", depth, ik_new.x[2], test_max_intv);
uniformExtend = 0;
handleLeaf(bid, ok_init, n, step);
}
} else { //!< Diverge
uniformExtend = 0;
n->type = UNIFORM;
handleDivergence(bid, ok, depth, n, step, max_depth);
}
}
} else { //<! Uniform, depth == max_depth, multi-hit leaf node
uniformExtend = 0;
handleLeaf(bid, ok[uniform_bp], n, step);
}
} //!< End uniform entry
else { //!< Diverge, empty or leaf, same as above
handleDivergence(bid, ok, depth, parent_node, step, max_depth);
} //!< End diverge
}
void ert_build_table(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], uint8_t *mlt_data, uint8_t *mh_data,
uint64_t *size, uint64_t *mh_size, uint8_t *aq, uint64_t *numHits, uint64_t *max_next_ptr,
uint64_t next_ptr_width, int step, int max_depth)
{
uint64_t byte_idx = *size;
uint64_t mh_byte_idx = *mh_size;
int i, j;
uint8_t aq1[xmerSize];
assert(xmerSize <= 15);
uint64_t lep1 = 0;
uint8_t c;
uint64_t prevHits = ik.x[2];
bwtintv_t ik_copy = ik;
uint64_t mlt_byte_idx = byte_idx + (numXmers << 3);
uint64_t xmer_entry = 0;
uint16_t xmer_data = 0;
uint64_t mlt_offset = mlt_byte_idx;
for (i = 0; i < numXmers; ++i) {
kmertoquery(i, aq1, xmerSize);
for (j = 0; j < xmerSize; ++j) {
c = 3 - aq1[j];
bwt_extend(bid->bwt, &ik, ok, 0); //!< ok contains the result of BWT extension
if (ok[c].x[2] != prevHits) { //!< hit set changes
lep1 |= (1ULL << j);
}
/// Extend right till k-mer has zero hits
if (ok[c].x[2] >= 1) { prevHits = ok[c].x[2]; ik = ok[c]; ik.info = kmerSize + j + 1; }
else { break; }
}
uint64_t num_hits = ok[c].x[2];
if (num_hits == 0) {
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | INVALID;
}
else if (num_hits == 1) {
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | (SINGLE_HIT_LEAF);
if (step == 1) {
mlt_data[mlt_byte_idx] = 0; //!< Not a multi-hit
}
mlt_byte_idx++;
if (step == 1) {
uint64_t ref_pos = 0;
ref_pos = fmt_sa(bid->fmt, ok[c].x[0]);
// ref_pos = bwt_sa(bid->bwt, ok[c].x[0]);
uint64_t leaf_data = ref_pos << 1;
memcpy_bwamem(&mlt_data[mlt_byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
mlt_byte_idx += 5;
*numHits += 1;
}
else {
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | (INFREQUENT);
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->type = DIVERGE;
n->pos = 0;
n->num_bp = 0;
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), aq, kmerSize, __FILE__, __LINE__);
n->l_seq = kmerSize;
memcpy_bwamem(&n->seq[n->l_seq], xmerSize * sizeof(uint8_t), aq1, xmerSize * sizeof(uint8_t), __FILE__, __LINE__);
n->l_seq += xmerSize;
n->parent_node = 0;
n->numChildren = 0;
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
n->start_addr = mlt_byte_idx;
ert_build_kmertree(bid, ik, ok, kmerSize + j, n, step, max_depth);
ert_traverse_kmertree(n, mlt_data, mh_data, &mlt_byte_idx, &mh_byte_idx, kmerSize + j, numHits,
max_next_ptr, next_ptr_width, step);
ert_destroy_kmertree(n);
}
if (num_hits < 20) {
xmer_entry = (mlt_offset << KMER_DATA_BITWIDTH) | (num_hits << 17) | xmer_data;
}
else {
xmer_entry = (mlt_offset << KMER_DATA_BITWIDTH) | xmer_data;
}
uint64_t ptr_width = (next_ptr_width < 4) ? next_ptr_width : 0;
xmer_entry |= (ptr_width << 22);
if (step == 1) {
memcpy_bwamem(&mlt_data[byte_idx], 8 * sizeof(uint8_t), &xmer_entry, 8 * sizeof(uint8_t), __FILE__, __LINE__);
}
byte_idx += 8;
mlt_offset = mlt_byte_idx;
ik = ik_copy;
prevHits = ik_copy.x[2];
}
*size = mlt_byte_idx;
*mh_size = mh_byte_idx;
}
void addCode(uint8_t *mlt_data, uint64_t *byte_idx, uint8_t code, int step)
{
if (step == 1) {
memcpy_bwamem(&mlt_data[*byte_idx], sizeof(uint8_t), &code, sizeof(uint8_t), __FILE__, __LINE__);
}
*byte_idx += 1;
}
void addUniformNode(uint8_t *mlt_data, uint64_t *byte_idx, int count, uint8_t *uniformBases, uint64_t hitCount, int step)
{
int numBytesForBP = addBytesForEntry(UNIFORM_BP, count, 0);
assert(count < ERT_MAX_READ_LEN);
if (step == 1) {
memcpy_bwamem(&mlt_data[*byte_idx], sizeof(uint16_t), &count, sizeof(uint16_t), __FILE__, __LINE__);
}
*byte_idx += 2;
if (step == 1) {
int j;
uint8_t packUniformBases[numBytesForBP];
memset(packUniformBases, 0, numBytesForBP * sizeof(uint8_t));
for (j = 0; j < count; ++j) {
_set_pac_orig(packUniformBases, j, uniformBases[j]);
}
memcpy_bwamem(&mlt_data[*byte_idx], numBytesForBP * sizeof(uint8_t), packUniformBases, numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
}
*byte_idx += numBytesForBP;
}
void addLeafNode(uint8_t *mlt_data, uint64_t *byte_idx, uint64_t ref_pos, int step) {
if (step == 1) {
uint64_t leaf_data = (ref_pos << 1);
memcpy_bwamem(&mlt_data[*byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
*byte_idx += 5;
}
void addMultiHitLeafNode(uint8_t* mlt_data, uint64_t* byte_idx, uint64_t count, uint64_t* hits, int step)
{
uint16_t k = 0;
for (k = 0; k < count; ++k) {
if (step == 1) {
uint64_t leaf_data = (hits[k] << 1) | 1ULL;
memcpy_bwamem(&mlt_data[*byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
*byte_idx += 5;
}
}
void addMultiHitLeafCount(uint8_t* mlt_data, uint64_t* byte_idx, uint64_t count, int step)
{
if (step == 1) {
memcpy_bwamem(&mlt_data[*byte_idx], 2 * sizeof(uint8_t), &count, 2 * sizeof(uint8_t), __FILE__, __LINE__);
}
*byte_idx += 2;
}
void addMultiHitLeafPtr(uint8_t *mlt_data, uint64_t *byte_idx, uint64_t mh_byte_idx, int step)
{
if (step == 1) {
uint64_t mh_data = (mh_byte_idx << 1) | 1ULL;
memcpy_bwamem(&mlt_data[*byte_idx], 5 * sizeof(uint8_t), &mh_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
*byte_idx += 5;
}
void ert_traverse_kmertree(node_t *n, uint8_t *mlt_data, uint8_t *mh_data, uint64_t *size, uint64_t *mh_size, int depth, uint64_t *numHits, uint64_t *max_ptr, uint64_t next_ptr_width, int step)
{
int j = 0;
int cur_depth = depth;
uint64_t byte_idx = *size;
uint64_t mh_byte_idx = *mh_size; // 匹配位置开始记录的索引位置
uint8_t code = 0;
assert(n->numChildren != 0);
if (n->numChildren == 1) {
node_t *child = n->child_nodes[0];
uint8_t c = child->seq[child->pos];
if (child->type == LEAF) {
//
// FIXME: In rare cases, when one of the occurrences of the k-mer is at the end of the reference,
// # hits for parent node is not equal to the sum of #hits of children nodes, and we trigger the assertion below
// This should not affect results as long as readLength > kmerSize
// assert(child->numHits > 1);
code |= (LEAF << (c << 1));
addCode(mlt_data, &byte_idx, code, step);
addMultiHitLeafPtr(mlt_data, &byte_idx, mh_byte_idx, step);
addMultiHitLeafCount(mh_data, &mh_byte_idx, child->numHits, step);
addMultiHitLeafNode(mh_data, &mh_byte_idx, child->numHits, child->hits, step);
*numHits += child->numHits;
}
else {
assert(child->type == UNIFORM);
code |= (UNIFORM << (c << 1));
addCode(mlt_data, &byte_idx, code, step);
addUniformNode(mlt_data, &byte_idx, child->num_bp, &child->seq[child->pos], child->numHits, step);
ert_traverse_kmertree(child, mlt_data, mh_data, &byte_idx, &mh_byte_idx, cur_depth + child->num_bp,
numHits, max_ptr, next_ptr_width, step);
}
}
else {
uint8_t numEmpty = 0, numLeaves = 0;
for (j = 0; j < n->numChildren; ++j) {
node_t *child = n->child_nodes[j];
uint8_t c = child->seq[child->pos];
if (child->type == EMPTY) { numEmpty++; }
else if (child->type == LEAF) { numLeaves++; code |= (LEAF << (c << 1)); }
else { code |= (DIVERGE << (c << 1)); }
}
uint8_t numPointers = ((4 - numEmpty - numLeaves) > 0) ? (4 - numEmpty - numLeaves) : 0;
uint64_t start_byte_idx = byte_idx;
addCode(mlt_data, &byte_idx, code, step);
uint64_t ptr_byte_idx = byte_idx;
uint64_t ptrToOtherNodes[numPointers + 1]; //!< These point to children. We have one more child than number of pointers
memset(ptrToOtherNodes, 0, (numPointers + 1) * sizeof(uint64_t));
uint64_t numHitsForChildren[numPointers + 1];
memset(numHitsForChildren, 0, (numPointers + 1) * sizeof(uint64_t));
uint64_t other_idx = 0;
if (numPointers > 0) { byte_idx += (numPointers * next_ptr_width); }
for (j = 0; j < n->numChildren; ++j) {
node_t *child = n->child_nodes[j];
if (child->type == LEAF) {
if (child->numHits == 1) {
addLeafNode(mlt_data, &byte_idx, child->hits[0], step);
}
else {
addMultiHitLeafPtr(mlt_data, &byte_idx, mh_byte_idx, step);
addMultiHitLeafCount(mh_data, &mh_byte_idx, child->numHits, step);
addMultiHitLeafNode(mh_data, &mh_byte_idx, child->numHits, child->hits, step);
}
}
}
if (numPointers > 0) { ptrToOtherNodes[other_idx] = byte_idx; }
for (j = 0; j < n->numChildren; ++j) {
node_t *child = n->child_nodes[j];
assert(child->type != UNIFORM); // why?
if (child->type == DIVERGE) {
ert_traverse_kmertree(child, mlt_data, mh_data, &byte_idx, &mh_byte_idx,
cur_depth+1, numHits, max_ptr, next_ptr_width, step);
numHitsForChildren[other_idx] = child->numHits;
other_idx++;
ptrToOtherNodes[other_idx] = byte_idx;
}
}
for (j = 0; j < numPointers; ++j) {
uint64_t pointerToNextNode = (ptrToOtherNodes[j] - start_byte_idx);
if (pointerToNextNode > *max_ptr) {
*max_ptr = pointerToNextNode;
}
assert(pointerToNextNode < (1 << 26));
}
// Fill up pointers based on size of previous children
if (step == 1) {
for (j = 0; j < numPointers; ++j) {
uint64_t pointerToNextNode = (ptrToOtherNodes[j] - start_byte_idx);
assert(pointerToNextNode < (1 << 26));
uint64_t reseed_data = 0;
if (numHitsForChildren[j] < 20) {
reseed_data = (pointerToNextNode << 6) | (numHitsForChildren[j]);
} else {
reseed_data = (pointerToNextNode << 6);
}
memcpy_bwamem(&mlt_data[ptr_byte_idx], next_ptr_width * sizeof(uint8_t), &reseed_data, next_ptr_width * sizeof(uint8_t), __FILE__, __LINE__);
ptr_byte_idx += next_ptr_width;
}
}
}
*size = byte_idx;
*mh_size = mh_byte_idx;
}
void ert_destroy_kmertree(node_t* n)
{
int j;
if (n == NULL) {
return;
}
if (n->hits) {
free(n->hits);
}
for (j = 0; j < n->numChildren; ++j) {
ert_destroy_kmertree(n->child_nodes[j]);
}
free(n);
}
//
// This function builds the ERT index.
// Note on pointers to child nodes: When building the radix tree for each k-mer,
// we try 3 values for pointers to child nodes, 2,3,4 B and choose the smallest
// one possible.
//
void *buildIndex(void *arg)
{
thread_data_t *data = (thread_data_t *)arg;
bwtintv_t ik, ok[4];
uint64_t idx = 0;
uint8_t aq[kmerSize];
int i;
uint8_t c;
uint64_t lep, prevHits, numBytesPerKmer, numBytesForMh, ref_pos, total_hits = 0, ptr = 0, max_next_ptr = 0;
uint64_t next_ptr_width = 0;
uint64_t nKmerSmallPtr = 0, nKmerMedPtr = 0, nKmerLargePtr = 0;
uint16_t kmer_data = 0;
// File to write the multi-level tree index
char ml_tbl_file_name[PATH_MAX];
sprintf(ml_tbl_file_name, "%s.ert.mlt_table_%d", data->filePrefix, data->tid);
// Log progress
char log_file_name[PATH_MAX];
sprintf(log_file_name, "%s.log_%d", data->filePrefix, data->tid);
FILE *ml_tbl_fd = 0, *log_fd = 0;
ml_tbl_fd = fopen(ml_tbl_file_name, "wb");
if (ml_tbl_fd == NULL)
{
printf("\nCan't open file or file doesn't exist. mlt_table errno = %d\n", errno);
pthread_exit(NULL);
}
if (bwa_verbose >= 4)
{
log_fd = fopen(log_file_name, "w");
if (log_fd == NULL)
{
printf("\nCan't open file or file doesn't exist. log errno = %d\n", errno);
pthread_exit(NULL);
}
log_file(log_fd, "Start: %lu End: %lu", data->startKmer, data->endKmer);
}
//
// Loop for each k-mer and compute LEP when the hit set changes
//
double tmp_time = realtime(), elapsed_time;
int p = 0;
for (idx = data->startKmer; idx < data->endKmer; ++idx)
{
max_next_ptr = 0;
next_ptr_width = 0;
c = 0;
lep = 0; // k-1-bit LEP
prevHits = 0;
numBytesPerKmer = 0;
numBytesForMh = 0;
kmertoquery(idx, aq, kmerSize); // represent k-mer as uint8_t*
assert(aq[0] >= 0 && aq[0] <= 3);
bwt_set_intv(data->bid->bwt, aq[0], ik); // the initial interval of a single base
ik.info = 1;
prevHits = ik.x[2];
// 打印进度信息
if (data->tid == 0 && (idx - data->startKmer) % ((data->endKmer - data->startKmer) / 100) == 0) {
if (p++ > 0) {
elapsed_time = realtime() - tmp_time;
fprintf(stderr, "[step: %d] %d%% percent complished. %f s elapsed.\n", data->step, p - 1, elapsed_time);
}
}
//
// Backward search k-mer
//
for (i = 1; i < kmerSize; ++i)
{
c = 3 - aq[i];
bwt_extend(data->bid->bwt, &ik, ok, 0); // ok contains the result of BWT extension
if (ok[c].x[2] != prevHits) { // hit set changes
lep |= (1ULL << (i - 1));
}
//
// Extend left till k-mer has zero hits
//
if (ok[c].x[2] >= 1) { prevHits = ok[c].x[2]; ik = ok[c]; ik.info = i + 1; }
else { break; }
}
uint64_t num_hits = ok[c].x[2];
if (num_hits == 0) { // "Empty" - k-mer absent in the reference genome
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | INVALID;
} else if (num_hits == 1) { // "Leaf" - k-mer has a single hit in the reference genome
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (SINGLE_HIT_LEAF);
numBytesPerKmer = 6; // 该15-kmer下所包含的所有匹配位置信息只需要6字节就能表示因为只有一个匹配
uint8_t byte_idx = 0;
uint8_t mlt_data[numBytesPerKmer];
if (data->step == 1) { mlt_data[byte_idx] = 0; } // Mark that the hit is not a multi-hit
byte_idx++;
data->numHits[idx - data->startKmer] += num_hits;
if (data->step == 1) {
// Look up suffix array to identify the hit position
ref_pos = fmt_sa(data->bid->fmt, ok[c].x[0]);
// ref_pos = bwt_sa(data->bid->bwt, ok[c].x[0]);
uint64_t leaf_data = ref_pos << 1;
memcpy_bwamem(&mlt_data[byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
fwrite(mlt_data, sizeof(uint8_t), numBytesPerKmer, ml_tbl_fd);
}
byte_idx += 5;
}
//
// If the number of hits for the k-mer does not exceed the HIT_THRESHOLD,
// prefer a radix-tree over a multi-level table as the radix tree for the
// k-mer is likely to be sparse.
//
else if (num_hits <= HIT_THRESHOLD) {
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (INFREQUENT);
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->type = DIVERGE;
n->pos = 0;
n->num_bp = 0;
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), aq, kmerSize * sizeof(uint8_t), __FILE__, __LINE__);
n->l_seq = kmerSize;
n->parent_node = 0;
n->numChildren = 0;
n->numHits = num_hits;
n->child_nodes[0] = n->child_nodes[1] = n->child_nodes[2] = n->child_nodes[3] = 0;
n->start_addr = 0;
uint8_t *mlt_data = 0;
next_ptr_width = 2;
uint8_t *mh_data = 0;
uint64_t size = 0;
if (data->step == 1) {
if (idx != (numKmers - 1)) { // 不等于最后一个处理的kmer
size = (data->byte_offsets[idx+1] >> KMER_DATA_BITWIDTH) - (data->byte_offsets[idx] >> KMER_DATA_BITWIDTH); // 本kmer所占存储大小
assert(size < (1 << 26)); // 小于64MB
} else { // FIXME: This is a hack. We know the size of every k-mer tree except the last-kmer
size = 1 << 26;
}
next_ptr_width = (((data->byte_offsets[idx] >> 22) & 3) == 0) ? 4 : ((data->byte_offsets[idx] >> 22) & 3);
mlt_data = (uint8_t *)calloc(size, sizeof(uint8_t));
assert(mlt_data != NULL);
mh_data = (uint8_t *)calloc(size, sizeof(uint8_t));
assert(mh_data != NULL);
}
ert_build_kmertree(data->bid, ik, ok, i, n, data->step, data->readLength - 1); // n作为parent node
// Reserve space for pointer to start of multi-hit address space
numBytesPerKmer = 4; // 4个字节的指针
// Traverse tree and place data in memory space
ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh,
i, &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
if (data->step == 0 || data->step == 1) {
if (max_next_ptr >= 1024 && max_next_ptr < 262144) {
next_ptr_width = 3;
max_next_ptr = 0;
numBytesPerKmer = 4;
numBytesForMh = 0;
ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh, i,
&data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
}
if (max_next_ptr >= 262144) {
next_ptr_width = 4;
max_next_ptr = 0;
numBytesPerKmer = 4;
numBytesForMh = 0;
ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh, i,
&data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
}
}
ert_destroy_kmertree(n);
assert(numBytesPerKmer < (1 << 26));
// assert(numBytesForMh < (1 << 24));
if (data->step == 1) {
if (idx != numKmers-1) assert((numBytesPerKmer+numBytesForMh) == size);
memcpy_bwamem(mlt_data, 4*sizeof(uint8_t), &numBytesPerKmer, 4*sizeof(uint8_t), __FILE__, __LINE__);
fwrite(mlt_data, sizeof(uint8_t), numBytesPerKmer, ml_tbl_fd);
free(mlt_data);
fwrite(mh_data, sizeof(uint8_t), numBytesForMh, ml_tbl_fd);
free(mh_data);
}
}
//
// If the number of hits for the k-mer exceeds the HIT_THRESHOLD,
// prefer a multi-level table to encode the suffixes for the
// k-mer
//
else {
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (FREQUENT);
uint8_t *mlt_data = 0;
uint8_t *mh_data = 0;
next_ptr_width = 2;
uint64_t size = 0;
if (data->step == 1) {
if (idx != (numKmers - 1)) {
size = (data->byte_offsets[idx+1] >> KMER_DATA_BITWIDTH) - (data->byte_offsets[idx] >> KMER_DATA_BITWIDTH);
assert(size < (1 << 26));
}
else { //!< FIXME: Hack. We do not store the size of the last-kmer
size = 1 << 26;
}
next_ptr_width = (((data->byte_offsets[idx] >> 22) & 3) == 0) ? 4 : ((data->byte_offsets[idx] >> 22) & 3);
mlt_data = (uint8_t *)calloc(size, sizeof(uint8_t));
assert(mlt_data != NULL);
mh_data = (uint8_t *)calloc(size, sizeof(uint8_t));
assert(mh_data != NULL);
}
numBytesPerKmer = 4;
ert_build_table(data->bid, ik, ok, mlt_data, mh_data, &numBytesPerKmer,
&numBytesForMh, aq, &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step,
data->readLength - 1);
if (data->step == 0 || data->step == 1) {
if (max_next_ptr >= 1024 && max_next_ptr < 262144) {
next_ptr_width = 3;
max_next_ptr = 0;
numBytesPerKmer = 4;
numBytesForMh = 0;
ert_build_table(data->bid, ik, ok, mlt_data, mh_data,
&numBytesPerKmer, &numBytesForMh, aq, &data->numHits[idx-data->startKmer],
&max_next_ptr, next_ptr_width, data->step, data->readLength - 1);
}
if (max_next_ptr >= 262144) {
next_ptr_width = 4;
max_next_ptr = 0;
numBytesPerKmer = 4;
numBytesForMh = 0;
ert_build_table(data->bid, ik, ok, mlt_data, mh_data,
&numBytesPerKmer, &numBytesForMh, aq, &data->numHits[idx-data->startKmer],
&max_next_ptr, next_ptr_width, data->step, data->readLength - 1);
}
}
//
// Traverse tree and place data in memory
//
assert(numBytesPerKmer < (1 << 26));
// assert(numBytesForMh < (1 << 24));
if (data->step == 1) {
if (idx != numKmers-1) assert((numBytesPerKmer+numBytesForMh) == size);
memcpy_bwamem(mlt_data, 4*sizeof(uint8_t), &numBytesPerKmer, 4*sizeof(uint8_t), __FILE__, __LINE__);
fwrite(mlt_data, sizeof(uint8_t), numBytesPerKmer, ml_tbl_fd);
free(mlt_data);
fwrite(mh_data, sizeof(uint8_t), numBytesForMh, ml_tbl_fd);
free(mh_data);
}
}
if (num_hits < 20) {
data->kmer_table[idx - data->startKmer] = (ptr << KMER_DATA_BITWIDTH) | (num_hits << 17) | kmer_data;
}
else {
data->kmer_table[idx - data->startKmer] = (ptr << KMER_DATA_BITWIDTH) | kmer_data;
}
ptr += (numBytesPerKmer + numBytesForMh);
if (next_ptr_width == 2) {
nKmerSmallPtr++;
}
else if (next_ptr_width == 3) {
nKmerMedPtr++;
}
else if (next_ptr_width == 4) {
nKmerLargePtr++;
next_ptr_width = 0;
}
data->kmer_table[idx - data->startKmer] |= (next_ptr_width << 22);
if (bwa_verbose >= 4) {
if (idx == data->endKmer-1) {
log_file(log_fd, "TotalSize:%lu\n", ptr);
}
if ((idx-data->startKmer) % 10000000 == 0) {
log_file(log_fd, "%lu,%lu,%lu", idx, numBytesPerKmer, ptr);
}
}
total_hits += data->numHits[idx - data->startKmer];
}
data->end_offset = ptr;
if (bwa_verbose >= 4) {
log_file(log_fd, "Hits:%lu\n", total_hits);
log_file(log_fd, "nKmersSmallPtrs:%lu", nKmerSmallPtr);
log_file(log_fd, "nKmersMedPtrs:%lu", nKmerMedPtr);
log_file(log_fd, "nKmersLargePtrs:%lu", nKmerLargePtr);
fclose(log_fd);
}
fclose(ml_tbl_fd);
pthread_exit(NULL);
}
void buildERTKmerTrees(char *kmer_tbl_file_name, bwaidx_t *bid, char *prefix, int num_threads, int readLength)
{
FILE *kmer_tbl_fd;
pthread_t thr[num_threads];
int i, rc;
thread_data_t thr_data[num_threads];
uint64_t numKmersThread = (uint64_t)ceil(((double)(numKmers)) / num_threads);
if (bwa_verbose >= 3)
{
fprintf(stderr, "[M::%s] Computing tree sizes for each k-mer\n", __func__);
}
//
// STEP 1: Create threads. Each thread builds the index for a fraction of the k-mers
//
for (i = 0; i < num_threads; ++i)
{
thr_data[i].tid = i;
thr_data[i].step = 0;
thr_data[i].readLength = readLength;
thr_data[i].bid = bid;
thr_data[i].startKmer = i * numKmersThread;
thr_data[i].endKmer = ((i + 1) * numKmersThread > numKmers) ? numKmers : (i + 1) * numKmersThread;
thr_data[i].end_offset = 0;
thr_data[i].filePrefix = prefix;
uint64_t numKmersToProcess = thr_data[i].endKmer - thr_data[i].startKmer;
thr_data[i].kmer_table = (uint64_t *)calloc(numKmersToProcess, sizeof(uint64_t));
assert(thr_data[i].kmer_table != NULL);
thr_data[i].numHits = (uint64_t *)calloc(numKmersToProcess, sizeof(uint64_t));
assert(thr_data[i].numHits != NULL);
if ((rc = pthread_create(&thr[i], NULL, buildIndex, &thr_data[i])))
{
fprintf(stderr, "[M::%s] error: pthread_create, rc: %d\n", __func__, rc);
return;
}
}
//
// block until all threads complete
//
for (i = 0; i < num_threads; ++i)
{
pthread_join(thr[i], NULL);
}
//
// Compute absolute offsets for each kmer's tree from per-thread relative offsets
//
uint64_t *kmer_table = (uint64_t *)calloc(numKmers, sizeof(uint64_t));
assert(kmer_table != NULL);
int tidx;
uint64_t kidx;
uint64_t numProcessed = 0;
uint64_t offset = 0;
for (tidx = 0; tidx < num_threads; ++tidx)
{
uint64_t numKmersToProcess = thr_data[tidx].endKmer - thr_data[tidx].startKmer;
for (kidx = 0; kidx < numKmersToProcess; ++kidx)
{
uint64_t rel_offset = thr_data[tidx].kmer_table[kidx] >> KMER_DATA_BITWIDTH;
uint16_t kmer_data = thr_data[tidx].kmer_table[kidx] & KMER_DATA_MASK;
uint64_t ptr_width = (thr_data[tidx].kmer_table[kidx] >> 22) & 3;
uint64_t reseed_hits = (thr_data[tidx].kmer_table[kidx] >> 17) & 0x1F;
kmer_table[numProcessed + kidx] = ((offset + rel_offset) << KMER_DATA_BITWIDTH)
| (ptr_width << 22)
| (reseed_hits << 17)
| (kmer_data);
}
numProcessed += numKmersToProcess;
offset += thr_data[tidx].end_offset;
free(thr_data[tidx].kmer_table);
free(thr_data[tidx].numHits);
}
//
// STEP 2 : Using estimates of each k-mer's tree size from the previous step, write the index to file
//
uint64_t total_size = offset + (numKmers * 8UL);
if (bwa_verbose >= 3) {
fprintf(stderr, "[M::%s] Total size of ERT index = %lu B (Expected). (k-mer,tree) = (%lu,%lu)\n", __func__, total_size, numKmers * 8UL, offset);
}
// return;
for (i = 0; i < num_threads; ++i) {
thr_data[i].tid = i;
thr_data[i].step = 1;
thr_data[i].readLength = readLength;
thr_data[i].bid = bid;
thr_data[i].startKmer = i*numKmersThread;
thr_data[i].endKmer = ((i + 1)*numKmersThread > numKmers) ? numKmers : (i + 1)*numKmersThread;
thr_data[i].end_offset = 0;
thr_data[i].filePrefix = prefix;
uint64_t numKmersToProcess = thr_data[i].endKmer - thr_data[i].startKmer;
thr_data[i].kmer_table = (uint64_t*) calloc(numKmersToProcess, sizeof(uint64_t));
thr_data[i].numHits = (uint64_t*) calloc(numKmersToProcess, sizeof(uint64_t));
thr_data[i].byte_offsets = kmer_table;
if ((rc = pthread_create(&thr[i], NULL, buildIndex, &thr_data[i]))) {
fprintf(stderr, "[M::%s] error: pthread_create, rc: %d\n", __func__, rc);
return;
}
}
for (i = 0; i < num_threads; ++i) {
pthread_join(thr[i], NULL);
}
if (bwa_verbose >= 3) {
fprintf(stderr, "[M::%s] Merging per-thread tables ...\n", __func__);
}
//
// Compute absolute offsets for each k-mer tree's root node
//
numProcessed = 0;
offset = 0;
for (tidx = 0; tidx < num_threads; ++tidx) {
uint64_t numKmersToProcess = thr_data[tidx].endKmer - thr_data[tidx].startKmer;
for (kidx = 0; kidx < numKmersToProcess; ++kidx) {
uint64_t rel_offset = thr_data[tidx].kmer_table[kidx] >> KMER_DATA_BITWIDTH;
uint16_t kmer_data = thr_data[tidx].kmer_table[kidx] & KMER_DATA_MASK;
uint64_t ptr_width = (thr_data[tidx].kmer_table[kidx] >> 22) & 3;
uint64_t reseed_hits = (thr_data[tidx].kmer_table[kidx] >> 17) & 0x1F;
kmer_table[numProcessed + kidx] = ((offset + rel_offset) << KMER_DATA_BITWIDTH)
| (ptr_width << 22)
| (reseed_hits << 17)
| (kmer_data);
}
numProcessed += numKmersToProcess;
offset += thr_data[tidx].end_offset;
free(thr_data[tidx].kmer_table);
free(thr_data[tidx].numHits);
}
kmer_tbl_fd = fopen(kmer_tbl_file_name, "wb");
if (kmer_tbl_fd == NULL) {
fprintf(stderr, "[M::%s] Can't open file or file doesn't exist.\n", __func__);
exit(1);
}
fwrite(kmer_table, sizeof(uint64_t), numKmers, kmer_tbl_fd);
fclose(kmer_tbl_fd);
free(kmer_table);
//
// Merge all per-thread trees
//
const int file_buf_size = 64 * 1024 * 1024;
uint8_t *file_buf = (uint8_t *)malloc(file_buf_size); // 64MB
char ml_tbl_file_name[PATH_MAX] = {};
sprintf(ml_tbl_file_name, "%s.ert.mlt.table", prefix);
if (remove(ml_tbl_file_name) == 0) {
fprintf(stderr, "[M::%s] Overwriting existing index file (tree)\n", __func__);
}
FILE *o_mlt = fopen(ml_tbl_file_name, "wb");
if (o_mlt == NULL)
{
fprintf(stderr, "[M::%s] Can't open output index file for writing.\n", __func__);
exit(1);
}
for (uint64_t tidx = 0; tidx < num_threads; ++tidx) {
sprintf(ml_tbl_file_name, "%s.ert.mlt_table_%ld", prefix, tidx);
FILE *i_mlt = fopen(ml_tbl_file_name, "rb");
if (i_mlt == NULL) {
fprintf(stderr, "[M::%s] Can't open per-thread index file for thread %ld\n", __func__, tidx);
exit(1);
}
int fr = 0;
while((fr = fread(file_buf, 1, file_buf_size, i_mlt)) != 0) {
fwrite(file_buf, 1, fr, o_mlt);
}
if (remove(ml_tbl_file_name) != 0) {
fprintf(stderr, "[M::%s] Can't remove per-thread index file (tree) for thread %ld\n", __func__, tidx);
exit(1);
}
}
free(file_buf);
}