fast-bwa/ertindex.c

947 lines
38 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdint.h>
#include <limits.h>
#include <pthread.h>
#include <errno.h>
#include <math.h>
#include "utils.h"
#include "ertindex.h"
// Store the 2-bit value c for base index l in the packed byte array pac.
// The target byte is pac[l >> 2]; the bit offset inside it is 2 * (l & 3), so
// the first base of every byte lands in the two lowest bits.
// The value is OR-ed in, so the destination bits must be zero beforehand.
#define _set_pac(pac, l, c) ((pac)[(l) >> 2] |= (c) << (((l) & 3) << 1))
// Same as _set_pac, but the slot order inside a byte is reversed: the bit
// offset is 2 * (~l & 3), so the first base of every byte lands in the two
// highest bits. Used by addUniformNode to pack uniform bases.
#define _set_pac_orig(pac, l, c) ((pac)[(l) >> 2] |= (c) << ((~(l) & 3) << 1))
// Field widths in bits, consumed by addBytesForEntry to size index entries.
// Width used for a UNIFORM_COUNT entry.
const uint8_t char_count_size_in_bits = 8;
// Width used for a LEAF_COUNT entry.
const uint8_t hits_count_size_in_bits = 8;
// Width of one reference position; a LEAF_HITS entry takes this many bits per hit.
const uint8_t ref_ptr_size_in_bits = 40;
// Width used for a LEAF_PTR entry.
const uint8_t leaf_offset_ptr_size_in_bits = 8;
// Width used for an OTHER_PTR entry.
const uint8_t other_offset_ptr_size_in_bits = 32;
// int test_max_intv = 0;
/*
 * Count how many of the four one-base extensions in ok are non-empty.
 *
 * For every entry whose x[2] is greater than zero, *numBranches is
 * incremented and *uniform_bp is overwritten with that entry's index, so on
 * return *uniform_bp holds the highest non-empty index (it is left untouched
 * when no entry qualifies). *numBranches is accumulated, not reset; callers
 * zero it beforehand.
 */
static inline void getNumBranchesForKmer(bwtintv_t ok[4], int* numBranches, uint8_t* uniform_bp) {
    for (uint8_t base = 0; base < 4; base++) {
        if (ok[base].x[2] > 0) {
            *uniform_bp = base;
            *numBranches += 1;
        }
    }
}
/*
 * Unpack a 2-bit-per-base k-mer into a query array, working from the
 * right-most (least significant) base towards the left.
 *
 * x: packed k-mer, base i occupying bits [2i, 2i+1]
 * a: output array receiving l base codes (each 0..3)
 * l: number of bases to unpack
 */
static inline void kmertoquery(uint64_t x, uint8_t *a, int l)
{
    uint64_t remaining = x;
    for (int pos = 0; pos < l; pos++) {
        a[pos] = (uint8_t)(remaining & 0x3);
        remaining >>= 2; /* expose the next base in the low two bits */
    }
}
/*
 * Return the number of whole bytes needed to store one index entry.
 *
 * type:    kind of entry being sized
 * count:   number of bases (only used for UNIFORM_BP, 2 bits per base)
 * numHits: number of hits (only used for LEAF_HITS, one reference pointer each)
 *
 * The size is first computed in bits and then rounded up to a byte boundary.
 * EMPTY_NODE and any unrecognised type occupy zero bytes.
 */
static inline uint64_t addBytesForEntry(byte_type_t type, int count, int numHits)
{
    uint64_t bits = 0;

    if (type == CODE) {
        bits = 8;
    } else if (type == LEAF_COUNT) {
        bits = (hits_count_size_in_bits);
    } else if (type == LEAF_HITS) {
        bits = (ref_ptr_size_in_bits * numHits);
    } else if (type == UNIFORM_COUNT) {
        bits = (char_count_size_in_bits);
    } else if (type == UNIFORM_BP) {
        bits = (count << 1);
    } else if (type == LEAF_PTR) {
        bits = leaf_offset_ptr_size_in_bits;
    } else if (type == OTHER_PTR) {
        bits = other_offset_ptr_size_in_bits;
    }

    /* Round up: a partially used trailing byte still counts as one byte. */
    return bits / 8 + ((bits % 8) ? 1 : 0);
}
/*
 * Append c to p's child list and record p as c's parent.
 * A node holds at most four children; the assertion guards the slot index.
 */
void addChildNode(node_t* p, node_t* c) {
    assert(p->numChildren <= 3);
    c->parent_node = p;
    p->child_nodes[p->numChildren] = c;
    p->numChildren++;
}
/*
 * Turn n into a LEAF node for the interval ik.
 *
 * The hit count is taken from ik.x[2] and a zero-filled hits array of that
 * size is always allocated. Only when step == 1 are the reference positions
 * actually looked up (fmt_sa on bid->fmt, starting at ik.x[0]); in any other
 * step the array stays zeroed.
 */
void handleLeaf(const bwaidx_t *bid, bwtintv_t ik, node_t *n, int step)
{
    n->type = LEAF;
    n->numHits = ik.x[2];
    n->hits = (uint64_t *)calloc(n->numHits, sizeof(uint64_t));
    assert(n->hits != NULL);

    if (step != 1) {
        return;
    }

    for (int k = 0; k < n->numHits; k++) {
        n->hits[k] = fmt_sa(bid->fmt, ik.x[0] + k);
    }
}
// Expand a branching point of the k-mer tree: attach one child to parent_node
// for each base code, visited from 3 down to 0.
//   bid         index handle, passed through to handleLeaf / ert_build_kmertree
//   ok          the four intervals produced by extending the parent's interval
//   depth       position in seq at which the newly added base is stored
//   parent_node node that receives the four children
//   step        build pass, forwarded unchanged
//   max_depth   depth at which extension stops and remaining hits become a leaf
// Per base: no hits -> EMPTY child; more than one hit and depth != max_depth ->
// DIVERGE child that is expanded recursively; otherwise -> LEAF via handleLeaf.
void handleDivergence(const bwaidx_t *bid, bwtintv_t ok[4], int depth, node_t *parent_node, int step, int max_depth)
{
int i;
bwtintv_t ok_copy[4];
bwtintv_t ik_new;
// Snapshot ok first: the recursive ert_build_kmertree call below is handed ok
// as its scratch array and overwrites it with its own extension results.
memcpy_bwamem(ok_copy, 4 * sizeof(bwtintv_t), ok, 4 * sizeof(bwtintv_t), __FILE__, __LINE__);
for (i = 3; i >= 0; --i) {
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->numChildren = 0;
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
n->pos = 0;
n->num_bp = 0;
if (ok_copy[i].x[2] == 0) { //!< Empty node
n->type = EMPTY;
n->numHits = 0;
// NOTE(review): unlike the other branches this copies depth - 1 bytes, leaves
// pos at 0 and does not store base i in seq — presumably harmless because
// ert_traverse_kmertree only counts EMPTY children; confirm.
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, (depth - 1) * sizeof(uint8_t), __FILE__, __LINE__);
n->l_seq = depth;
addChildNode(parent_node, n);
} else if (ok_copy[i].x[2] > 1 && depth != max_depth) { // maximum depth not reached yet, so extension can continue
ik_new = ok_copy[i]; ik_new.info = depth + 1;
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
assert(depth >= 0);
n->seq[depth] = i;
n->pos = depth;
n->num_bp = 1;
n->l_seq = depth + 1;
n->numHits = ok_copy[i].x[2];
n->type = DIVERGE;
addChildNode(parent_node, n);
// Recurse one base deeper; ok is reused as scratch (hence ok_copy above).
ert_build_kmertree(bid, ik_new, ok, depth + 1, n, step, max_depth);
} else { // leaf intv.x[2] == 1 or depth = max_depth
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
n->seq[depth] = i;
n->pos = depth;
n->num_bp = 1;
n->l_seq = depth + 1;
// handleLeaf sets type, numHits and (in step 1) the hit positions.
handleLeaf(bid, ok_copy[i], n, step);
addChildNode(parent_node, n);
}
}
}
// Recursively build the tree below parent_node for the interval ik.
//   bid         index handle (bid->bwt is used for extension)
//   ik          interval of the sequence represented by parent_node
//   ok          caller-provided scratch array; overwritten by every extension
//   curDepth    position in seq at which the next base is stored
//   parent_node node under which new children are attached
//   step        build pass, forwarded to handleLeaf / handleDivergence
//   max_depth   depth at which extension stops
// ik is extended by one base. If exactly one base has hits, a single child is
// created and extended base by base until it either diverges (child becomes
// UNIFORM and handleDivergence adds its children) or reaches max_depth (child
// becomes a leaf). Otherwise handleDivergence adds the children directly to
// parent_node.
void ert_build_kmertree(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], int curDepth, node_t *parent_node, int step, int max_depth)
{
uint8_t uniform_bp = 0;
int numBranches = 0, depth = curDepth;
bwt_extend(bid->bwt, &ik, ok, 0); //!< Extend right by 1bp
/// Check if we need to make a uniform entry
getNumBranchesForKmer(ok, &numBranches, &uniform_bp);
bwtintv_t ik_new;
/// If we find a uniform entry, extend further till we diverge or hit a leaf node
if (numBranches == 1) {
uint8_t uniformExtend = 1;
ik_new = ok[uniform_bp]; ik_new.info = depth+1;
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->numChildren = 0;
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), parent_node->seq, parent_node->l_seq * sizeof(uint8_t), __FILE__, __LINE__);
assert(depth >= 0);
n->seq[depth] = uniform_bp; // ? placed at the last position
n->numHits = ok[uniform_bp].x[2];
n->l_seq = depth + 1; // depth=16
n->pos = depth;
n->num_bp = 1;
addChildNode(parent_node, n);
if (depth < max_depth) {
// Keep the interval of the first uniform base; ok is overwritten in the loop.
bwtintv_t ok_init;
memcpy_bwamem(&ok_init, sizeof(bwtintv_t), &ok[uniform_bp], sizeof(bwtintv_t), __FILE__, __LINE__);
while (uniformExtend) { // extend one bp at a time
numBranches = 0; uniform_bp = 0;
depth += 1;
bwt_extend(bid->bwt, &ik_new, ok, 0); //!< Extend right by 1bp
getNumBranchesForKmer(ok, &numBranches, &uniform_bp);
assert(numBranches != 0);
if (numBranches == 1) { //<! Uniform
ik_new = ok[uniform_bp]; ik_new.info = depth+1;
n->seq[depth] = uniform_bp;
n->l_seq = depth + 1;
n->num_bp++;
/// Hit a multi-hit leaf node
if (depth == max_depth) {
// if (test_max_intv < ik_new.x[2]) test_max_intv = ik_new.x[2];
// fprintf(stderr, "depth: %d, num hit: %ld, max: %d\n", depth, ik_new.x[2], test_max_intv);
uniformExtend = 0;
// The leaf is built from ok_init (interval captured before the loop), not ik_new.
handleLeaf(bid, ok_init, n, step);
}
} else { //!< Diverge
uniformExtend = 0;
n->type = UNIFORM;
handleDivergence(bid, ok, depth, n, step, max_depth);
}
}
} else { //<! Uniform, depth == max_depth, multi-hit leaf node
uniformExtend = 0;
handleLeaf(bid, ok[uniform_bp], n, step);
}
} //!< End uniform entry
else { //!< Diverge, empty or leaf, same as above
handleDivergence(bid, ok, depth, parent_node, step, max_depth);
} //!< End diverge
}
// Build the multi-level table for one k-mer: a table of numXmers 8-byte
// entries followed by the per-x-mer payloads (single-hit leaves or trees).
//   bid            index handle
//   ik             interval of the k-mer; restored from a copy after each x-mer
//   ok             scratch array for extension results
//   mlt_data       output buffer for table and tree bytes (written only when step == 1)
//   mh_data        output buffer for multi-hit leaf data, passed to ert_traverse_kmertree
//   size           in: offset of the table start; out: offset after the last payload
//   mh_size        in/out: current offset into mh_data
//   aq             the k-mer as base codes (kmerSize entries), used as the tree root's prefix
//   numHits        accumulator for the number of hits encountered
//   max_next_ptr   in/out: largest child pointer seen, forwarded to the traversal
//   next_ptr_width width in bytes of child pointers used by the traversal
//   step           build pass; sizes are always computed, bytes only written in step 1
//   max_depth      maximum tree depth, forwarded to ert_build_kmertree
void ert_build_table(const bwaidx_t *bid, bwtintv_t ik, bwtintv_t ok[4], uint8_t *mlt_data, uint8_t *mh_data,
uint64_t *size, uint64_t *mh_size, uint8_t *aq, uint64_t *numHits, uint64_t *max_next_ptr,
uint64_t next_ptr_width, int step, int max_depth)
{
uint64_t byte_idx = *size;
uint64_t mh_byte_idx = *mh_size;
int i, j;
uint8_t aq1[xmerSize];
assert(xmerSize <= 15);
// NOTE(review): lep1 is initialised once and never cleared inside the x-mer
// loop, so bits set for one x-mer stay set for all later ones — confirm
// whether that is intended before changing it (it affects the stored entries).
uint64_t lep1 = 0;
uint8_t c;
uint64_t prevHits = ik.x[2];
bwtintv_t ik_copy = ik;
// Payloads start right after the table of numXmers 8-byte entries.
uint64_t mlt_byte_idx = byte_idx + (numXmers << 3);
uint64_t xmer_entry = 0;
uint16_t xmer_data = 0;
uint64_t mlt_offset = mlt_byte_idx;
for (i = 0; i < numXmers; ++i) {
kmertoquery(i, aq1, xmerSize);
for (j = 0; j < xmerSize; ++j) {
c = 3 - aq1[j];
bwt_extend(bid->bwt, &ik, ok, 0); //!< ok contains the result of BWT extension
if (ok[c].x[2] != prevHits) { //!< hit set changes
lep1 |= (1ULL << j);
}
/// Extend right till k-mer has zero hits
if (ok[c].x[2] >= 1) { prevHits = ok[c].x[2]; ik = ok[c]; ik.info = kmerSize + j + 1; }
else { break; }
}
// ok[c] is the result of the last extension attempted (zero hits if the loop broke early).
uint64_t num_hits = ok[c].x[2];
if (num_hits == 0) {
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | INVALID;
}
else if (num_hits == 1) {
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | (SINGLE_HIT_LEAF);
// Single-hit payload: one marker byte followed by a 5-byte leaf.
if (step == 1) {
mlt_data[mlt_byte_idx] = 0; //!< Not a multi-hit
}
mlt_byte_idx++;
if (step == 1) {
uint64_t ref_pos = 0;
ref_pos = fmt_sa(bid->fmt, ok[c].x[0]);
// ref_pos = bwt_sa(bid->bwt, ok[c].x[0]);
// Position is shifted left by one; the low bit stays 0 for a single-hit leaf.
uint64_t leaf_data = ref_pos << 1;
memcpy_bwamem(&mlt_data[mlt_byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
mlt_byte_idx += 5;
*numHits += 1;
}
else {
xmer_data = ((lep1 & LEP_MASK) << METADATA_BITWIDTH) | (INFREQUENT);
// Multi-hit x-mer: build a temporary tree rooted at k-mer + x-mer, serialise it, free it.
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->type = DIVERGE;
n->pos = 0;
n->num_bp = 0;
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), aq, kmerSize, __FILE__, __LINE__);
n->l_seq = kmerSize;
memcpy_bwamem(&n->seq[n->l_seq], xmerSize * sizeof(uint8_t), aq1, xmerSize * sizeof(uint8_t), __FILE__, __LINE__);
n->l_seq += xmerSize;
n->parent_node = 0;
n->numChildren = 0;
memset(n->child_nodes, 0, 4 * sizeof(node_t *));
n->start_addr = mlt_byte_idx;
// This branch is only reached when the inner loop ran to completion, so j == xmerSize here.
ert_build_kmertree(bid, ik, ok, kmerSize + j, n, step, max_depth);
ert_traverse_kmertree(n, mlt_data, mh_data, &mlt_byte_idx, &mh_byte_idx, kmerSize + j, numHits,
max_next_ptr, next_ptr_width, step);
ert_destroy_kmertree(n);
}
// Table entry: payload offset in the high bits, hit count at bit 17 (only when < 20), metadata in the low bits.
if (num_hits < 20) {
xmer_entry = (mlt_offset << KMER_DATA_BITWIDTH) | (num_hits << 17) | xmer_data;
}
else {
xmer_entry = (mlt_offset << KMER_DATA_BITWIDTH) | xmer_data;
}
// Pointer width is stored at bit 22; widths of 4 or more are stored as 0.
uint64_t ptr_width = (next_ptr_width < 4) ? next_ptr_width : 0;
xmer_entry |= (ptr_width << 22);
if (step == 1) {
memcpy_bwamem(&mlt_data[byte_idx], 8 * sizeof(uint8_t), &xmer_entry, 8 * sizeof(uint8_t), __FILE__, __LINE__);
}
byte_idx += 8;
mlt_offset = mlt_byte_idx;
// Restart from the k-mer's own interval for the next x-mer.
ik = ik_copy;
prevHits = ik_copy.x[2];
}
*size = mlt_byte_idx;
*mh_size = mh_byte_idx;
}
/*
 * Emit a one-byte node code at *byte_idx and advance the cursor by one.
 * The byte is only written when step == 1; in any other step just the
 * cursor moves, which lets the caller measure sizes without a buffer.
 */
void addCode(uint8_t *mlt_data, uint64_t *byte_idx, uint8_t code, int step)
{
    const uint64_t at = *byte_idx;

    *byte_idx = at + 1;
    if (step != 1) {
        return;
    }
    memcpy_bwamem(&mlt_data[at], sizeof(uint8_t), &code, sizeof(uint8_t), __FILE__, __LINE__);
}
/*
 * Emit a uniform (non-branching) run of bases at *byte_idx.
 *
 * Layout: a 2-byte base count followed by the bases packed four per byte
 * with _set_pac_orig. The cursor always advances by 2 + packed length; the
 * bytes themselves are only written when step == 1.
 *
 * count:        number of bases in the run (must be < ERT_MAX_READ_LEN)
 * uniformBases: the base codes of the run
 * hitCount:     not used by this function
 */
void addUniformNode(uint8_t *mlt_data, uint64_t *byte_idx, int count, uint8_t *uniformBases, uint64_t hitCount, int step)
{
    int packedLen = addBytesForEntry(UNIFORM_BP, count, 0);
    assert(count < ERT_MAX_READ_LEN);

    const uint64_t countAt = *byte_idx;
    const uint64_t basesAt = countAt + 2;

    if (step == 1) {
        /* The first two bytes of count are stored as the run length. */
        memcpy_bwamem(&mlt_data[countAt], sizeof(uint16_t), &count, sizeof(uint16_t), __FILE__, __LINE__);

        uint8_t packed[packedLen];
        memset(packed, 0, packedLen * sizeof(uint8_t));
        for (int b = 0; b < count; b++) {
            _set_pac_orig(packed, b, uniformBases[b]);
        }
        memcpy_bwamem(&mlt_data[basesAt], packedLen * sizeof(uint8_t), packed, packedLen * sizeof(uint8_t), __FILE__, __LINE__);
    }

    *byte_idx = basesAt + packedLen;
}
/*
 * Emit a single-hit leaf (5 bytes) at *byte_idx and advance the cursor by 5.
 * The stored value is ref_pos shifted left by one, leaving the low bit clear.
 * Bytes are only written when step == 1.
 */
void addLeafNode(uint8_t *mlt_data, uint64_t *byte_idx, uint64_t ref_pos, int step) {
    const uint64_t at = *byte_idx;

    *byte_idx = at + 5;
    if (step != 1) {
        return;
    }
    uint64_t encoded = (ref_pos << 1);
    memcpy_bwamem(&mlt_data[at], 5 * sizeof(uint8_t), &encoded, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
/*
 * Emit the hit list of a multi-hit leaf: count records of 5 bytes each,
 * starting at *byte_idx. Every record stores (hits[k] << 1) | 1, i.e. the
 * position with the low bit set. The cursor advances by 5 per hit in every
 * step; bytes are only written when step == 1.
 *
 * mlt_data: destination buffer (only dereferenced when step == 1)
 * byte_idx: in/out write cursor into mlt_data
 * count:    number of entries in hits
 * hits:     reference positions of the leaf
 * step:     build pass; 1 writes data, anything else only measures
 *
 * The loop index is 64-bit like count. A 16-bit index would wrap to zero at
 * 65536 and never reach a larger count, looping forever past the buffer.
 */
void addMultiHitLeafNode(uint8_t* mlt_data, uint64_t* byte_idx, uint64_t count, uint64_t* hits, int step)
{
    for (uint64_t k = 0; k < count; ++k) {
        if (step == 1) {
            uint64_t leaf_data = (hits[k] << 1) | 1ULL;
            memcpy_bwamem(&mlt_data[*byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
        }
        *byte_idx += 5;
    }
}
/*
 * Emit the hit count of a multi-hit leaf at *byte_idx and advance the
 * cursor by 2. Only the first two bytes of count are stored, and only when
 * step == 1.
 */
void addMultiHitLeafCount(uint8_t* mlt_data, uint64_t* byte_idx, uint64_t count, int step)
{
    const uint64_t at = *byte_idx;

    *byte_idx = at + 2;
    if (step != 1) {
        return;
    }
    memcpy_bwamem(&mlt_data[at], 2 * sizeof(uint8_t), &count, 2 * sizeof(uint8_t), __FILE__, __LINE__);
}
/*
 * Emit a 5-byte reference to a multi-hit leaf at *byte_idx and advance the
 * cursor by 5. The stored value is (mh_byte_idx << 1) | 1: the offset into
 * the multi-hit area with the low bit set. Bytes are only written when
 * step == 1.
 */
void addMultiHitLeafPtr(uint8_t *mlt_data, uint64_t *byte_idx, uint64_t mh_byte_idx, int step)
{
    const uint64_t at = *byte_idx;

    *byte_idx = at + 5;
    if (step != 1) {
        return;
    }
    uint64_t tagged_offset = (mh_byte_idx << 1) | 1ULL;
    memcpy_bwamem(&mlt_data[at], 5 * sizeof(uint8_t), &tagged_offset, 5 * sizeof(uint8_t), __FILE__, __LINE__);
}
// Serialise the subtree below n into the index buffers.
//   n              node whose children are written (must have at least one child)
//   mlt_data       tree buffer; only written when step == 1
//   mh_data        multi-hit leaf buffer; only written when step == 1
//   size           in/out cursor into mlt_data
//   mh_size        in/out cursor into mh_data
//   depth          depth of n, advanced for recursive calls
//   numHits        accumulator, increased by the hits of a single LEAF child
//   max_ptr        in/out: largest child pointer (offset from this node) seen so far
//   next_ptr_width number of bytes reserved/written per child pointer
//   step           build pass; cursors advance in every step
// One child: a code byte, then either a multi-hit leaf reference (LEAF) or a
// uniform run followed by the child's own subtree (UNIFORM).
// Several children: a code byte with two bits per base, space for one pointer
// per non-EMPTY, non-LEAF child, then all leaves, then the DIVERGE subtrees;
// finally the pointers are filled in as offsets relative to the code byte.
void ert_traverse_kmertree(node_t *n, uint8_t *mlt_data, uint8_t *mh_data, uint64_t *size, uint64_t *mh_size, int depth, uint64_t *numHits, uint64_t *max_ptr, uint64_t next_ptr_width, int step)
{
int j = 0;
int cur_depth = depth;
uint64_t byte_idx = *size;
uint64_t mh_byte_idx = *mh_size; // index at which hit positions start being recorded
uint8_t code = 0;
assert(n->numChildren != 0);
if (n->numChildren == 1) {
node_t *child = n->child_nodes[0];
uint8_t c = child->seq[child->pos];
if (child->type == LEAF) {
//
// FIXME: In rare cases, when one of the occurrences of the k-mer is at the end of the reference,
// # hits for parent node is not equal to the sum of #hits of children nodes, and we trigger the assertion below
// This should not affect results as long as readLength > kmerSize
// assert(child->numHits > 1);
code |= (LEAF << (c << 1));
addCode(mlt_data, &byte_idx, code, step);
// The tree holds a reference; count and positions go to the multi-hit buffer.
addMultiHitLeafPtr(mlt_data, &byte_idx, mh_byte_idx, step);
addMultiHitLeafCount(mh_data, &mh_byte_idx, child->numHits, step);
addMultiHitLeafNode(mh_data, &mh_byte_idx, child->numHits, child->hits, step);
*numHits += child->numHits;
}
else {
assert(child->type == UNIFORM);
code |= (UNIFORM << (c << 1));
addCode(mlt_data, &byte_idx, code, step);
addUniformNode(mlt_data, &byte_idx, child->num_bp, &child->seq[child->pos], child->numHits, step);
ert_traverse_kmertree(child, mlt_data, mh_data, &byte_idx, &mh_byte_idx, cur_depth + child->num_bp,
numHits, max_ptr, next_ptr_width, step);
}
}
else {
uint8_t numEmpty = 0, numLeaves = 0;
// First pass: build the code byte (two bits per base) and count child kinds.
for (j = 0; j < n->numChildren; ++j) {
node_t *child = n->child_nodes[j];
uint8_t c = child->seq[child->pos];
if (child->type == EMPTY) { numEmpty++; }
else if (child->type == LEAF) { numLeaves++; code |= (LEAF << (c << 1)); }
else { code |= (DIVERGE << (c << 1)); }
}
// NOTE(review): assumes a branching node has exactly four children, as handleDivergence creates.
uint8_t numPointers = ((4 - numEmpty - numLeaves) > 0) ? (4 - numEmpty - numLeaves) : 0;
uint64_t start_byte_idx = byte_idx;
addCode(mlt_data, &byte_idx, code, step);
uint64_t ptr_byte_idx = byte_idx;
uint64_t ptrToOtherNodes[numPointers + 1]; //!< These point to children. We have one more child than number of pointers
memset(ptrToOtherNodes, 0, (numPointers + 1) * sizeof(uint64_t));
uint64_t numHitsForChildren[numPointers + 1];
memset(numHitsForChildren, 0, (numPointers + 1) * sizeof(uint64_t));
uint64_t other_idx = 0;
// Reserve the pointer slots; they are filled in once child sizes are known.
if (numPointers > 0) { byte_idx += (numPointers * next_ptr_width); }
// Second pass: write every leaf right after the pointer slots.
for (j = 0; j < n->numChildren; ++j) {
node_t *child = n->child_nodes[j];
if (child->type == LEAF) {
if (child->numHits == 1) {
addLeafNode(mlt_data, &byte_idx, child->hits[0], step);
}
else {
addMultiHitLeafPtr(mlt_data, &byte_idx, mh_byte_idx, step);
addMultiHitLeafCount(mh_data, &mh_byte_idx, child->numHits, step);
addMultiHitLeafNode(mh_data, &mh_byte_idx, child->numHits, child->hits, step);
}
}
}
// The first DIVERGE subtree starts where the leaves end.
if (numPointers > 0) { ptrToOtherNodes[other_idx] = byte_idx; }
// Third pass: serialise DIVERGE subtrees, recording where each one starts.
for (j = 0; j < n->numChildren; ++j) {
node_t *child = n->child_nodes[j];
assert(child->type != UNIFORM); // why?
if (child->type == DIVERGE) {
ert_traverse_kmertree(child, mlt_data, mh_data, &byte_idx, &mh_byte_idx,
cur_depth+1, numHits, max_ptr, next_ptr_width, step);
numHitsForChildren[other_idx] = child->numHits;
other_idx++;
ptrToOtherNodes[other_idx] = byte_idx;
}
}
// Track the largest relative pointer so the caller can choose a pointer width.
for (j = 0; j < numPointers; ++j) {
uint64_t pointerToNextNode = (ptrToOtherNodes[j] - start_byte_idx);
if (pointerToNextNode > *max_ptr) {
*max_ptr = pointerToNextNode;
}
assert(pointerToNextNode < (1 << 26));
}
// Fill up pointers based on size of previous children
if (step == 1) {
for (j = 0; j < numPointers; ++j) {
uint64_t pointerToNextNode = (ptrToOtherNodes[j] - start_byte_idx);
assert(pointerToNextNode < (1 << 26));
// Pointer is stored shifted left by 6; the low bits carry the child's hit count when it is below 20.
uint64_t reseed_data = 0;
if (numHitsForChildren[j] < 20) {
reseed_data = (pointerToNextNode << 6) | (numHitsForChildren[j]);
} else {
reseed_data = (pointerToNextNode << 6);
}
memcpy_bwamem(&mlt_data[ptr_byte_idx], next_ptr_width * sizeof(uint8_t), &reseed_data, next_ptr_width * sizeof(uint8_t), __FILE__, __LINE__);
ptr_byte_idx += next_ptr_width;
}
}
}
*size = byte_idx;
*mh_size = mh_byte_idx;
}
/*
 * Free a k-mer tree: the node's hit array, every child subtree, and the
 * node itself. A NULL node is accepted and ignored.
 */
void ert_destroy_kmertree(node_t* n)
{
    if (!n) {
        return;
    }

    free(n->hits); /* free(NULL) is a no-op, so no guard is needed */

    for (int k = 0; k < n->numChildren; k++) {
        ert_destroy_kmertree(n->child_nodes[k]);
    }

    free(n);
}
//
// This function builds the ERT index.
// Note on pointers to child nodes: When building the radix tree for each k-mer,
// we try 3 values for pointers to child nodes, 2,3,4 B and choose the smallest
// one possible.
//
void *buildIndex(void *arg)
{
thread_data_t *data = (thread_data_t *)arg;
bwtintv_t ik, ok[4];
uint64_t idx = 0;
uint8_t aq[kmerSize];
int i;
uint8_t c;
uint64_t lep, prevHits, numBytesPerKmer, numBytesForMh, ref_pos, total_hits = 0, ptr = 0, max_next_ptr = 0;
uint64_t next_ptr_width = 0;
uint64_t nKmerSmallPtr = 0, nKmerMedPtr = 0, nKmerLargePtr = 0;
uint16_t kmer_data = 0;
// File to write the multi-level tree index
char ml_tbl_file_name[PATH_MAX];
sprintf(ml_tbl_file_name, "%s.ert.mlt_table_%d", data->filePrefix, data->tid);
// Log progress
char log_file_name[PATH_MAX];
sprintf(log_file_name, "%s.log_%d", data->filePrefix, data->tid);
FILE *ml_tbl_fd = 0, *log_fd = 0;
ml_tbl_fd = fopen(ml_tbl_file_name, "wb");
if (ml_tbl_fd == NULL)
{
printf("\nCan't open file or file doesn't exist. mlt_table errno = %d\n", errno);
pthread_exit(NULL);
}
if (bwa_verbose >= 4)
{
log_fd = fopen(log_file_name, "w");
if (log_fd == NULL)
{
printf("\nCan't open file or file doesn't exist. log errno = %d\n", errno);
pthread_exit(NULL);
}
log_file(log_fd, "Start: %lu End: %lu", data->startKmer, data->endKmer);
}
//
// Loop for each k-mer and compute LEP when the hit set changes
//
double tmp_time = realtime(), elapsed_time;
int p = 0;
for (idx = data->startKmer; idx < data->endKmer; ++idx)
{
max_next_ptr = 0;
next_ptr_width = 0;
c = 0;
lep = 0; // k-1-bit LEP
prevHits = 0;
numBytesPerKmer = 0;
numBytesForMh = 0;
kmertoquery(idx, aq, kmerSize); // represent k-mer as uint8_t*
assert(aq[0] >= 0 && aq[0] <= 3);
bwt_set_intv(data->bid->bwt, aq[0], ik); // the initial interval of a single base
ik.info = 1;
prevHits = ik.x[2];
// 打印进度信息
if (data->tid == 0 && (idx - data->startKmer) % ((data->endKmer - data->startKmer) / 100) == 0) {
if (p++ > 0) {
elapsed_time = realtime() - tmp_time;
fprintf(stderr, "[step: %d] %d%% percent complished. %f s elapsed.\n", data->step, p - 1, elapsed_time);
}
}
//
// Backward search k-mer
//
for (i = 1; i < kmerSize; ++i)
{
c = 3 - aq[i];
bwt_extend(data->bid->bwt, &ik, ok, 0); // ok contains the result of BWT extension
if (ok[c].x[2] != prevHits) { // hit set changes
lep |= (1ULL << (i - 1));
}
//
// Extend left till k-mer has zero hits
//
if (ok[c].x[2] >= 1) { prevHits = ok[c].x[2]; ik = ok[c]; ik.info = i + 1; }
else { break; }
}
uint64_t num_hits = ok[c].x[2];
if (num_hits == 0) { // "Empty" - k-mer absent in the reference genome
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | INVALID;
} else if (num_hits == 1) { // "Leaf" - k-mer has a single hit in the reference genome
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (SINGLE_HIT_LEAF);
numBytesPerKmer = 6; // 该15-kmer下所包含的所有匹配位置信息只需要6字节就能表示因为只有一个匹配
uint8_t byte_idx = 0;
uint8_t mlt_data[numBytesPerKmer];
if (data->step == 1) { mlt_data[byte_idx] = 0; } // Mark that the hit is not a multi-hit
byte_idx++;
data->numHits[idx - data->startKmer] += num_hits;
if (data->step == 1) {
// Look up suffix array to identify the hit position
ref_pos = fmt_sa(data->bid->fmt, ok[c].x[0]);
// ref_pos = bwt_sa(data->bid->bwt, ok[c].x[0]);
uint64_t leaf_data = ref_pos << 1;
memcpy_bwamem(&mlt_data[byte_idx], 5 * sizeof(uint8_t), &leaf_data, 5 * sizeof(uint8_t), __FILE__, __LINE__);
fwrite(mlt_data, sizeof(uint8_t), numBytesPerKmer, ml_tbl_fd);
}
byte_idx += 5;
}
//
// If the number of hits for the k-mer does not exceed the HIT_THRESHOLD,
// prefer a radix-tree over a multi-level table as the radix tree for the
// k-mer is likely to be sparse.
//
else if (num_hits <= HIT_THRESHOLD) {
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (INFREQUENT);
node_t *n = (node_t *)calloc(1, sizeof(node_t));
assert(n != NULL);
n->type = DIVERGE;
n->pos = 0;
n->num_bp = 0;
memcpy_bwamem(n->seq, ERT_MAX_READ_LEN * sizeof(uint8_t), aq, kmerSize * sizeof(uint8_t), __FILE__, __LINE__);
n->l_seq = kmerSize;
n->parent_node = 0;
n->numChildren = 0;
n->numHits = num_hits;
n->child_nodes[0] = n->child_nodes[1] = n->child_nodes[2] = n->child_nodes[3] = 0;
n->start_addr = 0;
uint8_t *mlt_data = 0;
next_ptr_width = 2;
uint8_t *mh_data = 0;
uint64_t size = 0;
if (data->step == 1) {
if (idx != (numKmers - 1)) { // 不等于最后一个处理的kmer
size = (data->byte_offsets[idx+1] >> KMER_DATA_BITWIDTH) - (data->byte_offsets[idx] >> KMER_DATA_BITWIDTH); // 本kmer所占存储大小
assert(size < (1 << 26)); // 小于64MB
} else { // FIXME: This is a hack. We know the size of every k-mer tree except the last-kmer
size = 1 << 26;
}
next_ptr_width = (((data->byte_offsets[idx] >> 22) & 3) == 0) ? 4 : ((data->byte_offsets[idx] >> 22) & 3);
mlt_data = (uint8_t *)calloc(size, sizeof(uint8_t));
assert(mlt_data != NULL);
mh_data = (uint8_t *)calloc(size, sizeof(uint8_t));
assert(mh_data != NULL);
}
ert_build_kmertree(data->bid, ik, ok, i, n, data->step, data->readLength - 1); // n作为parent node
// Reserve space for pointer to start of multi-hit address space
numBytesPerKmer = 4; // 4个字节的指针
// Traverse tree and place data in memory space
ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh,
i, &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
if (data->step == 0 || data->step == 1) {
if (max_next_ptr >= 1024 && max_next_ptr < 262144) {
next_ptr_width = 3;
max_next_ptr = 0;
numBytesPerKmer = 4;
numBytesForMh = 0;
ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh, i,
&data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
}
if (max_next_ptr >= 262144) {
next_ptr_width = 4;
max_next_ptr = 0;
numBytesPerKmer = 4;
numBytesForMh = 0;
ert_traverse_kmertree(n, mlt_data, mh_data, &numBytesPerKmer, &numBytesForMh, i,
&data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step);
}
}
ert_destroy_kmertree(n);
assert(numBytesPerKmer < (1 << 26));
// assert(numBytesForMh < (1 << 24));
if (data->step == 1) {
if (idx != numKmers-1) assert((numBytesPerKmer+numBytesForMh) == size);
memcpy_bwamem(mlt_data, 4*sizeof(uint8_t), &numBytesPerKmer, 4*sizeof(uint8_t), __FILE__, __LINE__);
fwrite(mlt_data, sizeof(uint8_t), numBytesPerKmer, ml_tbl_fd);
free(mlt_data);
fwrite(mh_data, sizeof(uint8_t), numBytesForMh, ml_tbl_fd);
free(mh_data);
}
}
//
// If the number of hits for the k-mer exceeds the HIT_THRESHOLD,
// prefer a multi-level table to encode the suffixes for the
// k-mer
//
else {
kmer_data = ((lep & LEP_MASK) << METADATA_BITWIDTH) | (FREQUENT);
uint8_t *mlt_data = 0;
uint8_t *mh_data = 0;
next_ptr_width = 2;
uint64_t size = 0;
if (data->step == 1) {
if (idx != (numKmers - 1)) {
size = (data->byte_offsets[idx+1] >> KMER_DATA_BITWIDTH) - (data->byte_offsets[idx] >> KMER_DATA_BITWIDTH);
assert(size < (1 << 26));
}
else { //!< FIXME: Hack. We do not store the size of the last-kmer
size = 1 << 26;
}
next_ptr_width = (((data->byte_offsets[idx] >> 22) & 3) == 0) ? 4 : ((data->byte_offsets[idx] >> 22) & 3);
mlt_data = (uint8_t *)calloc(size, sizeof(uint8_t));
assert(mlt_data != NULL);
mh_data = (uint8_t *)calloc(size, sizeof(uint8_t));
assert(mh_data != NULL);
}
numBytesPerKmer = 4;
ert_build_table(data->bid, ik, ok, mlt_data, mh_data, &numBytesPerKmer,
&numBytesForMh, aq, &data->numHits[idx - data->startKmer], &max_next_ptr, next_ptr_width, data->step,
data->readLength - 1);
if (data->step == 0 || data->step == 1) {
if (max_next_ptr >= 1024 && max_next_ptr < 262144) {
next_ptr_width = 3;
max_next_ptr = 0;
numBytesPerKmer = 4;
numBytesForMh = 0;
ert_build_table(data->bid, ik, ok, mlt_data, mh_data,
&numBytesPerKmer, &numBytesForMh, aq, &data->numHits[idx-data->startKmer],
&max_next_ptr, next_ptr_width, data->step, data->readLength - 1);
}
if (max_next_ptr >= 262144) {
next_ptr_width = 4;
max_next_ptr = 0;
numBytesPerKmer = 4;
numBytesForMh = 0;
ert_build_table(data->bid, ik, ok, mlt_data, mh_data,
&numBytesPerKmer, &numBytesForMh, aq, &data->numHits[idx-data->startKmer],
&max_next_ptr, next_ptr_width, data->step, data->readLength - 1);
}
}
//
// Traverse tree and place data in memory
//
assert(numBytesPerKmer < (1 << 26));
// assert(numBytesForMh < (1 << 24));
if (data->step == 1) {
if (idx != numKmers-1) assert((numBytesPerKmer+numBytesForMh) == size);
memcpy_bwamem(mlt_data, 4*sizeof(uint8_t), &numBytesPerKmer, 4*sizeof(uint8_t), __FILE__, __LINE__);
fwrite(mlt_data, sizeof(uint8_t), numBytesPerKmer, ml_tbl_fd);
free(mlt_data);
fwrite(mh_data, sizeof(uint8_t), numBytesForMh, ml_tbl_fd);
free(mh_data);
}
}
if (num_hits < 20) {
data->kmer_table[idx - data->startKmer] = (ptr << KMER_DATA_BITWIDTH) | (num_hits << 17) | kmer_data;
}
else {
data->kmer_table[idx - data->startKmer] = (ptr << KMER_DATA_BITWIDTH) | kmer_data;
}
ptr += (numBytesPerKmer + numBytesForMh);
if (next_ptr_width == 2) {
nKmerSmallPtr++;
}
else if (next_ptr_width == 3) {
nKmerMedPtr++;
}
else if (next_ptr_width == 4) {
nKmerLargePtr++;
next_ptr_width = 0;
}
data->kmer_table[idx - data->startKmer] |= (next_ptr_width << 22);
if (bwa_verbose >= 4) {
if (idx == data->endKmer-1) {
log_file(log_fd, "TotalSize:%lu\n", ptr);
}
if ((idx-data->startKmer) % 10000000 == 0) {
log_file(log_fd, "%lu,%lu,%lu", idx, numBytesPerKmer, ptr);
}
}
total_hits += data->numHits[idx - data->startKmer];
}
data->end_offset = ptr;
if (bwa_verbose >= 4) {
log_file(log_fd, "Hits:%lu\n", total_hits);
log_file(log_fd, "nKmersSmallPtrs:%lu", nKmerSmallPtr);
log_file(log_fd, "nKmersMedPtrs:%lu", nKmerMedPtr);
log_file(log_fd, "nKmersLargePtrs:%lu", nKmerLargePtr);
fclose(log_fd);
}
fclose(ml_tbl_fd);
pthread_exit(NULL);
}
/*
 * Build the complete ERT index with num_threads worker threads and write it
 * to disk.
 *
 * kmer_tbl_file_name: output file for the k-mer table (numKmers 64-bit entries)
 * bid:                index handle shared (read-only here) by all workers
 * prefix:             path prefix; the merged tree goes to "<prefix>.ert.mlt.table"
 *                     and workers use "<prefix>.ert.mlt_table_<tid>" temporaries
 * num_threads:        number of worker threads; k-mers are split evenly
 * readLength:         forwarded to the workers
 *
 * Two passes of buildIndex are run. Pass 0 only computes the size of each
 * k-mer's tree; the per-thread relative offsets are then turned into absolute
 * ones. Pass 1 repeats the work with those offsets available and writes the
 * per-thread tree files, which are finally concatenated in thread order and
 * removed.
 *
 * If pthread_create fails the function prints a message and returns early;
 * I/O failures on the output files terminate the process with exit(1).
 */
void buildERTKmerTrees(char *kmer_tbl_file_name, bwaidx_t *bid, char *prefix, int num_threads, int readLength)
{
    FILE *kmer_tbl_fd;
    pthread_t thr[num_threads];
    int i, rc;
    thread_data_t thr_data[num_threads];
    uint64_t numKmersThread = (uint64_t)ceil(((double)(numKmers)) / num_threads);
    if (bwa_verbose >= 3)
    {
        fprintf(stderr, "[M::%s] Computing tree sizes for each k-mer\n", __func__);
    }
    //
    // STEP 1: Create threads. Each thread builds the index for a fraction of the k-mers
    //
    for (i = 0; i < num_threads; ++i)
    {
        thr_data[i].tid = i;
        thr_data[i].step = 0;
        thr_data[i].readLength = readLength;
        thr_data[i].bid = bid;
        thr_data[i].startKmer = i * numKmersThread;
        thr_data[i].endKmer = ((i + 1) * numKmersThread > numKmers) ? numKmers : (i + 1) * numKmersThread;
        thr_data[i].end_offset = 0;
        thr_data[i].filePrefix = prefix;
        uint64_t numKmersToProcess = thr_data[i].endKmer - thr_data[i].startKmer;
        thr_data[i].kmer_table = (uint64_t *)calloc(numKmersToProcess, sizeof(uint64_t));
        assert(thr_data[i].kmer_table != NULL);
        thr_data[i].numHits = (uint64_t *)calloc(numKmersToProcess, sizeof(uint64_t));
        assert(thr_data[i].numHits != NULL);
        if ((rc = pthread_create(&thr[i], NULL, buildIndex, &thr_data[i])))
        {
            fprintf(stderr, "[M::%s] error: pthread_create, rc: %d\n", __func__, rc);
            return;
        }
    }
    //
    // block until all threads complete
    //
    for (i = 0; i < num_threads; ++i)
    {
        pthread_join(thr[i], NULL);
    }
    //
    // Compute absolute offsets for each kmer's tree from per-thread relative offsets
    //
    uint64_t *kmer_table = (uint64_t *)calloc(numKmers, sizeof(uint64_t));
    assert(kmer_table != NULL);
    int tidx;
    uint64_t kidx;
    uint64_t numProcessed = 0;
    uint64_t offset = 0;
    for (tidx = 0; tidx < num_threads; ++tidx)
    {
        uint64_t numKmersToProcess = thr_data[tidx].endKmer - thr_data[tidx].startKmer;
        for (kidx = 0; kidx < numKmersToProcess; ++kidx)
        {
            // Unpack the entry, rebase its offset by the bytes of all earlier threads, repack.
            uint64_t rel_offset = thr_data[tidx].kmer_table[kidx] >> KMER_DATA_BITWIDTH;
            uint16_t kmer_data = thr_data[tidx].kmer_table[kidx] & KMER_DATA_MASK;
            uint64_t ptr_width = (thr_data[tidx].kmer_table[kidx] >> 22) & 3;
            uint64_t reseed_hits = (thr_data[tidx].kmer_table[kidx] >> 17) & 0x1F;
            kmer_table[numProcessed + kidx] = ((offset + rel_offset) << KMER_DATA_BITWIDTH)
                                            | (ptr_width << 22)
                                            | (reseed_hits << 17)
                                            | (kmer_data);
        }
        numProcessed += numKmersToProcess;
        offset += thr_data[tidx].end_offset;
        free(thr_data[tidx].kmer_table);
        free(thr_data[tidx].numHits);
    }
    //
    // STEP 2 : Using estimates of each k-mer's tree size from the previous step, write the index to file
    //
    uint64_t total_size = offset + (numKmers * 8UL);
    if (bwa_verbose >= 3) {
        fprintf(stderr, "[M::%s] Total size of ERT index = %lu B (Expected). (k-mer,tree) = (%lu,%lu)\n", __func__, total_size, numKmers * 8UL, offset);
    }
    for (i = 0; i < num_threads; ++i) {
        thr_data[i].tid = i;
        thr_data[i].step = 1;
        thr_data[i].readLength = readLength;
        thr_data[i].bid = bid;
        thr_data[i].startKmer = i*numKmersThread;
        thr_data[i].endKmer = ((i + 1)*numKmersThread > numKmers) ? numKmers : (i + 1)*numKmersThread;
        thr_data[i].end_offset = 0;
        thr_data[i].filePrefix = prefix;
        uint64_t numKmersToProcess = thr_data[i].endKmer - thr_data[i].startKmer;
        thr_data[i].kmer_table = (uint64_t*) calloc(numKmersToProcess, sizeof(uint64_t));
        assert(thr_data[i].kmer_table != NULL);
        thr_data[i].numHits = (uint64_t*) calloc(numKmersToProcess, sizeof(uint64_t));
        assert(thr_data[i].numHits != NULL);
        // Absolute offsets from pass 0 tell each worker how large every k-mer's tree is.
        thr_data[i].byte_offsets = kmer_table;
        if ((rc = pthread_create(&thr[i], NULL, buildIndex, &thr_data[i]))) {
            fprintf(stderr, "[M::%s] error: pthread_create, rc: %d\n", __func__, rc);
            return;
        }
    }
    for (i = 0; i < num_threads; ++i) {
        pthread_join(thr[i], NULL);
    }
    if (bwa_verbose >= 3) {
        fprintf(stderr, "[M::%s] Merging per-thread tables ...\n", __func__);
    }
    //
    // Compute absolute offsets for each k-mer tree's root node
    //
    numProcessed = 0;
    offset = 0;
    for (tidx = 0; tidx < num_threads; ++tidx) {
        uint64_t numKmersToProcess = thr_data[tidx].endKmer - thr_data[tidx].startKmer;
        for (kidx = 0; kidx < numKmersToProcess; ++kidx) {
            uint64_t rel_offset = thr_data[tidx].kmer_table[kidx] >> KMER_DATA_BITWIDTH;
            uint16_t kmer_data = thr_data[tidx].kmer_table[kidx] & KMER_DATA_MASK;
            uint64_t ptr_width = (thr_data[tidx].kmer_table[kidx] >> 22) & 3;
            uint64_t reseed_hits = (thr_data[tidx].kmer_table[kidx] >> 17) & 0x1F;
            kmer_table[numProcessed + kidx] = ((offset + rel_offset) << KMER_DATA_BITWIDTH)
                                            | (ptr_width << 22)
                                            | (reseed_hits << 17)
                                            | (kmer_data);
        }
        numProcessed += numKmersToProcess;
        offset += thr_data[tidx].end_offset;
        free(thr_data[tidx].kmer_table);
        free(thr_data[tidx].numHits);
    }
    kmer_tbl_fd = fopen(kmer_tbl_file_name, "wb");
    if (kmer_tbl_fd == NULL) {
        fprintf(stderr, "[M::%s] Can't open file or file doesn't exist.\n", __func__);
        exit(1);
    }
    if (fwrite(kmer_table, sizeof(uint64_t), numKmers, kmer_tbl_fd) != numKmers) {
        fprintf(stderr, "[M::%s] Can't write k-mer table to file.\n", __func__);
        exit(1);
    }
    fclose(kmer_tbl_fd);
    free(kmer_table);
    //
    // Merge all per-thread trees
    //
    const int file_buf_size = 64 * 1024 * 1024;
    uint8_t *file_buf = (uint8_t *)malloc(file_buf_size); // 64MB
    assert(file_buf != NULL);
    char ml_tbl_file_name[PATH_MAX] = {0};
    snprintf(ml_tbl_file_name, sizeof(ml_tbl_file_name), "%s.ert.mlt.table", prefix);
    if (remove(ml_tbl_file_name) == 0) {
        fprintf(stderr, "[M::%s] Overwriting existing index file (tree)\n", __func__);
    }
    FILE *o_mlt = fopen(ml_tbl_file_name, "wb");
    if (o_mlt == NULL)
    {
        fprintf(stderr, "[M::%s] Can't open output index file for writing.\n", __func__);
        exit(1);
    }
    for (tidx = 0; tidx < num_threads; ++tidx) {
        snprintf(ml_tbl_file_name, sizeof(ml_tbl_file_name), "%s.ert.mlt_table_%d", prefix, tidx);
        FILE *i_mlt = fopen(ml_tbl_file_name, "rb");
        if (i_mlt == NULL) {
            fprintf(stderr, "[M::%s] Can't open per-thread index file for thread %d\n", __func__, tidx);
            exit(1);
        }
        size_t fr = 0;
        while ((fr = fread(file_buf, 1, file_buf_size, i_mlt)) != 0) {
            // A short write would silently truncate the index, so treat it as fatal.
            if (fwrite(file_buf, 1, fr, o_mlt) != fr) {
                fprintf(stderr, "[M::%s] Can't write to output index file (tree)\n", __func__);
                exit(1);
            }
        }
        // Close the temporary before removing it; the stream was previously leaked.
        fclose(i_mlt);
        if (remove(ml_tbl_file_name) != 0) {
            fprintf(stderr, "[M::%s] Can't remove per-thread index file (tree) for thread %d\n", __func__, tidx);
            exit(1);
        }
    }
    free(file_buf);
    // Closing flushes buffered data; a failure here means the index is incomplete.
    if (fclose(o_mlt) != 0) {
        fprintf(stderr, "[M::%s] Can't write to output index file (tree)\n", __func__);
        exit(1);
    }
}