FastDup/src/util/bam_buf.cpp

248 lines
7.2 KiB
C++
Raw Normal View History

/*
Description: sam/bambuf
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2019/11/27
*/
#include "bam_buf.h"
/*
* BamBuf
*/
// 读取数据直到读完,或者缓冲区满
int BamBuf::ReadBam() {
int read_num = 0;
if (handle_last) { // 处理上次读入的最后一个bam
if (has_enough_space()) { // 必须调用在边界处调整memffset
++read_num;
append_one_bam();
} else {
return read_num; // 还是没空间
}
}
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) {
bw->end_pos_ = BamWrap::BamEndPos(bw->b);
if (has_enough_space()) { // 还有空间
// if (true) { // 还有空间
append_one_bam();
++read_num; // 放进缓存才算读取到
} else {
break;
}
}
if (read_stat_ >= 0) {
handle_last = true;
} else {
handle_last = false;
}
return read_num;
}
// 初始化缓存
void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
this->fp = fp;
this->hdr = hdr;
this->mem_size = mem_size;
this->mem = (uint8_t *)malloc(mem_size);
this->bw = (BamWrap *)malloc(sizeof(BamWrap));
this->bw->b = bam_init1();
if (bw == NULL || this->mem == NULL || this->bw->b == NULL) {
fprintf(stderr, "allocate memory failed! Abort\n");
exit(-1);
}
}
void BamBuf::ClearBeforeIdx(size_t idxInBv) {
if (idxInBv < 1)
return;
int i = 0, j = idxInBv;
for (; j < bv.size(); ++i, ++j) {
bv[i] = bv[j];
}
bv.resize(i);
prepare_read();
}
void BamBuf::ClearAll() {
bv.clear();
prepare_read();
}
// 为下一次读取做准备, 计算一些边界条件
inline void BamBuf::prepare_read() {
// 计算余留的下次计算可能用到的bam所占的位置
if (bv.size() > 0) {
BamWrap *bw = bv[0];
legacy_start = (int64_t)bw - (int64_t)mem;
bw = bv.back();
legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
} else {
legacy_start = legacy_end = 0;
mem_offset = 0; // 上次没剩下,那就从头存储
}
}
// 检查缓存是否还有空间
inline bool BamBuf::has_enough_space() {
const uint32_t bam_len = bw->length();
int64_t potential_end = mem_offset + bam_len;
if (legacy_start <= legacy_end)
legacy_start += mem_size;
if (potential_end >= legacy_start) {
return false;
}
if (potential_end >= mem_size) {
mem_offset = 0;
}
int64_t virtual_offset = mem_offset;
if (virtual_offset < legacy_end)
virtual_offset += mem_size;
potential_end = virtual_offset + bam_len;
return potential_end < legacy_start;
}
// 处理一个读取后的bam
inline void BamBuf::append_one_bam() {
BamWrap *bwp = (BamWrap *)(mem + mem_offset);
*bwp = *bw;
bwp->b = (bam1_t *)((char *)bwp + sizeof(*bwp));
bam1_t *bp = bwp->b;
*bp = *bw->b;
bp->data = (uint8_t *)((char *)bwp->b + sizeof(bam1_t));
memcpy(bp->data, bw->b->data, bw->b->l_data);
// 更新下次存储的位置
mem_offset = (mem_offset + bw->length() + 8 - 1) & ~((size_t)(8 - 1));
bv.push_back(bwp);
}
// 处理上次读入的最后一个read
inline bool BamBuf::handle_last_read() {
if (handle_last) { // 处理上次读入的最后一个bam
if (has_enough_space()) { // 必须调用在边界处调整memffset
append_one_bam();
handle_last = false;
return true;
}
}
return false;
}
/*
* AsyncIoBamBuf
*/
// 初始化缓存
void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
if (use_async_io_) {
buf1_.Init(fp, hdr, mem_size >> 1);
buf2_.Init(fp, hdr, mem_size >> 1);
pi_ = &buf1_;
po_ = &buf2_;
tid_ = (pthread_t *)malloc(sizeof(pthread_t));
} else {
buf1_.Init(fp, hdr, mem_size);
pi_ = &buf1_;
}
}
// 读取数据
int AsyncIoBamBuf::ReadBam() {
if (use_async_io_) {
hasThread = true;
return async_read_bam();
} else {
return sync_read_bam();
}
}
int AsyncIoBamBuf::sync_read_bam() {
int read_num = 0;
if (clear_all_) {
clear_all_ = false;
pi_->ClearAll();
} else if (clear_before_idx_ > 0) {
pi_->ClearBeforeIdx(clear_before_idx_);
clear_before_idx_ = 0;
}
read_num = pi_->ReadBam();
refresh_bam_arr();
return read_num;
}
int AsyncIoBamBuf::async_read_bam() {
int read_num = 0;
if (first_read_) {
read_num = pi_->ReadBam();
first_read_ = false;
refresh_bam_arr();
} else {
// join, 交换缓冲区指针
pthread_join(*tid_, 0);
resize_buf();
if (need_read_) { // 需要交换指针
BamBuf *tmp = pi_;
pi_ = po_;
po_ = tmp;
}
read_num = last_read_num_;
refresh_bam_arr();
}
// 异步读
pthread_create(tid_, 0, async_read, this);
return read_num;
}
void *AsyncIoBamBuf::async_read(void *data) {
AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
if (ab->need_read_ && ab->ReadStat() >= 0) { // 需要读取
ab->last_read_num_ = ab->po_->ReadBam();
} else {
ab->last_read_num_ = 0;
}
pthread_exit(0);
}
// 为下一次读取做准备,
// 计算一些边界条件延迟操作因为此时可能po_对应的buf正在读取
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) { clear_before_idx_ = idxInBv; }
// 清空上一次所有读入的数据延迟操作因为此时可能po_对应的buf正在读取
void AsyncIoBamBuf::ClearAll() { clear_all_ = true; }
inline void AsyncIoBamBuf::resize_buf() {
if (clear_all_) { // 清理上一轮的数据
clear_all_ = false;
po_->ClearBeforeIdx(legacy_size_);
pi_->ClearAll();
if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
last_read_num_ += 1;
legacy_size_ = pi_->Size(); // 应该只有一个read
need_read_ = true;
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
legacy_size_ = 0;
need_read_ = false;
}
} else if (clear_before_idx_ > 0) {
if (clear_before_idx_ < legacy_size_) {
po_->ClearBeforeIdx(clear_before_idx_);
legacy_size_ -= clear_before_idx_;
// 不需要交换指针,不需要读取
need_read_ = false;
} else {
po_->ClearBeforeIdx(legacy_size_);
pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
last_read_num_ += 1;
legacy_size_ = pi_->Size(); // 应该只有一个read
need_read_ = true;
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
legacy_size_ = 0;
need_read_ = false;
}
}
clear_before_idx_ = 0;
}
}