322 lines
7.5 KiB
C++
322 lines
7.5 KiB
C++
|
|
/*
    Description: When reading sam/bam, allocate one large buf to hold the data
    Copyright : All rights reserved by ICT
    Author : Zhang Zhonghai
    Date : 2019/11/27
*/
|
|||
|
|
|
|||
|
|
#include "bam_buf.h"
|
|||
|
|
|
|||
|
|
/*
 * BamBuf class
 */
|
|||
|
|
// Read records until the input is exhausted or the buffer is full
|
|||
|
|
int BamBuf::ReadBam()
|
|||
|
|
{
|
|||
|
|
int read_num = 0;
|
|||
|
|
if (handle_last)
|
|||
|
|
{ // 处理上次读入的最后一个bam
|
|||
|
|
if (has_enough_space())
|
|||
|
|
{ // 必须调用,在边界处调整memffset
|
|||
|
|
++read_num;
|
|||
|
|
append_one_bam();
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
return read_num; // 还是没空间
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0)
|
|||
|
|
{
|
|||
|
|
bw->end_pos_ = BamWrap::BamEndPos(bw->b);
|
|||
|
|
if (has_enough_space())
|
|||
|
|
{ // 还有空间
|
|||
|
|
append_one_bam();
|
|||
|
|
++read_num; // 放进缓存才算读取到
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
if (read_stat_ >= 0)
|
|||
|
|
{
|
|||
|
|
handle_last = true;
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
handle_last = false;
|
|||
|
|
}
|
|||
|
|
return read_num;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Initialize the buffer
|
|||
|
|
/*
 * Bind the buffer to an input file and allocate its storage.
 *
 * fp       - input file handle that ReadBam() will read from
 * hdr      - header passed to sam_read1() together with fp
 * mem_size - size in bytes of the memory block used to store records
 *
 * Prints a message to stderr and terminates the process with exit(-1) if any
 * allocation fails.
 */
void BamBuf::Init(samFile *fp,
                  sam_hdr_t *hdr,
                  int64_t mem_size)
{
    this->fp = fp;
    this->hdr = hdr;
    this->mem_size = mem_size;
    this->mem = (uint8_t *)malloc(mem_size);
    this->bw = (BamWrap *)malloc(sizeof(BamWrap));
    // Only touch bw->b once bw is known to be valid; dereferencing a failed
    // malloc result before the NULL check below would be undefined behavior.
    if (this->bw != NULL)
    {
        this->bw->b = bam_init1();
    }
    if (this->bw == NULL ||
        this->mem == NULL ||
        this->bw->b == NULL)
    {
        fprintf(stderr, "allocate memory failed! Abort\n");
        exit(-1);
    }
}
|
|||
|
|
|
|||
|
|
void BamBuf::ClearBeforeIdx(size_t idxInBv)
|
|||
|
|
{
|
|||
|
|
if (idxInBv < 1)
|
|||
|
|
return;
|
|||
|
|
int i = 0, j = idxInBv;
|
|||
|
|
for (; j < bv.size(); ++i, ++j)
|
|||
|
|
{
|
|||
|
|
bv[i] = bv[j];
|
|||
|
|
}
|
|||
|
|
bv.resize(i);
|
|||
|
|
prepare_read();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
void BamBuf::ClearAll()
|
|||
|
|
{
|
|||
|
|
|
|||
|
|
bv.clear();
|
|||
|
|
prepare_read();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Prepare for the next read: compute the boundary bookkeeping
|
|||
|
|
inline void BamBuf::prepare_read()
|
|||
|
|
{
|
|||
|
|
// 计算余留的下次计算可能用到的bam所占的位置
|
|||
|
|
if (bv.size() > 0)
|
|||
|
|
{
|
|||
|
|
BamWrap *bw = bv[0];
|
|||
|
|
legacy_start = (int64_t)bw - (int64_t)mem;
|
|||
|
|
bw = bv.back();
|
|||
|
|
legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
legacy_start = legacy_end = 0;
|
|||
|
|
mem_offset = 0; // 上次没剩下,那就从头存储
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check whether the buffer still has room
|
|||
|
|
inline bool BamBuf::has_enough_space()
|
|||
|
|
{
|
|||
|
|
const uint32_t bam_len = bw->length();
|
|||
|
|
int64_t potential_end = mem_offset + bam_len;
|
|||
|
|
if (legacy_start <= legacy_end)
|
|||
|
|
legacy_start += mem_size;
|
|||
|
|
if (potential_end >= legacy_start)
|
|||
|
|
{
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
if (potential_end >= mem_size)
|
|||
|
|
{
|
|||
|
|
mem_offset = 0;
|
|||
|
|
}
|
|||
|
|
int64_t virtual_offset = mem_offset;
|
|||
|
|
if (virtual_offset < legacy_end)
|
|||
|
|
virtual_offset += mem_size;
|
|||
|
|
potential_end = virtual_offset + bam_len;
|
|||
|
|
return potential_end < legacy_start;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Store one bam record that has just been read
|
|||
|
|
inline void BamBuf::append_one_bam()
|
|||
|
|
{
|
|||
|
|
BamWrap *bwp = (BamWrap *)(mem + mem_offset);
|
|||
|
|
*bwp = *bw;
|
|||
|
|
bwp->b = (bam1_t *)((char *)bwp + sizeof(*bwp));
|
|||
|
|
bam1_t *bp = bwp->b;
|
|||
|
|
*bp = *bw->b;
|
|||
|
|
bp->data = (uint8_t *)((char *)bwp->b + sizeof(bam1_t));
|
|||
|
|
memcpy(bp->data, bw->b->data, bw->b->l_data);
|
|||
|
|
// 更新下次存储的位置
|
|||
|
|
mem_offset = (mem_offset + bw->length() + 8 - 1) & ~((size_t)(8 - 1));
|
|||
|
|
|
|||
|
|
// cout << "size: " << bv.size() << " " << buf_name << endl;
|
|||
|
|
bv.push_back(bwp);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Handle the last read left over from the previous read call
|
|||
|
|
inline bool BamBuf::handle_last_read()
|
|||
|
|
{
|
|||
|
|
if (handle_last)
|
|||
|
|
{ // 处理上次读入的最后一个bam
|
|||
|
|
if (has_enough_space())
|
|||
|
|
{ // 必须调用,在边界处调整memffset
|
|||
|
|
append_one_bam();
|
|||
|
|
handle_last = false;
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/*
 * AsyncIoBamBuf class
 */
|
|||
|
|
// Initialize the buffers
|
|||
|
|
/*
 * Set up the underlying buffer(s).
 *
 * fp       - input file handle forwarded to the BamBuf instances
 * hdr      - header forwarded to the BamBuf instances
 * mem_size - total memory budget in bytes; in async mode each of the two
 *            buffers receives half of it (mem_size >> 1)
 *
 * In async mode the thread handle storage is allocated as well; the process is
 * terminated with exit(-1) if that allocation fails.
 */
void AsyncIoBamBuf::Init(samFile *fp,
                         sam_hdr_t *hdr,
                         int64_t mem_size)
{
    if (use_async_io_)
    {
        buf1_.Init(fp, hdr, mem_size >> 1);
        buf2_.Init(fp, hdr, mem_size >> 1);
        pi_ = &buf1_;
        po_ = &buf2_;
        tid_ = (pthread_t *)malloc(sizeof(pthread_t));
        // tid_ is dereferenced by pthread_join() later, so a failed allocation
        // must not go unnoticed. Same handling as BamBuf::Init.
        if (tid_ == NULL)
        {
            fprintf(stderr, "allocate memory failed! Abort\n");
            exit(-1);
        }
    }
    else
    {
        buf1_.Init(fp, hdr, mem_size);
        pi_ = &buf1_;
    }
}
|
|||
|
|
|
|||
|
|
// Read data
|
|||
|
|
int AsyncIoBamBuf::ReadBam()
|
|||
|
|
{
|
|||
|
|
if (use_async_io_)
|
|||
|
|
{
|
|||
|
|
return async_read_bam();
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
return sync_read_bam();
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
int AsyncIoBamBuf::sync_read_bam()
|
|||
|
|
{
|
|||
|
|
int read_num = 0;
|
|||
|
|
if (clear_all_)
|
|||
|
|
{
|
|||
|
|
clear_all_ = false;
|
|||
|
|
pi_->ClearAll();
|
|||
|
|
}
|
|||
|
|
else if (clear_before_idx_ > 0)
|
|||
|
|
{
|
|||
|
|
pi_->ClearBeforeIdx(clear_before_idx_);
|
|||
|
|
clear_before_idx_ = 0;
|
|||
|
|
}
|
|||
|
|
read_num = pi_->ReadBam();
|
|||
|
|
refresh_bam_arr();
|
|||
|
|
return read_num;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
int AsyncIoBamBuf::async_read_bam()
|
|||
|
|
{
|
|||
|
|
int read_num = 0;
|
|||
|
|
if (first_read_)
|
|||
|
|
{
|
|||
|
|
read_num = pi_->ReadBam();
|
|||
|
|
first_read_ = false;
|
|||
|
|
refresh_bam_arr();
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
// join, 交换缓冲区指针
|
|||
|
|
pthread_join(*tid_, 0);
|
|||
|
|
resize_buf();
|
|||
|
|
|
|||
|
|
if (need_read_)
|
|||
|
|
{ // 需要交换指针
|
|||
|
|
BamBuf *tmp = pi_;
|
|||
|
|
pi_ = po_;
|
|||
|
|
po_ = tmp;
|
|||
|
|
}
|
|||
|
|
read_num = last_read_num_;
|
|||
|
|
refresh_bam_arr();
|
|||
|
|
}
|
|||
|
|
// 异步读
|
|||
|
|
pthread_create(tid_, 0, async_read, this);
|
|||
|
|
return read_num;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
void *AsyncIoBamBuf::async_read(void *data)
|
|||
|
|
{
|
|||
|
|
AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
|
|||
|
|
if (ab->need_read_ && ab->ReadStat() >= 0)
|
|||
|
|
{ // 需要读取
|
|||
|
|
ab->last_read_num_ = ab->po_->ReadBam();
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
ab->last_read_num_ = 0;
|
|||
|
|
}
|
|||
|
|
pthread_exit(0);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Prepare for the next read and compute boundary conditions; the operation is deferred because the buf behind po_ may be reading at this moment
|
|||
|
|
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv)
|
|||
|
|
{
|
|||
|
|
clear_before_idx_ = idxInBv;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Clear all data read in last time; the operation is deferred because the buf behind po_ may be reading at this moment
|
|||
|
|
void AsyncIoBamBuf::ClearAll()
|
|||
|
|
{
|
|||
|
|
clear_all_ = true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
inline void AsyncIoBamBuf::resize_buf()
|
|||
|
|
{
|
|||
|
|
if (clear_all_)
|
|||
|
|
{ // 清理上一轮的数据
|
|||
|
|
clear_all_ = false;
|
|||
|
|
po_->ClearBeforeIdx(legacy_size_);
|
|||
|
|
pi_->ClearAll();
|
|||
|
|
if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
|
|||
|
|
{
|
|||
|
|
last_read_num_ += 1;
|
|||
|
|
legacy_size_ = pi_->Size(); // 应该只有一个read
|
|||
|
|
need_read_ = true;
|
|||
|
|
}
|
|||
|
|
else // 没空间存放,则不交换指针,或者文件已经读取完毕
|
|||
|
|
{
|
|||
|
|
legacy_size_ = 0;
|
|||
|
|
need_read_ = false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
else if (clear_before_idx_ > 0)
|
|||
|
|
{
|
|||
|
|
if (clear_before_idx_ < legacy_size_)
|
|||
|
|
{
|
|||
|
|
po_->ClearBeforeIdx(clear_before_idx_);
|
|||
|
|
legacy_size_ -= clear_before_idx_;
|
|||
|
|
// 不需要交换指针,不需要读取
|
|||
|
|
need_read_ = false;
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
po_->ClearBeforeIdx(legacy_size_);
|
|||
|
|
pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
|
|||
|
|
if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
|
|||
|
|
{
|
|||
|
|
last_read_num_ += 1;
|
|||
|
|
legacy_size_ = pi_->Size(); // 应该只有一个read
|
|||
|
|
need_read_ = true;
|
|||
|
|
}
|
|||
|
|
else // 没空间存放,则不交换指针,或者文件已经读取完毕
|
|||
|
|
{
|
|||
|
|
legacy_size_ = 0;
|
|||
|
|
need_read_ = false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
clear_before_idx_ = 0;
|
|||
|
|
}
|
|||
|
|
}
|