FastBQSR/src/bqsr/bqsr_entry.cpp

153 lines
4.3 KiB
C++
Raw Normal View History

2025-11-23 23:03:37 +08:00
/*
Description:
bambambam
Copyright : All rights reserved by ICT
Author : Zhang Zhonghai
Date : 2023/10/23
*/
#include <htslib/sam.h>
2025-12-04 22:26:13 +08:00
#include <htslib/synced_bcf_reader.h>
2025-11-23 23:03:37 +08:00
#include <htslib/thread_pool.h>
#include <spdlog/spdlog.h>
#include <iomanip>
#include <vector>
#include "bqsr_args.h"
#include "bqsr_funcs.h"
#include "bqsr_pipeline.h"
2025-12-04 22:26:13 +08:00
#include "dup_metrics.h"
#include "fastbqsr_version.h"
2025-11-23 23:03:37 +08:00
#include "read_name_parser.h"
#include "util/profiling.h"
2025-12-04 22:26:13 +08:00
#include "util/utils.h"
2025-11-23 23:03:37 +08:00
// I/O block size requested for the input BAM: 16 MiB, as a long.
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `x / BAM_BLOCK_SIZE`).
#define BAM_BLOCK_SIZE (16L * 1024 * 1024)
namespace nsgv {
2025-12-04 22:26:13 +08:00
2025-11-23 23:03:37 +08:00
std::vector<ReadNameParser> gNameParsers; // read name parser
2025-12-04 22:26:13 +08:00
DuplicationMetrics gMetrics; //
DupResult gDupRes;
PipelineArg gPipe(&gDupRes);
BQSRArg gBqsrArg; //
2025-11-23 23:03:37 +08:00
samFile *gInBamFp; // bam
sam_hdr_t *gInBamHeader; // bam
samFile *gOutBamFp; // , sambam
sam_hdr_t *gOutBamHeader; // header
2025-12-04 22:26:13 +08:00
vector <bcf_srs_t*> gKnownSitesVcfSrs; // known sites vcf srs
2025-11-23 23:03:37 +08:00
}; // namespace nsgv
// Plain byte buffer descriptor; all fields start out empty/zero.
struct ByteBuf {
    uint8_t *buf{nullptr};  // start of the storage (nullptr when unset)
    int size{0};            // presumably the number of bytes currently in use
    int capacity{0};        // presumably the number of bytes allocated at buf
};
/*
 * Returns the extension of `filename`: the text following the last '.' of its
 * final path component, without the dot itself.
 *
 * Returns "" when the final component contains no '.'. A '.' that belongs to a
 * directory name (e.g. "run.1/reads") is not treated as an extension separator.
 * Both '/' and '\\' are recognised as path separators.
 */
static std::string getFileExtension(const std::string &filename) {
    const auto last_dot = filename.find_last_of('.');
    if (last_dot == std::string::npos) {
        return "";
    }
    // A separator after the last dot means the dot is part of a directory name.
    const auto last_sep = filename.find_last_of("/\\");
    if (last_sep != std::string::npos && last_sep > last_dot) {
        return "";
    }
    return filename.substr(last_dot + 1);
}
2025-12-04 22:26:13 +08:00
// Serial BQSR driver.
//
// Reads the input BAM round by round through a BamBufType bound to
// nsgv::gInBamFp / nsgv::gInBamHeader, logs the number of reads and the
// soft-clip region of each round, and accumulates the total read count.
// The recalibration work itself is still to be implemented (see the numbered
// plan inside the loop).
//
// Returns 0.
int SerialBQSR() {
    int round = 0;
    BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
    inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
    int64_t readNumSum = 0;
    while (true) {
        ++round;
        size_t readNum = 0;
        if (inBamBuf.ReadStat() >= 0)
            readNum = inBamBuf.ReadBam();
        if (readNum < 1) {
            break;  // nothing more to read
        }
        spdlog::info("{} reads processed in {} round", readNum, round);

        // Bind by const reference so the record array is not copied every round;
        // it is only used before ClearAll() below.
        const auto &bams = inBamBuf.GetBamArr();
        spdlog::info("region: {} - {}", bams[0]->softclip_start(), bams.back()->softclip_end());

        // 1. Determine the region range covered by the bams array.
        // 2. Allocate a uint32_t array as a bitmap (reallocate it if the one from the
        //    previous round is too small) that marks whether each position of the
        //    region is covered by a known site; it must be zeroed before every round.
        // 3. Read all known sites inside the region and set the corresponding bitmap
        //    entries to 0 or 1 (used as the skip flag).
        // 4. Iterate over every record in the bams array and process it.

        readNumSum += readNum;
        inBamBuf.ClearAll();  // release this round's records before reading the next
    }
    spdlog::info("read count: {}", readNumSum);
    return 0;
}
2025-11-23 23:03:37 +08:00
// entrance of mark duplicates
2025-12-04 22:26:13 +08:00
int BaseRecalibrator() {
2025-11-23 23:03:37 +08:00
PROF_START(whole_process);
/* bam */
nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr);
if (!nsgv::gInBamFp) {
spdlog::error("[{}] load sam/bam file failed.\n", __func__);
return -1;
}
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header
2025-12-04 22:26:13 +08:00
2025-11-23 23:03:37 +08:00
// (libraryId)
nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0);
2025-12-04 22:26:13 +08:00
/* 并行读取bam数据 */
2025-11-23 23:03:37 +08:00
htsThreadPool htsPoolRead = {NULL, 0}; //
htsThreadPool htsPoolWrite = {NULL, 0}; //
htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS);
htsPoolWrite.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS);
if (!htsPoolRead.pool || !htsPoolWrite.pool) {
spdlog::error("[{}] failed to set up thread pool", __LINE__);
sam_close(nsgv::gInBamFp);
return -1;
}
hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
2025-12-04 22:26:13 +08:00
return SerialBQSR();
2025-11-23 23:03:37 +08:00
2025-12-04 22:26:13 +08:00
// // 读取known sites vcfs
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
// spdlog::info(" {}", ks);
// bcf_srs_t* srs = bcf_sr_init();
// if (!bcf_sr_add_reader(srs, ks.c_str()))
// error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum));
// nsgv::gKnownSitesVcfSrs.push_back(srs);
//
// while (bcf_sr_next_line(srs)) {
// bcf1_t* line = srs->readers[0].buffer[0];
// cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl;
// }
// }
//
// /* 先实现串行的bqsr-phase-1 */
2025-11-23 23:03:37 +08:00
sam_close(nsgv::gInBamFp);
PROF_END(gprof[GP_whole_process], whole_process);
return 0;
}