2025-11-23 23:03:37 +08:00
|
|
|
|
/*
|
|
|
|
|
|
Description:
|
|
|
|
|
|
bam,bam,bam
|
|
|
|
|
|
|
|
|
|
|
|
Copyright : All rights reserved by ICT
|
|
|
|
|
|
|
|
|
|
|
|
Author : Zhang Zhonghai
|
|
|
|
|
|
Date : 2023/10/23
|
|
|
|
|
|
*/
|
|
|
|
|
|
#include <htslib/sam.h>
|
2025-12-04 22:26:13 +08:00
|
|
|
|
#include <htslib/synced_bcf_reader.h>
|
2025-11-23 23:03:37 +08:00
|
|
|
|
#include <htslib/thread_pool.h>
|
|
|
|
|
|
#include <spdlog/spdlog.h>
|
|
|
|
|
|
|
|
|
|
|
|
#include <iomanip>
|
|
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
|
|
|
|
#include "bqsr_args.h"
|
|
|
|
|
|
#include "bqsr_funcs.h"
|
|
|
|
|
|
#include "bqsr_pipeline.h"
|
2025-12-04 22:26:13 +08:00
|
|
|
|
#include "dup_metrics.h"
|
|
|
|
|
|
#include "fastbqsr_version.h"
|
2025-11-23 23:03:37 +08:00
|
|
|
|
#include "read_name_parser.h"
|
|
|
|
|
|
#include "util/profiling.h"
|
2025-12-04 22:26:13 +08:00
|
|
|
|
#include "util/utils.h"
|
2025-11-23 23:03:37 +08:00
|
|
|
|
|
|
|
|
|
|
// I/O block size requested for the input BAM file handle: 16 MiB.
// Parenthesized so the macro expands as a single value inside larger
// expressions (e.g. `x / BAM_BLOCK_SIZE`).
#define BAM_BLOCK_SIZE (16L * 1024 * 1024)
|
|
|
|
|
|
|
|
|
|
|
|
namespace nsgv {
|
|
|
|
|
|
|
2025-12-04 22:26:13 +08:00
|
|
|
|
|
2025-11-23 23:03:37 +08:00
|
|
|
|
std::vector<ReadNameParser> gNameParsers; // read name parser
|
2025-12-04 22:26:13 +08:00
|
|
|
|
DuplicationMetrics gMetrics; //
|
|
|
|
|
|
DupResult gDupRes;
|
|
|
|
|
|
PipelineArg gPipe(&gDupRes);
|
|
|
|
|
|
|
|
|
|
|
|
BQSRArg gBqsrArg; //
|
2025-11-23 23:03:37 +08:00
|
|
|
|
samFile *gInBamFp; // bam
|
|
|
|
|
|
sam_hdr_t *gInBamHeader; // bam
|
|
|
|
|
|
samFile *gOutBamFp; // , sambam
|
|
|
|
|
|
sam_hdr_t *gOutBamHeader; // header
|
2025-12-04 22:26:13 +08:00
|
|
|
|
vector <bcf_srs_t*> gKnownSitesVcfSrs; // known sites vcf srs
|
2025-11-23 23:03:37 +08:00
|
|
|
|
}; // namespace nsgv
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
|
// Plain byte buffer descriptor: a raw pointer plus its used size and capacity.
// The struct itself performs no allocation or deallocation.
struct ByteBuf {
    uint8_t *buf{nullptr};  // start of the buffer; null until assigned
    int size{0};            // size counter, starts at 0
    int capacity{0};        // capacity counter, starts at 0
};
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Return the extension of `filename`: the text after the last '.' in its
 * final path component, without the dot itself.
 *
 * Returns an empty string when the final component contains no '.', or when
 * the '.' is the last character. A '.' that appears only in a directory part
 * (e.g. "run.v2/sample" or "./sample") is not treated as an extension.
 */
static std::string getFileExtension(const std::string &filename) {
    const std::string::size_type lastDot = filename.find_last_of('.');
    if (lastDot == std::string::npos) {
        return "";
    }
    // A dot located before the last '/' belongs to a directory name.
    const std::string::size_type lastSep = filename.find_last_of('/');
    if (lastSep != std::string::npos && lastSep > lastDot) {
        return "";
    }
    return filename.substr(lastDot + 1);
}
|
|
|
|
|
|
|
2025-12-04 22:26:13 +08:00
|
|
|
|
// 串行bqsr
|
|
|
|
|
|
int SerialBQSR() {
|
|
|
|
|
|
int round = 0;
|
|
|
|
|
|
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
|
|
|
|
|
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
|
|
|
|
|
|
int64_t readNumSum = 0;
|
|
|
|
|
|
while (1) {
|
|
|
|
|
|
++ round;
|
|
|
|
|
|
size_t readNum = 0;
|
|
|
|
|
|
if (inBamBuf.ReadStat() >= 0)
|
|
|
|
|
|
readNum = inBamBuf.ReadBam();
|
|
|
|
|
|
if (readNum < 1) {
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
spdlog::info("{} reads processed in {} round", readNum, round);
|
|
|
|
|
|
|
|
|
|
|
|
auto bams = inBamBuf.GetBamArr();
|
|
|
|
|
|
spdlog::info("region: {} - {}", bams[0]->softclip_start(), bams.back()->softclip_end());
|
|
|
|
|
|
// 1. 获取bams数组覆盖的region范围
|
|
|
|
|
|
|
|
|
|
|
|
// 2. 开辟一个uint32_t的数组作为bitmap(如果上一轮的不够就重开),用来表示region的每个位点是否有known sites覆盖(每轮使用前需清零)
|
|
|
|
|
|
|
|
|
|
|
|
// 3. 读取在region范围内的所有known sites,并为对应的bitmap设定0 or 1 (作为skip标识)
|
|
|
|
|
|
|
|
|
|
|
|
// 4. 遍历bams数组中的每一条记录并进行处理
|
|
|
|
|
|
|
|
|
|
|
|
readNumSum += readNum;
|
|
|
|
|
|
inBamBuf.ClearAll(); //
|
|
|
|
|
|
}
|
|
|
|
|
|
spdlog::info("read count: {}", readNumSum);
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-23 23:03:37 +08:00
|
|
|
|
// entrance of mark duplicates
|
2025-12-04 22:26:13 +08:00
|
|
|
|
int BaseRecalibrator() {
|
|
|
|
|
|
|
2025-11-23 23:03:37 +08:00
|
|
|
|
PROF_START(whole_process);
|
|
|
|
|
|
/* bam */
|
|
|
|
|
|
nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr);
|
|
|
|
|
|
if (!nsgv::gInBamFp) {
|
|
|
|
|
|
spdlog::error("[{}] load sam/bam file failed.\n", __func__);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
|
|
|
|
|
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header
|
2025-12-04 22:26:13 +08:00
|
|
|
|
|
2025-11-23 23:03:37 +08:00
|
|
|
|
// (libraryId)
|
|
|
|
|
|
nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0);
|
|
|
|
|
|
|
2025-12-04 22:26:13 +08:00
|
|
|
|
/* 并行读取bam数据 */
|
2025-11-23 23:03:37 +08:00
|
|
|
|
htsThreadPool htsPoolRead = {NULL, 0}; // ,
|
|
|
|
|
|
htsThreadPool htsPoolWrite = {NULL, 0}; //
|
|
|
|
|
|
htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS);
|
|
|
|
|
|
htsPoolWrite.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS);
|
|
|
|
|
|
if (!htsPoolRead.pool || !htsPoolWrite.pool) {
|
|
|
|
|
|
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
|
|
|
|
|
sam_close(nsgv::gInBamFp);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
|
|
|
|
|
|
2025-12-04 22:26:13 +08:00
|
|
|
|
return SerialBQSR();
|
2025-11-23 23:03:37 +08:00
|
|
|
|
|
2025-12-04 22:26:13 +08:00
|
|
|
|
// // 读取known sites vcfs
|
|
|
|
|
|
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
|
|
|
|
|
// spdlog::info(" {}", ks);
|
|
|
|
|
|
// bcf_srs_t* srs = bcf_sr_init();
|
|
|
|
|
|
// if (!bcf_sr_add_reader(srs, ks.c_str()))
|
|
|
|
|
|
// error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum));
|
|
|
|
|
|
// nsgv::gKnownSitesVcfSrs.push_back(srs);
|
|
|
|
|
|
//
|
|
|
|
|
|
// while (bcf_sr_next_line(srs)) {
|
|
|
|
|
|
// bcf1_t* line = srs->readers[0].buffer[0];
|
|
|
|
|
|
// cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl;
|
|
|
|
|
|
// }
|
|
|
|
|
|
// }
|
|
|
|
|
|
//
|
|
|
|
|
|
// /* 先实现串行的bqsr-phase-1 */
|
|
|
|
|
|
|
2025-11-23 23:03:37 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sam_close(nsgv::gInBamFp);
|
|
|
|
|
|
|
|
|
|
|
|
PROF_END(gprof[GP_whole_process], whole_process);
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|