2024-12-14 12:24:19 +08:00
|
|
|
|
/*
|
|
|
|
|
|
Description:
|
|
|
|
|
|
标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据
|
|
|
|
|
|
|
|
|
|
|
|
Copyright : All rights reserved by ICT
|
|
|
|
|
|
|
|
|
|
|
|
Author : Zhang Zhonghai
|
|
|
|
|
|
Date : 2023/10/23
|
|
|
|
|
|
*/
|
2024-12-15 03:20:35 +08:00
|
|
|
|
#include <htslib/sam.h>
|
|
|
|
|
|
#include <htslib/thread_pool.h>
|
|
|
|
|
|
#include <spdlog/spdlog.h>
|
2024-12-14 12:24:19 +08:00
|
|
|
|
|
2024-12-15 03:20:35 +08:00
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
|
|
|
|
#include "dup_metrics.h"
|
|
|
|
|
|
#include "md_args.h"
|
|
|
|
|
|
#include "md_funcs.h"
|
|
|
|
|
|
#include "pipeline_md.h"
|
|
|
|
|
|
#include "read_name_parser.h"
|
|
|
|
|
|
#include "util/profiling.h"
|
|
|
|
|
|
#include "version.h"
|
|
|
|
|
|
|
|
|
|
|
|
// I/O block size (16 MiB) handed to htslib via HTS_OPT_BLOCK_SIZE.
// The expansion is parenthesized so the macro stays a single value when it is
// used inside a larger expression (e.g. `total / BAM_BLOCK_SIZE`).
#define BAM_BLOCK_SIZE (16L * 1024 * 1024)
|
|
|
|
|
|
|
|
|
|
|
|
namespace nsgv {
|
|
|
|
|
|
|
|
|
|
|
|
MarkDupsArg gMdArg; // 用来测试性能
|
|
|
|
|
|
std::vector<ReadNameParser> gNameParsers; // 每个线程一个read name parser
|
|
|
|
|
|
samFile *gInBamFp; // 输入的bam文件
|
|
|
|
|
|
sam_hdr_t *gInBamHeader; // 输入的bam文件头信息
|
|
|
|
|
|
samFile *gOutBamFp; // 输出文件, sam或者bam格式
|
|
|
|
|
|
sam_hdr_t *gOutBamHeader; // 输出文件的header
|
|
|
|
|
|
DuplicationMetrics gMetrics; // 统计信息
|
|
|
|
|
|
PipelineArg gPipe;
|
|
|
|
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Returns the extension of a file name, without the leading dot.
 *
 * The extension is everything after the last '.' in the final path component.
 * An empty string is returned when there is no '.' at all, or when the last
 * '.' belongs to a directory name rather than to the file name
 * (e.g. "/data/run.v2/output" has no extension).
 *
 * Only '/' is treated as a path separator.
 *
 * @param filename  File name or path to inspect.
 * @return          The extension (possibly empty), e.g. "bam" for "out.bam".
 */
static std::string getFileExtension(const std::string &filename) {
    const auto last_dot = filename.find_last_of('.');
    if (last_dot == std::string::npos) {
        return "";
    }
    // A dot that sits before the last separator is part of a directory name,
    // so the file itself has no extension.
    const auto last_sep = filename.find_last_of('/');
    if (last_sep != std::string::npos && last_sep > last_dot) {
        return "";
    }
    return filename.substr(last_dot + 1);
}
|
|
|
|
|
|
|
|
|
|
|
|
// entrance of mark duplicates
|
|
|
|
|
|
int MarkDuplicates() {
|
|
|
|
|
|
|
|
|
|
|
|
/* 初始化一些参数和变量*/
|
|
|
|
|
|
nsgv::gNameParsers.resize(nsgv::gMdArg.NUM_THREADS);
|
|
|
|
|
|
for (auto &parser : nsgv::gNameParsers)
|
|
|
|
|
|
parser.SetReadNameRegex(nsgv::gMdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息
|
|
|
|
|
|
|
|
|
|
|
|
/* 打开输入bam文件 */
|
|
|
|
|
|
nsgv::gInBamFp = sam_open_format(nsgv::gMdArg.INPUT_FILE.c_str(), "r", nullptr);
|
|
|
|
|
|
if (!nsgv::gInBamFp) {
|
|
|
|
|
|
spdlog::error("[{}] load sam/bam file failed.\n", __func__);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
|
|
|
|
|
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // 读取header
|
|
|
|
|
|
// 获取样本名称(libraryId)
|
|
|
|
|
|
nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0);
|
|
|
|
|
|
|
|
|
|
|
|
/* 利用线程池对输入输出文件进行读写 */
|
|
|
|
|
|
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
|
|
|
|
|
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
|
|
|
|
|
htsPoolRead.pool = hts_tpool_init(nsgv::gMdArg.NUM_THREADS);
|
|
|
|
|
|
htsPoolWrite.pool = hts_tpool_init(nsgv::gMdArg.NUM_THREADS);
|
|
|
|
|
|
// htsPoolRead.pool = hts_tpool_init(8);
|
|
|
|
|
|
// htsPoolWrite.pool = hts_tpool_init(32);
|
|
|
|
|
|
if (!htsPoolRead.pool || !htsPoolWrite.pool) {
|
|
|
|
|
|
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
|
|
|
|
|
sam_close(nsgv::gInBamFp);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
|
|
|
|
|
|
|
|
|
|
|
// 测试读写速度
|
|
|
|
|
|
#if 1
|
|
|
|
|
|
bam1_t *bamp = bam_init1();
|
|
|
|
|
|
while (sam_read1(nsgv::gInBamFp, nsgv::gInBamHeader, bamp) >= 0);
|
|
|
|
|
|
DisplayProfiling(nsgv::gMdArg.NUM_THREADS);
|
|
|
|
|
|
exit(0);
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|