2023-10-23 23:07:00 +08:00
|
|
|
|
/*
|
2023-11-09 21:07:58 +08:00
|
|
|
|
Description: 标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据
|
2023-10-23 23:07:00 +08:00
|
|
|
|
|
|
|
|
|
|
Copyright : All rights reserved by ICT
|
|
|
|
|
|
|
|
|
|
|
|
Author : Zhang Zhonghai
|
|
|
|
|
|
Date : 2023/10/23
|
|
|
|
|
|
*/
|
2023-11-01 10:48:02 +08:00
|
|
|
|
#include "markdups_arg.h"
|
2023-11-09 21:07:58 +08:00
|
|
|
|
// 有太多define冲突,放到最后include
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
|
|
|
|
|
#include <common/hts/bam_buf.h>
|
2023-11-06 12:38:30 +08:00
|
|
|
|
#include <common/utils/global_arg.h>
|
|
|
|
|
|
#include <common/utils/thpool.h>
|
|
|
|
|
|
#include <common/utils/timer.h>
|
|
|
|
|
|
#include <common/utils/util.h>
|
2023-11-09 21:07:58 +08:00
|
|
|
|
#include <common/utils/murmur3.h>
|
2023-11-06 12:38:30 +08:00
|
|
|
|
#include <common/utils/yarn.h>
|
2023-11-09 21:07:58 +08:00
|
|
|
|
#include <sam/utils/read_ends.h>
|
|
|
|
|
|
#include <sam/utils/read_name_parser.h>
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
|
|
|
|
|
#include <htslib/sam.h>
|
2023-11-09 21:07:58 +08:00
|
|
|
|
#include <htslib/thread_pool.h>
|
2023-10-23 23:07:00 +08:00
|
|
|
|
|
|
|
|
|
|
#include <iostream>
|
2023-11-06 12:38:30 +08:00
|
|
|
|
#include <vector>
|
|
|
|
|
|
#include <set>
|
|
|
|
|
|
#include <queue>
|
2023-11-09 21:07:58 +08:00
|
|
|
|
#include <unordered_map>
|
2023-11-28 10:45:40 +08:00
|
|
|
|
#include <unordered_set>
|
2023-10-23 23:07:00 +08:00
|
|
|
|
|
|
|
|
|
|
using namespace std;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
using std::cout;
|
|
|
|
|
|
|
2023-11-28 10:45:40 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
// Two-letter tag string "PG".
#define SMA_TAG_PG "PG"

// Block size passed to htslib via HTS_OPT_BLOCK_SIZE (2 MiB).
// Parenthesized so the expansion stays a single operand inside larger
// expressions such as `x % BAM_BLOCK_SIZE` or `x / BAM_BLOCK_SIZE`.
#define BAM_BLOCK_SIZE (2 * 1024 * 1024)

// Sentinel index value (INT64_MAX).
#define NO_SUCH_INDEX INT64_MAX
|
|
|
|
|
|
|
2023-12-08 03:57:30 +08:00
|
|
|
|
static Timer tm_arr[20]; // 用来测试性能
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 全局本地变量 */
|
|
|
|
|
|
static vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
|
|
|
|
|
static samFile *g_inBamFp; // 输入的bam文件
|
|
|
|
|
|
static sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
|
|
|
|
|
|
static samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式
|
|
|
|
|
|
static sam_hdr_t *g_outBamHeader; // 输出文件的header
|
|
|
|
|
|
|
|
|
|
|
|
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
|
|
|
|
|
|
static GlobalArg &g_gArg = GlobalArg::Instance();
|
|
|
|
|
|
static MarkDupsArg g_mdArg;
|
|
|
|
|
|
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-28 10:45:40 +08:00
|
|
|
|
#include "md_funcs.h"
|
|
|
|
|
|
#include "serial_md.h"
|
|
|
|
|
|
#include "parallel_md.h"
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
|
|
|
|
|
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
|
* mark duplicate 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
|
|
|
|
|
*/
|
|
|
|
|
|
int MarkDuplicates(int argc, char *argv[])
|
|
|
|
|
|
{
|
|
|
|
|
|
Timer::log_time("程序开始");
|
|
|
|
|
|
Timer time_all;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
|
|
|
|
|
/* 读取命令行参数 */
|
|
|
|
|
|
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
|
|
|
|
|
|
if (g_gArg.num_threads < 1) // 线程数不能小于1
|
|
|
|
|
|
g_gArg.num_threads = 1;
|
|
|
|
|
|
|
|
|
|
|
|
/* 初始化一些参数和变量*/
|
|
|
|
|
|
g_vRnParser.resize(g_gArg.num_threads);
|
|
|
|
|
|
for (auto &parser : g_vRnParser)
|
|
|
|
|
|
parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息
|
|
|
|
|
|
|
|
|
|
|
|
/* 打开输入bam文件 */
|
2023-11-28 10:45:40 +08:00
|
|
|
|
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
|
|
|
|
|
if (!g_inBamFp)
|
2023-11-09 21:07:58 +08:00
|
|
|
|
{
|
|
|
|
|
|
Error("[%s] load sam/bam file failed.\n", __func__);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
2023-11-28 10:45:40 +08:00
|
|
|
|
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
|
|
|
|
|
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
|
|
|
|
|
/* 利用线程池对输入输出文件进行读写 */
|
|
|
|
|
|
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
|
|
|
|
|
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
2023-11-28 10:45:40 +08:00
|
|
|
|
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
|
|
|
|
|
htsPoolRead.pool = hts_tpool_init(16);
|
2023-12-08 03:57:30 +08:00
|
|
|
|
// htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
|
|
|
|
|
htsPoolWrite.pool = hts_tpool_init(16);
|
2023-11-09 21:07:58 +08:00
|
|
|
|
if (!htsPoolRead.pool || !htsPoolWrite.pool)
|
|
|
|
|
|
{
|
|
|
|
|
|
Error("[%d] failed to set up thread pool", __LINE__);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
sam_close(g_inBamFp);
|
2023-11-09 21:07:58 +08:00
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
2023-11-28 10:45:40 +08:00
|
|
|
|
hts_set_opt(g_inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
|
|
|
|
|
/* 初始化输出文件 */
|
|
|
|
|
|
char modeout[12] = "wb";
|
|
|
|
|
|
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
|
|
|
|
|
|
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
g_outBamHeader = sam_hdr_dup(g_inBamHeader);
|
2023-11-09 21:07:58 +08:00
|
|
|
|
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
|
|
|
|
|
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
|
|
|
|
|
|
|
|
|
|
|
|
|
2023-11-28 10:45:40 +08:00
|
|
|
|
/* 冗余检查和标记 */
|
|
|
|
|
|
if (g_gArg.num_threads == 1)
|
2023-11-09 21:07:58 +08:00
|
|
|
|
{
|
2023-11-28 10:45:40 +08:00
|
|
|
|
serialMarkDups(); // 串行运行
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
parallelMarkDups(); // 并行运行
|
|
|
|
|
|
}
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-28 10:45:40 +08:00
|
|
|
|
/* 标记冗余, 将处理后的结果写入文件 */
|
|
|
|
|
|
sam_close(g_inBamFp); // 重新打开bam文件
|
|
|
|
|
|
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
|
|
|
|
|
if (!g_inBamFp)
|
|
|
|
|
|
{
|
|
|
|
|
|
Error("[%s] load sam/bam file failed.\n", __func__);
|
|
|
|
|
|
return -1;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
}
|
2023-11-28 10:45:40 +08:00
|
|
|
|
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
|
|
|
|
|
hts_set_opt(g_inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
|
|
|
|
|
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
|
|
|
|
|
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
|
2023-11-09 21:07:58 +08:00
|
|
|
|
{
|
2023-11-28 10:45:40 +08:00
|
|
|
|
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
|
|
|
|
|
sam_close(g_outBamFp);
|
|
|
|
|
|
sam_close(g_inBamFp);
|
|
|
|
|
|
return -1;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
}
|
2023-11-28 10:45:40 +08:00
|
|
|
|
// 输出index文件
|
|
|
|
|
|
string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
|
|
|
|
|
|
if (sam_idx_init(g_outBamFp, g_outBamHeader, 14 /*csi*/, indexFn.c_str()) < 0)
|
2023-11-09 21:07:58 +08:00
|
|
|
|
{
|
2023-11-28 10:45:40 +08:00
|
|
|
|
Error("failed to open index \"%s\" for writing", indexFn.c_str());
|
|
|
|
|
|
sam_close(g_outBamFp);
|
|
|
|
|
|
sam_close(g_inBamFp);
|
|
|
|
|
|
return -1;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
}
|
2023-11-28 10:45:40 +08:00
|
|
|
|
// 读取输入文件
|
|
|
|
|
|
// BamBufType inBuf(false); // inBuf(g_gArg.use_asyncio);
|
|
|
|
|
|
BamBufType inBuf(g_gArg.use_asyncio);
|
|
|
|
|
|
inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
2023-12-08 03:57:30 +08:00
|
|
|
|
Timer tw;
|
|
|
|
|
|
while (inBuf.ReadStat() >= 0)
|
|
|
|
|
|
{
|
|
|
|
|
|
Timer tw1;
|
|
|
|
|
|
size_t readNum = inBuf.ReadBam();
|
|
|
|
|
|
cout << "read: " << readNum << endl;
|
|
|
|
|
|
for (size_t i = 0; i < inBuf.Size(); ++i)
|
|
|
|
|
|
{
|
|
|
|
|
|
/* 判断是否冗余 */
|
|
|
|
|
|
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0)
|
|
|
|
|
|
{
|
|
|
|
|
|
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
|
|
|
|
|
sam_close(g_outBamFp);
|
|
|
|
|
|
sam_close(g_inBamFp);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
inBuf.ClearAll();
|
|
|
|
|
|
cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
|
|
|
|
|
|
}
|
|
|
|
|
|
if (sam_idx_save(g_outBamFp) < 0)
|
|
|
|
|
|
{
|
|
|
|
|
|
Error("writing index failed");
|
|
|
|
|
|
sam_close(g_outBamFp);
|
|
|
|
|
|
sam_close(g_inBamFp);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
cout << "write time: " << tw.seconds_elapsed() << " s" << endl;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
|
|
|
|
|
/* 关闭文件,收尾清理 */
|
|
|
|
|
|
sam_close(g_outBamFp);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
sam_close(g_inBamFp);
|
2023-10-23 23:07:00 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
cout << " 总时间: " << time_all.seconds_elapsed() << endl;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
2023-11-06 12:38:30 +08:00
|
|
|
|
Timer::log_time("程序结束");
|
2023-10-23 23:07:00 +08:00
|
|
|
|
return 0;
|
|
|
|
|
|
}
|