/* Description: bam,bam,bam Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2023/10/23 */ #include #include #include #include #include #include #include "bqsr_args.h" #include "bqsr_funcs.h" #include "bqsr_pipeline.h" #include "dup_metrics.h" #include "fastbqsr_version.h" #include "read_name_parser.h" #include "util/profiling.h" #include "util/utils.h" #define BAM_BLOCK_SIZE 16L * 1024 * 1024 namespace nsgv { std::vector gNameParsers; // read name parser DuplicationMetrics gMetrics; // DupResult gDupRes; PipelineArg gPipe(&gDupRes); BQSRArg gBqsrArg; // samFile *gInBamFp; // bam sam_hdr_t *gInBamHeader; // bam samFile *gOutBamFp; // , sambam sam_hdr_t *gOutBamHeader; // header vector gKnownSitesVcfSrs; // known sites vcf srs }; // namespace nsgv // struct ByteBuf { uint8_t *buf = nullptr; int size = 0; // int capacity = 0; // }; /* * */ static string getFileExtension(const string &filename) { auto last_dot = filename.find_last_of('.'); if (last_dot == string::npos) { return ""; } return filename.substr(last_dot + 1); } // 串行bqsr int SerialBQSR() { int round = 0; BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO); inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM); int64_t readNumSum = 0; while (1) { ++ round; size_t readNum = 0; if (inBamBuf.ReadStat() >= 0) readNum = inBamBuf.ReadBam(); if (readNum < 1) { break; } spdlog::info("{} reads processed in {} round", readNum, round); auto bams = inBamBuf.GetBamArr(); spdlog::info("region: {} - {}", bams[0]->softclip_start(), bams.back()->softclip_end()); // 1. 获取bams数组覆盖的region范围 // 2. 开辟一个uint32_t的数组作为bitmap(如果上一轮的不够就重开),用来表示region的每个位点是否有known sites覆盖(每轮使用前需清零) // 3. 读取在region范围内的所有known sites,并为对应的bitmap设定0 or 1 (作为skip标识) // 4. 遍历bams数组中的每一条记录并进行处理 readNumSum += readNum; inBamBuf.ClearAll(); // } spdlog::info("read count: {}", readNumSum); return 0; } // entrance of mark duplicates int BaseRecalibrator() { PROF_START(whole_process); /* bam */ nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr); if (!nsgv::gInBamFp) { spdlog::error("[{}] load sam/bam file failed.\n", __func__); return -1; } hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header // (libraryId) nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0); /* 并行读取bam数据 */ htsThreadPool htsPoolRead = {NULL, 0}; // , htsThreadPool htsPoolWrite = {NULL, 0}; // htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS); htsPoolWrite.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS); if (!htsPoolRead.pool || !htsPoolWrite.pool) { spdlog::error("[{}] failed to set up thread pool", __LINE__); sam_close(nsgv::gInBamFp); return -1; } hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead); return SerialBQSR(); // // 读取known sites vcfs // for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) { // spdlog::info(" {}", ks); // bcf_srs_t* srs = bcf_sr_init(); // if (!bcf_sr_add_reader(srs, ks.c_str())) // error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum)); // nsgv::gKnownSitesVcfSrs.push_back(srs); // // while (bcf_sr_next_line(srs)) { // bcf1_t* line = srs->readers[0].buffer[0]; // cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl; // } // } // // /* 先实现串行的bqsr-phase-1 */ sam_close(nsgv::gInBamFp); PROF_END(gprof[GP_whole_process], whole_process); return 0; }