336 lines
14 KiB
C++
336 lines
14 KiB
C++
// 一. 读取并解析recalibrate tables
|
||
|
||
// 二. 循环处理每一个read
|
||
|
||
// 1. 计算read的协变量covs
|
||
|
||
// 2. 根据read的协变量信息从recal tables获取对应的数据
|
||
|
||
// 3. 根据recal tables数据对read的每个碱基分别计算新的质量分数
|
||
|
||
// 4. 将计算后的质量分数赋值给read
|
||
|
||
#include <header.h> // in htslib
|
||
#include <htslib/sam.h>
|
||
#include <htslib/thread_pool.h>
|
||
|
||
#include "aux_arg.h"
|
||
#include "common_data.h"
|
||
#include "util/debug.h"
|
||
#include "util/profiling.h"
|
||
#include "recal_utils.h"
|
||
#include "recal_funcs.h"
|
||
#include "read_utils.h"
|
||
|
||
using std::vector;
|
||
|
||
namespace nsgv {
|
||
// 全局变量 for apply bqsr
|
||
samFile* gOutBamFp; // 输出文件, sam或者bam格式
|
||
sam_hdr_t* gOutBamHeader; // 输出文件的header
|
||
RecalTables gRecalTables; // 保留一个全局的recalibrate tables就行了
|
||
vector<uint8_t> gQuantizedQuals; // 读取quantized info table信息得到的,第三列数据
|
||
StableArray<bam1_t*> gRecalBams[2]; // 保存已经校正过质量分数的bam数据,双buffer
|
||
}; // namespace nsgv
|
||
|
||
// 串行apply bqsr
|
||
int SerialApplyBQSR(AuxVar &aux) {
|
||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::applyBqsrReadFilterOut);
|
||
int64_t readNumSum = 0;
|
||
int round = 0;
|
||
|
||
PerReadCovariateMatrix& readCovariates = aux.readCovariates;
|
||
// 一. 读取并解析recalibrate tables
|
||
auto& recalTables = *aux.bqsrTable;
|
||
|
||
// 全局的校正后的bam数组
|
||
auto& recalBams = nsgv::gRecalBams[0];
|
||
|
||
auto& sd = aux.sd;
|
||
|
||
while (true) {
|
||
++round;
|
||
// 一. 读取bam数据
|
||
size_t readNum = 0;
|
||
PROF_START(GP_read);
|
||
if (inBamBuf.ReadStat() >= 0)
|
||
readNum = inBamBuf.ReadBam();
|
||
PROF_GP_END(GP_read);
|
||
if (readNum < 1) {
|
||
break;
|
||
}
|
||
auto bams = inBamBuf.GetBamArr();
|
||
spdlog::info("{} reads processed in {} round", readNum, round);
|
||
|
||
if (recalBams.size() < bams.size()) {
|
||
int start = recalBams.size();
|
||
recalBams.resize(bams.size());
|
||
for (int i = start; i < recalBams.size(); ++i) {
|
||
recalBams[i] = bam_init1();
|
||
}
|
||
}
|
||
|
||
// 二. 遍历每个bam(read)记录,进行处理
|
||
for (int i = 0; i < bams.size(); ++i) {
|
||
if (bam_copy1(recalBams[i], bams[i]->b) == NULL) {
|
||
spdlog::error("Copy bam error");
|
||
exit(1);
|
||
}
|
||
BamWrap* bw = bams[i];
|
||
bw->b = recalBams[i]; // 注意这里的赋值,然后就可以对b进行更改了
|
||
sd.init();
|
||
sd.parseForApplyBQSR(bw);
|
||
sd.rid = i + aux.processedReads;
|
||
|
||
// 1. 是否使用original quality来代替当前的quality
|
||
if (nsgv::gBqsrArg.useOriginalBaseQualities)
|
||
ReadUtils::resetOriginalBaseQualities(bw->b);
|
||
|
||
// 2. 是否将当前的quality保存在tag OQ中
|
||
if (nsgv::gBqsrArg.emitOriginalQuals)
|
||
ReadUtils::setOriginalBaseQualsIfNoOQ(bw->b);
|
||
|
||
// 3. 计算read的协变量covs
|
||
PROF_START(GP_covariate);
|
||
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0);
|
||
PROF_GP_END(GP_covariate);
|
||
|
||
// clear indel qualities
|
||
ReadUtils::removeAttribute(bw->b, "BI");
|
||
ReadUtils::removeAttribute(bw->b, "BD");
|
||
|
||
// 4. 检查read的readGroup tag是否包含在bqsr table里
|
||
const char* readGroupId = ReadUtils::getReadGroupId(bw->b);
|
||
auto& covaritesForRead = readCovariates[EventType::BASE_SUBSTITUTION.index];
|
||
uint8_t* recalibratedQuals = bam_get_qual(bw->b);
|
||
auto& preUpdateQuals = sd.base_quals;
|
||
int rgKey = -1;
|
||
if (ReadGroupCovariate::RgToId.find(std::string(readGroupId)) != ReadGroupCovariate::RgToId.end())
|
||
rgKey = ReadGroupCovariate::RgToId[std::string(readGroupId)];
|
||
if (rgKey == -1) {
|
||
if (nsgv::gBqsrArg.allowMissingReadGroups) {
|
||
// Given the way the recalibration code is implemented below, we cannot recalibrate a read with a
|
||
// read group that's not in the recal table.
|
||
for (int i = 0; i < sd.read_len; i++) {
|
||
//recalibratedQuals[i] = staticQuantizedMapping != null ? staticQuantizedMapping[preUpdateQuals[i]] : quantizedQuals.get(preUpdateQuals[i]);
|
||
recalibratedQuals[i] = nsgv::gQuantizedQuals[preUpdateQuals[i]];
|
||
}
|
||
} else {
|
||
spdlog::error(
|
||
"Read group {} not found in the recalibration table. Use \"--allow-missing-read-group\" command line argument to ignore this "
|
||
"error.",
|
||
readGroupId);
|
||
exit(1);
|
||
}
|
||
}
|
||
|
||
// 5. 根据recal tables数据对read的每个碱基分别计算新的质量分数
|
||
auto& readGroupDatum = recalTables.readGroupTable(EventType::BASE_SUBSTITUTION.index, rgKey);
|
||
// Note: this loop is under very heavy use in applyBQSR. Keep it slim.
|
||
for (int offset = 0; offset < sd.read_len; offset++) { // recalibrate all bases in the read
|
||
// only recalibrate usable qualities (default: >= 6) (the original quality will come from the instrument -- reported quality)
|
||
if (recalibratedQuals[offset] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN) {
|
||
continue;
|
||
}
|
||
auto& covs = readCovariates[EventType::BASE_SUBSTITUTION.index][offset];
|
||
// 根据read的协变量数据获取对应的bqsr数据
|
||
auto& qualityScoreDatum = recalTables.qualityScoreTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality);
|
||
auto& contextDatum = recalTables.contextTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.context);
|
||
auto& cycleDatum = recalTables.cycleTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.cycle);
|
||
// 计算校正后的质量分数
|
||
double priorQualityScore =
|
||
nsgv::gBqsrArg.globalQScorePrior > 0.0 ? nsgv::gBqsrArg.globalQScorePrior : readGroupDatum.getReportedQuality();
|
||
double rawRecalibratedQualityScore =
|
||
RecalFuncs::hierarchicalBayesianQualityEstimate(priorQualityScore, readGroupDatum, qualityScoreDatum, contextDatum, cycleDatum);
|
||
uint8_t qualIdx = RecalFuncs::getBoundedIntegerQual(rawRecalibratedQualityScore);
|
||
uint8_t quantizedQualityScore = qualIdx; // nsgv::gQuantizedQuals.at(qualIdx);
|
||
// TODO: as written the code quantizes *twice* if the static binning is enabled (first time to the dynamic bin). It should be
|
||
// quantized once.
|
||
// recalibratedQuals[offset] = staticQuantizedMapping == null ? quantizedQualityScore : staticQuantizedMapping[quantizedQualityScore];
|
||
recalibratedQuals[offset] = quantizedQualityScore;
|
||
}
|
||
|
||
if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) {
|
||
spdlog::error("failed writing sam record to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE.c_str());
|
||
sam_close(nsgv::gInBamFp);
|
||
sam_close(nsgv::gOutBamFp);
|
||
exit(1);
|
||
}
|
||
//break;
|
||
}
|
||
// break;
|
||
|
||
readNumSum += readNum;
|
||
AuxVar::processedReads += readNum;
|
||
inBamBuf.ClearAll(); // 清空bam buf
|
||
}
|
||
|
||
spdlog::info("read count: {}", readNumSum);
|
||
|
||
return 0;
|
||
}
|
||
|
||
// 并行 apply bqsr
|
||
int ParallelApplyBQSR(vector<AuxVar> &auxArr) {
|
||
|
||
return 0;
|
||
}
|
||
|
||
// 在进行数据处理之前,初始化一些全局数据
|
||
static void globalInit() {
|
||
open_debug_files();
|
||
/* bam */
|
||
nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr);
|
||
if (!nsgv::gInBamFp) {
|
||
spdlog::error("[{}] load sam/bam file failed.\n", __func__);
|
||
exit(1);
|
||
}
|
||
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header
|
||
|
||
/* 并行读取bam数据 */
|
||
htsThreadPool htsPoolRead = {NULL, 0}; //
|
||
int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD);
|
||
htsPoolRead.pool = hts_tpool_init(readThreadNum);
|
||
if (!htsPoolRead.pool) {
|
||
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
||
sam_close(nsgv::gInBamFp);
|
||
exit(1);
|
||
}
|
||
hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
||
|
||
if (!nsgv::gInBamHeader->hrecs) {
|
||
if (sam_hdr_fill_hrecs(nsgv::gInBamHeader) != 0) {
|
||
spdlog::error("[{}] failed to read sam header", __LINE__);
|
||
sam_close(nsgv::gInBamFp);
|
||
exit(1);
|
||
}
|
||
}
|
||
/* 并行写入bam文件 */
|
||
char modeout[12] = "wb";
|
||
sam_open_mode(modeout + 1, nsgv::gBqsrArg.OUTPUT_FILE.c_str(), NULL);
|
||
nsgv::gOutBamFp = sam_open(nsgv::gBqsrArg.OUTPUT_FILE.c_str(), modeout);
|
||
if (!nsgv::gOutBamFp) {
|
||
spdlog::error("[{}] create output sam/bam file failed.\n", __func__);
|
||
exit(1);
|
||
}
|
||
nsgv::gOutBamHeader = sam_hdr_dup(nsgv::gInBamHeader);
|
||
// 修改输出文件的header
|
||
sam_hdr_add_line(nsgv::gOutBamHeader, "PG", "ID", PROGRAM_NAME, "VN", FASTBQSR_VERSION, "CL", nsgv::gBqsrArg.CLI_STR.c_str(), NULL);
|
||
// 用不同的线程数量处理输出文件
|
||
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
||
int writeThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_WRITE_MAX_THREAD);
|
||
htsPoolWrite.pool = hts_tpool_init(writeThreadNum);
|
||
if (!htsPoolWrite.pool) {
|
||
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
||
sam_close(nsgv::gInBamFp);
|
||
exit(1);
|
||
}
|
||
hts_set_opt(nsgv::gOutBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||
hts_set_opt(nsgv::gOutBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
||
|
||
if (sam_hdr_write(nsgv::gOutBamFp, nsgv::gOutBamHeader) != 0) {
|
||
spdlog::error("failed writing header to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE);
|
||
sam_close(nsgv::gOutBamFp);
|
||
sam_close(nsgv::gInBamFp);
|
||
exit(1);
|
||
}
|
||
|
||
// 输出index文件
|
||
nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".bai"; // min_shift = 0 是bai格式
|
||
if ("sam" == Utils::getFileExtension(nsgv::gBqsrArg.OUTPUT_FILE) || !nsgv::gBqsrArg.CREATE_INDEX) {
|
||
nsgv::gBqsrArg.INDEX_FILE = "";
|
||
}
|
||
if (!nsgv::gBqsrArg.INDEX_FILE.empty()) {
|
||
int index_min_shift = 0;
|
||
if (nsgv::gBqsrArg.INDEX_FORMAT == nsbqsr::IndexFormat::CSI) {
|
||
nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".csi";
|
||
index_min_shift = 14;
|
||
}
|
||
if (sam_idx_init(nsgv::gOutBamFp, nsgv::gOutBamHeader, 0 /*csi 14*/, nsgv::gBqsrArg.INDEX_FILE.c_str()) < 0) {
|
||
spdlog::error("failed to open index \"{}\" for writing", nsgv::gBqsrArg.INDEX_FILE);
|
||
sam_close(nsgv::gOutBamFp);
|
||
sam_close(nsgv::gInBamFp);
|
||
exit(1);
|
||
}
|
||
}
|
||
|
||
// 1. 协变量数据相关初始化
|
||
ContextCovariate::InitContextCovariate(nsgv::gBqsrArg);
|
||
CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg);
|
||
|
||
// 注意初始化顺序,这个必须在协变量初始化之后,因为需要用到MaximumKeyValue
|
||
// nsgv::gRecalTables.init(nsgv::gInBamHeader->hrecs->nrg);
|
||
|
||
// 初始化AuxVar
|
||
nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS);
|
||
for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) {
|
||
nsgv::gAuxVars[i].header = nsgv::gInBamHeader;
|
||
nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str());
|
||
if (nsgv::gAuxVars[i].faidx == 0)
|
||
error("[%s] fail to load the fasta index.\n", __func__);
|
||
// 注意初始化顺序,这个必须在协变量初始化之后,因为需要用到MaximumKeyValue
|
||
nsgv::gAuxVars[i].bqsrTable = &nsgv::gRecalTables;
|
||
CovariateUtils::InitPerReadCovMat(nsgv::gAuxVars[i].readCovariates);
|
||
}
|
||
|
||
// 0. 初始化一些全局数据
|
||
RecalDatum::StaticInit();
|
||
QualityUtils::StaticInit();
|
||
MathUtils::StaticInit();
|
||
BaseUtils::StaticInit();
|
||
|
||
// 初始化需要计算的event types
|
||
RecalUtils::initEventTypes(nsgv::gBqsrArg.computeIndelBQSRTables);
|
||
|
||
// 2. 读取bam的read group
|
||
// if (nsgv::gInBamHeader->hrecs->nrg == 0) {
|
||
// spdlog::error("No RG tag found in the header!");
|
||
// exit(1);
|
||
// }
|
||
|
||
/*
|
||
// 应该是从bqsr table里读取read group信息
|
||
for (int i = 0; i < nsgv::gInBamHeader->hrecs->nrg; ++i) {
|
||
// spdlog::info("rg: {}", nsgv::gInBamHeader->hrecs->rg[i].name);
|
||
ReadGroupCovariate::RgToId[nsgv::gInBamHeader->hrecs->rg[i].name] = i;
|
||
ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
|
||
}
|
||
*/
|
||
|
||
// 读取并解析recalibrate tables
|
||
RecalUtils::readRecalTables(nsgv::gBqsrArg.BQSR_FILE, nsgv::gBqsrArg, nsgv::gQuantizedQuals, nsgv::gRecalTables);
|
||
}
|
||
|
||
// 全局资源释放
|
||
static void globalDestroy() {
|
||
close_debug_files();
|
||
|
||
if (!nsgv::gBqsrArg.INDEX_FILE.empty() && sam_idx_save(nsgv::gOutBamFp) < 0) {
|
||
spdlog::error("writing index failed");
|
||
sam_close(nsgv::gOutBamFp);
|
||
sam_close(nsgv::gInBamFp);
|
||
exit(1);
|
||
}
|
||
|
||
/* 关闭文件,收尾清理 */
|
||
sam_close(nsgv::gOutBamFp);
|
||
sam_close(nsgv::gInBamFp);
|
||
}
|
||
|
||
// entrance of BQSR phase-1
|
||
int ApplyBQSR() {
|
||
int ret = 0;
|
||
|
||
PROF_START(GP_whole_process);
|
||
globalInit();
|
||
// if (nsgv::gBqsrArg.NUM_THREADS == 1)
|
||
ret = SerialApplyBQSR(nsgv::gAuxVars[0]); // 串行处理数据,生成recal bams
|
||
// else
|
||
// ret = ParallelApplyBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal bams
|
||
globalDestroy();
|
||
PROF_GP_END(GP_whole_process);
|
||
|
||
return ret;
|
||
} |