实现了apply bqsr的单线程版本,结果还有点错误,继续调试
This commit is contained in:
parent
985875ebac
commit
94e06338cd
|
|
@ -6,5 +6,5 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
# set(CMAKE_BUILD_TYPE Debug)
|
# set(CMAKE_BUILD_TYPE Debug)
|
||||||
# set(CMAKE_BUILD_TYPE Release)
|
# set(CMAKE_BUILD_TYPE Release)
|
||||||
add_definitions(-DSHOW_PERF)
|
add_definitions(-DSHOW_PERF)
|
||||||
#add_definitions(-DSHOW_PERF=1)
|
# add_definitions(-DSHOW_PERF=1)
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
|
|
||||||
|
|
@ -2,4 +2,6 @@
|
||||||
|
|
||||||
对GATK bqsr部分进行并行优化
|
对GATK bqsr部分进行并行优化
|
||||||
|
|
||||||
|
对应的GATK版本:The Genome Analysis Toolkit (GATK) v4.6.2.0-17-g2a1f41b-SNAPSHOT
|
||||||
|
|
||||||
Implement the same functions as GATK for BaseRecalibrator and ApplyBQSR
|
Implement the same functions as GATK for BaseRecalibrator and ApplyBQSR
|
||||||
|
|
@ -0,0 +1,336 @@
|
||||||
|
// 一. 读取并解析recalibrate tables
|
||||||
|
|
||||||
|
// 二. 循环处理每一个read
|
||||||
|
|
||||||
|
// 1. 计算read的协变量covs
|
||||||
|
|
||||||
|
// 2. 根据read的协变量信息从recal tables获取对应的数据
|
||||||
|
|
||||||
|
// 3. 根据recal tables数据对read的每个碱基分别计算新的质量分数
|
||||||
|
|
||||||
|
// 4. 将计算后的质量分数赋值给read
|
||||||
|
|
||||||
|
#include <header.h> // in htslib
|
||||||
|
#include <htslib/sam.h>
|
||||||
|
#include <htslib/thread_pool.h>
|
||||||
|
|
||||||
|
#include "aux_arg.h"
|
||||||
|
#include "common_data.h"
|
||||||
|
#include "util/debug.h"
|
||||||
|
#include "util/profiling.h"
|
||||||
|
#include "recal_utils.h"
|
||||||
|
#include "recal_funcs.h"
|
||||||
|
#include "read_utils.h"
|
||||||
|
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
namespace nsgv {
|
||||||
|
// 全局变量 for apply bqsr
|
||||||
|
samFile* gOutBamFp; // 输出文件, sam或者bam格式
|
||||||
|
sam_hdr_t* gOutBamHeader; // 输出文件的header
|
||||||
|
RecalTables gRecalTables; // 保留一个全局的recalibrate tables就行了
|
||||||
|
vector<uint8_t> gQuantizedQuals; // 读取quantized info table信息得到的,第三列数据
|
||||||
|
StableArray<bam1_t*> gRecalBams[2]; // 保存已经校正过质量分数的bam数据,双buffer
|
||||||
|
}; // namespace nsgv
|
||||||
|
|
||||||
|
// 串行apply bqsr
|
||||||
|
int SerialApplyBQSR(AuxVar &aux) {
|
||||||
|
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||||
|
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::applyBqsrReadFilterOut);
|
||||||
|
int64_t readNumSum = 0;
|
||||||
|
int round = 0;
|
||||||
|
|
||||||
|
PerReadCovariateMatrix& readCovariates = aux.readCovariates;
|
||||||
|
// 一. 读取并解析recalibrate tables
|
||||||
|
auto& recalTables = *aux.bqsrTable;
|
||||||
|
|
||||||
|
// 全局的校正后的bam数组
|
||||||
|
auto& recalBams = nsgv::gRecalBams[0];
|
||||||
|
|
||||||
|
auto& sd = aux.sd;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
++round;
|
||||||
|
// 一. 读取bam数据
|
||||||
|
size_t readNum = 0;
|
||||||
|
PROF_START(GP_read);
|
||||||
|
if (inBamBuf.ReadStat() >= 0)
|
||||||
|
readNum = inBamBuf.ReadBam();
|
||||||
|
PROF_GP_END(GP_read);
|
||||||
|
if (readNum < 1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto bams = inBamBuf.GetBamArr();
|
||||||
|
spdlog::info("{} reads processed in {} round", readNum, round);
|
||||||
|
|
||||||
|
if (recalBams.size() < bams.size()) {
|
||||||
|
int start = recalBams.size();
|
||||||
|
recalBams.resize(bams.size());
|
||||||
|
for (int i = start; i < recalBams.size(); ++i) {
|
||||||
|
recalBams[i] = bam_init1();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 二. 遍历每个bam(read)记录,进行处理
|
||||||
|
for (int i = 0; i < bams.size(); ++i) {
|
||||||
|
if (bam_copy1(recalBams[i], bams[i]->b) == NULL) {
|
||||||
|
spdlog::error("Copy bam error");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
BamWrap* bw = bams[i];
|
||||||
|
bw->b = recalBams[i]; // 注意这里的赋值,然后就可以对b进行更改了
|
||||||
|
sd.init();
|
||||||
|
sd.parseForApplyBQSR(bw);
|
||||||
|
sd.rid = i + aux.processedReads;
|
||||||
|
|
||||||
|
// 1. 是否使用original quality来代替当前的quality
|
||||||
|
if (nsgv::gBqsrArg.useOriginalBaseQualities)
|
||||||
|
ReadUtils::resetOriginalBaseQualities(bw->b);
|
||||||
|
|
||||||
|
// 2. 是否将当前的quality保存在tag OQ中
|
||||||
|
if (nsgv::gBqsrArg.emitOriginalQuals)
|
||||||
|
ReadUtils::setOriginalBaseQualsIfNoOQ(bw->b);
|
||||||
|
|
||||||
|
// 3. 计算read的协变量covs
|
||||||
|
PROF_START(GP_covariate);
|
||||||
|
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0);
|
||||||
|
PROF_GP_END(GP_covariate);
|
||||||
|
|
||||||
|
// clear indel qualities
|
||||||
|
ReadUtils::removeAttribute(bw->b, "BI");
|
||||||
|
ReadUtils::removeAttribute(bw->b, "BD");
|
||||||
|
|
||||||
|
// 4. 检查read的readGroup tag是否包含在bqsr table里
|
||||||
|
const char* readGroupId = ReadUtils::getReadGroupId(bw->b);
|
||||||
|
auto& covaritesForRead = readCovariates[EventType::BASE_SUBSTITUTION.index];
|
||||||
|
uint8_t* recalibratedQuals = bam_get_qual(bw->b);
|
||||||
|
auto& preUpdateQuals = sd.base_quals;
|
||||||
|
int rgKey = -1;
|
||||||
|
if (ReadGroupCovariate::RgToId.find(std::string(readGroupId)) != ReadGroupCovariate::RgToId.end())
|
||||||
|
rgKey = ReadGroupCovariate::RgToId[std::string(readGroupId)];
|
||||||
|
if (rgKey == -1) {
|
||||||
|
if (nsgv::gBqsrArg.allowMissingReadGroups) {
|
||||||
|
// Given the way the recalibration code is implemented below, we cannot recalibrate a read with a
|
||||||
|
// read group that's not in the recal table.
|
||||||
|
for (int i = 0; i < sd.read_len; i++) {
|
||||||
|
//recalibratedQuals[i] = staticQuantizedMapping != null ? staticQuantizedMapping[preUpdateQuals[i]] : quantizedQuals.get(preUpdateQuals[i]);
|
||||||
|
recalibratedQuals[i] = nsgv::gQuantizedQuals[preUpdateQuals[i]];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
spdlog::error(
|
||||||
|
"Read group {} not found in the recalibration table. Use \"--allow-missing-read-group\" command line argument to ignore this "
|
||||||
|
"error.",
|
||||||
|
readGroupId);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5. 根据recal tables数据对read的每个碱基分别计算新的质量分数
|
||||||
|
auto& readGroupDatum = recalTables.readGroupTable(EventType::BASE_SUBSTITUTION.index, rgKey);
|
||||||
|
// Note: this loop is under very heavy use in applyBQSR. Keep it slim.
|
||||||
|
for (int offset = 0; offset < sd.read_len; offset++) { // recalibrate all bases in the read
|
||||||
|
// only recalibrate usable qualities (default: >= 6) (the original quality will come from the instrument -- reported quality)
|
||||||
|
if (recalibratedQuals[offset] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto& covs = readCovariates[EventType::BASE_SUBSTITUTION.index][offset];
|
||||||
|
// 根据read的协变量数据获取对应的bqsr数据
|
||||||
|
auto& qualityScoreDatum = recalTables.qualityScoreTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality);
|
||||||
|
auto& contextDatum = recalTables.contextTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.context);
|
||||||
|
auto& cycleDatum = recalTables.cycleTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.cycle);
|
||||||
|
// 计算校正后的质量分数
|
||||||
|
double priorQualityScore =
|
||||||
|
nsgv::gBqsrArg.globalQScorePrior > 0.0 ? nsgv::gBqsrArg.globalQScorePrior : readGroupDatum.getReportedQuality();
|
||||||
|
double rawRecalibratedQualityScore =
|
||||||
|
RecalFuncs::hierarchicalBayesianQualityEstimate(priorQualityScore, readGroupDatum, qualityScoreDatum, contextDatum, cycleDatum);
|
||||||
|
uint8_t qualIdx = RecalFuncs::getBoundedIntegerQual(rawRecalibratedQualityScore);
|
||||||
|
uint8_t quantizedQualityScore = qualIdx; // nsgv::gQuantizedQuals.at(qualIdx);
|
||||||
|
// TODO: as written the code quantizes *twice* if the static binning is enabled (first time to the dynamic bin). It should be
|
||||||
|
// quantized once.
|
||||||
|
// recalibratedQuals[offset] = staticQuantizedMapping == null ? quantizedQualityScore : staticQuantizedMapping[quantizedQualityScore];
|
||||||
|
recalibratedQuals[offset] = quantizedQualityScore;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) {
|
||||||
|
spdlog::error("failed writing sam record to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE.c_str());
|
||||||
|
sam_close(nsgv::gInBamFp);
|
||||||
|
sam_close(nsgv::gOutBamFp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
//break;
|
||||||
|
}
|
||||||
|
// break;
|
||||||
|
|
||||||
|
readNumSum += readNum;
|
||||||
|
AuxVar::processedReads += readNum;
|
||||||
|
inBamBuf.ClearAll(); // 清空bam buf
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("read count: {}", readNumSum);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 并行 apply bqsr
|
||||||
|
int ParallelApplyBQSR(vector<AuxVar> &auxArr) {
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 在进行数据处理之前,初始化一些全局数据
|
||||||
|
static void globalInit() {
|
||||||
|
open_debug_files();
|
||||||
|
/* bam */
|
||||||
|
nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr);
|
||||||
|
if (!nsgv::gInBamFp) {
|
||||||
|
spdlog::error("[{}] load sam/bam file failed.\n", __func__);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||||
|
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header
|
||||||
|
|
||||||
|
/* 并行读取bam数据 */
|
||||||
|
htsThreadPool htsPoolRead = {NULL, 0}; //
|
||||||
|
int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD);
|
||||||
|
htsPoolRead.pool = hts_tpool_init(readThreadNum);
|
||||||
|
if (!htsPoolRead.pool) {
|
||||||
|
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
||||||
|
sam_close(nsgv::gInBamFp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
||||||
|
|
||||||
|
if (!nsgv::gInBamHeader->hrecs) {
|
||||||
|
if (sam_hdr_fill_hrecs(nsgv::gInBamHeader) != 0) {
|
||||||
|
spdlog::error("[{}] failed to read sam header", __LINE__);
|
||||||
|
sam_close(nsgv::gInBamFp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* 并行写入bam文件 */
|
||||||
|
char modeout[12] = "wb";
|
||||||
|
sam_open_mode(modeout + 1, nsgv::gBqsrArg.OUTPUT_FILE.c_str(), NULL);
|
||||||
|
nsgv::gOutBamFp = sam_open(nsgv::gBqsrArg.OUTPUT_FILE.c_str(), modeout);
|
||||||
|
if (!nsgv::gOutBamFp) {
|
||||||
|
spdlog::error("[{}] create output sam/bam file failed.\n", __func__);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
nsgv::gOutBamHeader = sam_hdr_dup(nsgv::gInBamHeader);
|
||||||
|
// 修改输出文件的header
|
||||||
|
sam_hdr_add_line(nsgv::gOutBamHeader, "PG", "ID", PROGRAM_NAME, "VN", FASTBQSR_VERSION, "CL", nsgv::gBqsrArg.CLI_STR.c_str(), NULL);
|
||||||
|
// 用不同的线程数量处理输出文件
|
||||||
|
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
||||||
|
int writeThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_WRITE_MAX_THREAD);
|
||||||
|
htsPoolWrite.pool = hts_tpool_init(writeThreadNum);
|
||||||
|
if (!htsPoolWrite.pool) {
|
||||||
|
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
||||||
|
sam_close(nsgv::gInBamFp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
hts_set_opt(nsgv::gOutBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||||
|
hts_set_opt(nsgv::gOutBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
||||||
|
|
||||||
|
if (sam_hdr_write(nsgv::gOutBamFp, nsgv::gOutBamHeader) != 0) {
|
||||||
|
spdlog::error("failed writing header to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE);
|
||||||
|
sam_close(nsgv::gOutBamFp);
|
||||||
|
sam_close(nsgv::gInBamFp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 输出index文件
|
||||||
|
nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".bai"; // min_shift = 0 是bai格式
|
||||||
|
if ("sam" == Utils::getFileExtension(nsgv::gBqsrArg.OUTPUT_FILE) || !nsgv::gBqsrArg.CREATE_INDEX) {
|
||||||
|
nsgv::gBqsrArg.INDEX_FILE = "";
|
||||||
|
}
|
||||||
|
if (!nsgv::gBqsrArg.INDEX_FILE.empty()) {
|
||||||
|
int index_min_shift = 0;
|
||||||
|
if (nsgv::gBqsrArg.INDEX_FORMAT == nsbqsr::IndexFormat::CSI) {
|
||||||
|
nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".csi";
|
||||||
|
index_min_shift = 14;
|
||||||
|
}
|
||||||
|
if (sam_idx_init(nsgv::gOutBamFp, nsgv::gOutBamHeader, 0 /*csi 14*/, nsgv::gBqsrArg.INDEX_FILE.c_str()) < 0) {
|
||||||
|
spdlog::error("failed to open index \"{}\" for writing", nsgv::gBqsrArg.INDEX_FILE);
|
||||||
|
sam_close(nsgv::gOutBamFp);
|
||||||
|
sam_close(nsgv::gInBamFp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 1. 协变量数据相关初始化
|
||||||
|
ContextCovariate::InitContextCovariate(nsgv::gBqsrArg);
|
||||||
|
CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg);
|
||||||
|
|
||||||
|
// 注意初始化顺序,这个必须在协变量初始化之后,因为需要用到MaximumKeyValue
|
||||||
|
// nsgv::gRecalTables.init(nsgv::gInBamHeader->hrecs->nrg);
|
||||||
|
|
||||||
|
// 初始化AuxVar
|
||||||
|
nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS);
|
||||||
|
for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) {
|
||||||
|
nsgv::gAuxVars[i].header = nsgv::gInBamHeader;
|
||||||
|
nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str());
|
||||||
|
if (nsgv::gAuxVars[i].faidx == 0)
|
||||||
|
error("[%s] fail to load the fasta index.\n", __func__);
|
||||||
|
// 注意初始化顺序,这个必须在协变量初始化之后,因为需要用到MaximumKeyValue
|
||||||
|
nsgv::gAuxVars[i].bqsrTable = &nsgv::gRecalTables;
|
||||||
|
CovariateUtils::InitPerReadCovMat(nsgv::gAuxVars[i].readCovariates);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 0. 初始化一些全局数据
|
||||||
|
RecalDatum::StaticInit();
|
||||||
|
QualityUtils::StaticInit();
|
||||||
|
MathUtils::StaticInit();
|
||||||
|
BaseUtils::StaticInit();
|
||||||
|
|
||||||
|
// 初始化需要计算的event types
|
||||||
|
RecalUtils::initEventTypes(nsgv::gBqsrArg.computeIndelBQSRTables);
|
||||||
|
|
||||||
|
// 2. 读取bam的read group
|
||||||
|
// if (nsgv::gInBamHeader->hrecs->nrg == 0) {
|
||||||
|
// spdlog::error("No RG tag found in the header!");
|
||||||
|
// exit(1);
|
||||||
|
// }
|
||||||
|
|
||||||
|
/*
|
||||||
|
// 应该是从bqsr table里读取read group信息
|
||||||
|
for (int i = 0; i < nsgv::gInBamHeader->hrecs->nrg; ++i) {
|
||||||
|
// spdlog::info("rg: {}", nsgv::gInBamHeader->hrecs->rg[i].name);
|
||||||
|
ReadGroupCovariate::RgToId[nsgv::gInBamHeader->hrecs->rg[i].name] = i;
|
||||||
|
ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
// 读取并解析recalibrate tables
|
||||||
|
RecalUtils::readRecalTables(nsgv::gBqsrArg.BQSR_FILE, nsgv::gBqsrArg, nsgv::gQuantizedQuals, nsgv::gRecalTables);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 全局资源释放
|
||||||
|
static void globalDestroy() {
|
||||||
|
close_debug_files();
|
||||||
|
|
||||||
|
if (!nsgv::gBqsrArg.INDEX_FILE.empty() && sam_idx_save(nsgv::gOutBamFp) < 0) {
|
||||||
|
spdlog::error("writing index failed");
|
||||||
|
sam_close(nsgv::gOutBamFp);
|
||||||
|
sam_close(nsgv::gInBamFp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 关闭文件,收尾清理 */
|
||||||
|
sam_close(nsgv::gOutBamFp);
|
||||||
|
sam_close(nsgv::gInBamFp);
|
||||||
|
}
|
||||||
|
|
||||||
|
// entrance of BQSR phase-1
|
||||||
|
int ApplyBQSR() {
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
PROF_START(GP_whole_process);
|
||||||
|
globalInit();
|
||||||
|
// if (nsgv::gBqsrArg.NUM_THREADS == 1)
|
||||||
|
ret = SerialApplyBQSR(nsgv::gAuxVars[0]); // 串行处理数据,生成recal bams
|
||||||
|
// else
|
||||||
|
// ret = ParallelApplyBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal bams
|
||||||
|
globalDestroy();
|
||||||
|
PROF_GP_END(GP_whole_process);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
@ -43,11 +43,14 @@ struct AuxVar {
|
||||||
vector<VCFParser> vcfArr; // 从vcf中获取已知位点
|
vector<VCFParser> vcfArr; // 从vcf中获取已知位点
|
||||||
|
|
||||||
PerReadCovariateMatrix readCovariates; // 每个read对应的协变量矩阵
|
PerReadCovariateMatrix readCovariates; // 每个read对应的协变量矩阵
|
||||||
RecalTables recalTables; // 每个线程对应一个recal tables
|
RecalTables recalTables; // 每个线程对应一个recal tables,计算table的时候需要每个线程都有一个,输出bam的时候不需要,因为只读取,不写入
|
||||||
|
|
||||||
SamData sd;
|
SamData sd;
|
||||||
StableArray<int> isSNP, isIns, isDel; // 该位置是否是SNP, indel位置,0不是,1是
|
StableArray<int> isSNP, isIns, isDel; // 该位置是否是SNP, indel位置,0不是,1是
|
||||||
StableArray<uint8_t> baqArray;
|
StableArray<uint8_t> baqArray;
|
||||||
StableArray<double> snpErrors, insErrors, delErrors;
|
StableArray<double> snpErrors, insErrors, delErrors;
|
||||||
StableArray<uint8_t> skips; // 该位置是否是已知位点
|
StableArray<uint8_t> skips; // 该位置是否是已知位点
|
||||||
|
|
||||||
|
// only for apply bqsr
|
||||||
|
RecalTables* bqsrTable; // 保留一份就够
|
||||||
};
|
};
|
||||||
|
|
@ -11,6 +11,8 @@ Date : 2025/10/10
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "qual_utils.h"
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
||||||
|
|
@ -36,19 +38,19 @@ struct BQSRArg {
|
||||||
/* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
|
/* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
|
||||||
bool CREATE_INDEX = true;
|
bool CREATE_INDEX = true;
|
||||||
|
|
||||||
|
string INDEX_FILE;
|
||||||
|
|
||||||
nsbqsr::IndexFormat INDEX_FORMAT = nsbqsr::IndexFormat::BAI;
|
nsbqsr::IndexFormat INDEX_FORMAT = nsbqsr::IndexFormat::BAI;
|
||||||
|
|
||||||
/* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/
|
/* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/
|
||||||
bool ADD_PG_TAG_TO_READS = true;
|
bool ADD_PG_TAG_TO_READS = true;
|
||||||
|
|
||||||
//
|
// 命令行字符串
|
||||||
string CLI_STR;
|
string CLI_STR;
|
||||||
|
|
||||||
//
|
// 开始运行时间
|
||||||
string START_TIME;
|
string START_TIME;
|
||||||
|
|
||||||
string PROGRAM_RECORD_ID = "FastBQSR";
|
|
||||||
|
|
||||||
// reference file
|
// reference file
|
||||||
string REFERENCE_FILE;
|
string REFERENCE_FILE;
|
||||||
|
|
||||||
|
|
@ -137,14 +139,6 @@ struct BQSRArg {
|
||||||
*/
|
*/
|
||||||
double BAQGOP = 40;
|
double BAQGOP = 40;
|
||||||
|
|
||||||
/**
|
|
||||||
* This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in
|
|
||||||
* the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since base callers use these
|
|
||||||
* values to indicate random or bad bases. For example, Illumina writes Q2 bases when the machine has really gone wrong.
|
|
||||||
* This would be fine in and of itself, but when you select a subset of these reads based on their ability to align to the
|
|
||||||
* reference and their dinucleotide effect, your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream.
|
|
||||||
*/
|
|
||||||
int PRESERVE_QSCORES_LESS_THAN = 6;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了,用不到了?
|
* enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了,用不到了?
|
||||||
|
|
@ -152,7 +146,7 @@ struct BQSRArg {
|
||||||
bool enableBAQ = false;
|
bool enableBAQ = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* compute-indel-bqsr-tables, compute indel BQSR tables"
|
* compute-indel-bqsr-tables, compute indel BQSR tables" hidden
|
||||||
*/
|
*/
|
||||||
bool computeIndelBQSRTables = false;
|
bool computeIndelBQSRTables = false;
|
||||||
|
|
||||||
|
|
@ -162,16 +156,93 @@ struct BQSRArg {
|
||||||
//
|
//
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
/**
|
|
||||||
* This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which
|
|
||||||
* are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ
|
|
||||||
* tag is present for a read, the standard qual score will be used.
|
|
||||||
*/
|
|
||||||
bool useOriginalBaseQualities = false;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* If reads are missing some or all base quality scores, this value will be used for all base quality scores.
|
* If reads are missing some or all base quality scores, this value will be used for all base quality scores.
|
||||||
* By default this is set to -1 to disable default base quality assignment.
|
* By default this is set to -1 to disable default base quality assignment.
|
||||||
*/
|
*/
|
||||||
int8_t defaultBaseQualities = -1;
|
int8_t defaultBaseQualities = -1;
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// args for apply bqsr
|
||||||
|
string BQSR_FILE;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Turns on the base quantization module. It requires a recalibration report.
|
||||||
|
*
|
||||||
|
* A value of 0 here means "do not quantize".
|
||||||
|
* Any value greater than zero will be used to recalculate the quantization using that many levels.
|
||||||
|
* Negative values mean that we should quantize using the recalibration report's quantization level.
|
||||||
|
*/
|
||||||
|
// @Argument(fullName = "quantize-quals", doc = "Quantize quality scores to a given number of levels", optional = true)
|
||||||
|
int quantizationLevels = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Static quantized quals are entirely separate from the quantize_qual option which uses dynamic binning.
|
||||||
|
* The two types of binning should not be used together.
|
||||||
|
*
|
||||||
|
* For example, the Warp germline pipeline uses the static bins { 10, 20, 30, 40 }
|
||||||
|
*/
|
||||||
|
//@Advanced @Argument(fullName = static-quantized-quals,
|
||||||
|
// doc = "Use static quantized quality scores to a given number of levels (with -" +
|
||||||
|
// StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME + ")", optional = true, mutex = "quantize-quals")
|
||||||
|
vector<int> staticQuantizationQuals;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Round down quantized only works with the static_quantized_quals option, and should not be used with
|
||||||
|
* the dynamic binning option provided by quantize_quals. When roundDown = false, rounding is done in
|
||||||
|
* probability space to the nearest bin. When roundDown = true, the value is rounded to the nearest bin
|
||||||
|
* that is smaller than the current bin.
|
||||||
|
*/
|
||||||
|
// @Advanced @Argument(fullName = "round-down-quantized", doc = "Round quals down to nearest quantized qual", optional = true,
|
||||||
|
// mutex = "quantize-quals") public
|
||||||
|
bool roundDown = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The tool is capable of writing out the original quality scores of each read in the recalibrated output file
|
||||||
|
* under the "OQ" tag. By default, this behavior is disabled because emitting original qualities results in a
|
||||||
|
* significant increase of the file size. Use this flag to turn on emission of original qualities.
|
||||||
|
*/
|
||||||
|
//@Argument(fullName = "emit-original-quals", doc = "Emit original base qualities under the OQ tag",
|
||||||
|
// optional = true)
|
||||||
|
bool emitOriginalQuals = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If specified, the value of this argument will be used as a flat prior for all mismatching quality scores instead
|
||||||
|
* of the reported quality score (assigned by the sequencer).
|
||||||
|
*/
|
||||||
|
// @Argument(fullName = "global-qscore-prior", doc = "Global Qscore Bayesian prior to use for BQSR",
|
||||||
|
// optional = true)
|
||||||
|
double globalQScorePrior = -1.0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If set to true, do not throw an error upon encountering a read with a read group that's not in the recalibration table.
|
||||||
|
* Instead, simply set the quantized original base qualities as the recalibrated base qualities.
|
||||||
|
*/
|
||||||
|
// @Argument(fullName = allow-missing-read-group, doc = "Do not throw an error when encountering a read group not in the recal table",
|
||||||
|
// optional = true)
|
||||||
|
bool allowMissingReadGroups = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This flag tells GATK not to modify quality scores less than this value. Instead they will be written out
|
||||||
|
* unmodified in the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since
|
||||||
|
* base callers use these values to indicate random or bad bases. For example, Illumina writes Q2 bases when the
|
||||||
|
* machine has really gone wrong. This would be fine in and of itself, but when you select a subset of these reads
|
||||||
|
* based on their ability to align to the reference and their dinucleotide effect, your Q2 bin can be elevated to
|
||||||
|
* Q8 or Q10, leading to issues downstream.
|
||||||
|
*/
|
||||||
|
// @Argument(fullName = "preserve-qscores-less-than", doc = "Don't recalibrate bases with quality scores less than this threshold", optional = true,
|
||||||
|
// minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE)
|
||||||
|
int PRESERVE_QSCORES_LESS_THAN = QualityUtils::MIN_USABLE_Q_SCORE;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which
|
||||||
|
* are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ
|
||||||
|
* tag is present for a read, the standard quality score will be used.
|
||||||
|
*/
|
||||||
|
// @Argument(fullName = use-original-qualities, shortName = "OQ", doc = "Use the base quality scores from the OQ tag",
|
||||||
|
// optional = true)
|
||||||
|
bool useOriginalBaseQualities = false;
|
||||||
};
|
};
|
||||||
|
|
@ -1,13 +1,12 @@
|
||||||
/*
|
/*
|
||||||
Description:
|
Description: bqsr的第一阶段,生成协变量统计表的程序入口
|
||||||
bam,bam,bam
|
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
Author : Zhang Zhonghai
|
Author : Zhang Zhonghai
|
||||||
Date : 2023/10/23
|
Date : 2025/11/23
|
||||||
*/
|
*/
|
||||||
#include <header.h>
|
#include <header.h> // in htslib
|
||||||
#include <htslib/faidx.h>
|
#include <htslib/faidx.h>
|
||||||
#include <htslib/kstring.h>
|
#include <htslib/kstring.h>
|
||||||
#include <htslib/sam.h>
|
#include <htslib/sam.h>
|
||||||
|
|
@ -44,11 +43,10 @@ Date : 2023/10/23
|
||||||
#include "util/stable_array.h"
|
#include "util/stable_array.h"
|
||||||
#include "util/utils.h"
|
#include "util/utils.h"
|
||||||
#include "util/vcf_parser.h"
|
#include "util/vcf_parser.h"
|
||||||
|
#include "common_data.h"
|
||||||
|
|
||||||
using std::deque;
|
using std::deque;
|
||||||
|
|
||||||
// I/O block size handed to htslib: 16 MiB. The expansion is parenthesized so the
// macro behaves as a single value inside larger expressions (e.g. x / BAM_BLOCK_SIZE).
#define BAM_BLOCK_SIZE (16L * 1024 * 1024)
|
|
||||||
|
|
||||||
namespace nsgv {
|
namespace nsgv {
|
||||||
|
|
||||||
// 全局变量 for bqsr
|
// 全局变量 for bqsr
|
||||||
|
|
@ -58,29 +56,6 @@ sam_hdr_t* gInBamHeader; // input BAM header
|
||||||
vector<AuxVar> gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个
|
vector<AuxVar> gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个
|
||||||
}; // namespace nsgv
|
}; // namespace nsgv
|
||||||
|
|
||||||
|
|
||||||
// 过滤掉bqsr过程不符合要求的bam数据
|
|
||||||
bool bqsrReadFilterOut(const bam1_t *b) {
|
|
||||||
// 过滤掉unmapped的read
|
|
||||||
if (b->core.qual == 0) // mapping quality 0
|
|
||||||
return true;
|
|
||||||
if (b->core.qual == 255) // mapping quality not available
|
|
||||||
return true;
|
|
||||||
if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (b->core.flag & BAM_FSECONDARY) { // secondary alignment
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (b->core.flag & BAM_FDUP) { // secondary alignment
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (b->core.flag & BAM_FQCFAIL) { // Not passing quality controls
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 数据总结
|
// 数据总结
|
||||||
void collapseQualityScoreTableToReadGroupTable(Array2D<RecalDatum> &byReadGroupTable, Array3D<RecalDatum> &byQualTable) {
|
void collapseQualityScoreTableToReadGroupTable(Array2D<RecalDatum> &byReadGroupTable, Array3D<RecalDatum> &byQualTable) {
|
||||||
// 遍历quality table
|
// 遍历quality table
|
||||||
|
|
@ -137,7 +112,7 @@ static void printRecalTables(const RecalTables& rt) {
|
||||||
// 串行bqsr
|
// 串行bqsr
|
||||||
int SerialBQSR(AuxVar &aux) {
|
int SerialBQSR(AuxVar &aux) {
|
||||||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
|
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut);
|
||||||
int64_t readNumSum = 0;
|
int64_t readNumSum = 0;
|
||||||
int round = 0;
|
int round = 0;
|
||||||
|
|
||||||
|
|
@ -227,7 +202,7 @@ int SerialBQSR(AuxVar &aux) {
|
||||||
|
|
||||||
// 8. 计算这条read对应的协变量
|
// 8. 计算这条read对应的协变量
|
||||||
PROF_START(GP_covariate);
|
PROF_START(GP_covariate);
|
||||||
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, 0);
|
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0);
|
||||||
PROF_GP_END(GP_covariate);
|
PROF_GP_END(GP_covariate);
|
||||||
|
|
||||||
// fprintf(gf[4], "%ld %d\n", sd.rid, sd.read_len);
|
// fprintf(gf[4], "%ld %d\n", sd.rid, sd.read_len);
|
||||||
|
|
@ -251,7 +226,7 @@ int SerialBQSR(AuxVar &aux) {
|
||||||
|
|
||||||
// 9. 计算这条read需要跳过的位置
|
// 9. 计算这条read需要跳过的位置
|
||||||
PROF_START(GP_read_vcf);
|
PROF_START(GP_read_vcf);
|
||||||
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, 0);
|
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, RecalFuncs::MAX_SITES_INTERVAL, skips, 0);
|
||||||
for (int ii = 0; ii < sd.read_len; ++ii) {
|
for (int ii = 0; ii < sd.read_len; ++ii) {
|
||||||
skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) ||
|
skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) ||
|
||||||
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||||
|
|
@ -336,6 +311,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
|
||||||
int stopIdx = std::min((size_t)(idx + 1) * blockReadNums, bams.size());
|
int stopIdx = std::min((size_t)(idx + 1) * blockReadNums, bams.size());
|
||||||
#endif
|
#endif
|
||||||
// spdlog::info("tid {}, index {}, steal {}", tid, idx, steal);
|
// spdlog::info("tid {}, index {}, steal {}", tid, idx, steal);
|
||||||
|
// spdlog::info("interval span: {}", bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos());
|
||||||
|
int sitesStride = bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos();
|
||||||
|
// sitesStride = sitesStride >= RecalFuncs::MAX_SITES_INTERVAL ? sitesStride : RecalFuncs::MAX_SITES_INTERVAL;
|
||||||
|
sitesStride = RecalFuncs::MAX_SITES_INTERVAL;
|
||||||
aux.threadProcessedReads += stopIdx - startIdx;
|
aux.threadProcessedReads += stopIdx - startIdx;
|
||||||
for (int i = startIdx; i < stopIdx; ++i) {
|
for (int i = startIdx; i < stopIdx; ++i) {
|
||||||
// spdlog::info("Thread {} processing read idx: {}", tid, i);
|
// spdlog::info("Thread {} processing read idx: {}", tid, i);
|
||||||
|
|
@ -364,10 +343,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
|
||||||
if (!baqCalculated) continue;
|
if (!baqCalculated) continue;
|
||||||
|
|
||||||
PROF_START(TP_covariate);
|
PROF_START(TP_covariate);
|
||||||
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, thid);
|
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, thid);
|
||||||
PROF_TP_END(TP_covariate);
|
PROF_TP_END(TP_covariate);
|
||||||
|
|
||||||
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, thid);
|
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, sitesStride, skips, thid);
|
||||||
for (int ii = 0; ii < sd.read_len; ++ii) {
|
for (int ii = 0; ii < sd.read_len; ++ii) {
|
||||||
skips[ii] =
|
skips[ii] =
|
||||||
skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||||
|
|
@ -390,7 +369,7 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
|
||||||
// 并行bqsr
|
// 并行bqsr
|
||||||
int ParallelBQSR(vector<AuxVar>& auxArr) {
|
int ParallelBQSR(vector<AuxVar>& auxArr) {
|
||||||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
|
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut);
|
||||||
int64_t readNumSum = 0;
|
int64_t readNumSum = 0;
|
||||||
int round = 0;
|
int round = 0;
|
||||||
|
|
||||||
|
|
@ -484,8 +463,8 @@ static void globalInit() {
|
||||||
|
|
||||||
/* 并行读取bam数据 */
|
/* 并行读取bam数据 */
|
||||||
htsThreadPool htsPoolRead = {NULL, 0}; //
|
htsThreadPool htsPoolRead = {NULL, 0}; //
|
||||||
int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, 8);
|
int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD);
|
||||||
htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS);
|
htsPoolRead.pool = hts_tpool_init(readThreadNum);
|
||||||
if (!htsPoolRead.pool ) {
|
if (!htsPoolRead.pool ) {
|
||||||
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
||||||
sam_close(nsgv::gInBamFp);
|
sam_close(nsgv::gInBamFp);
|
||||||
|
|
@ -549,9 +528,10 @@ static void globalInit() {
|
||||||
// 全局资源释放
|
// 全局资源释放
|
||||||
static void globalDestroy() {
|
static void globalDestroy() {
|
||||||
close_debug_files();
|
close_debug_files();
|
||||||
|
sam_close(nsgv::gInBamFp);
|
||||||
}
|
}
|
||||||
|
|
||||||
// entrance of mark BQSR
|
// entrance of BQSR phase-1
|
||||||
int BaseRecalibrator() {
|
int BaseRecalibrator() {
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
|
|
@ -562,7 +542,6 @@ int BaseRecalibrator() {
|
||||||
else
|
else
|
||||||
ret = ParallelBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal table
|
ret = ParallelBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal table
|
||||||
globalDestroy();
|
globalDestroy();
|
||||||
sam_close(nsgv::gInBamFp);
|
|
||||||
PROF_GP_END(GP_whole_process);
|
PROF_GP_END(GP_whole_process);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,30 @@
|
||||||
|
/*
|
||||||
|
Description: 共用的一些宏定义,全局变量等
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2026/01/01
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "bqsr_args.h"
|
||||||
|
#include <htslib/sam.h>
|
||||||
|
#include "aux_arg.h"
|
||||||
|
#include "fastbqsr_version.h"
|
||||||
|
|
||||||
|
// 16 MiB. Parenthesized so the macro expands correctly inside larger expressions (e.g. x / BAM_BLOCK_SIZE).
#define BAM_BLOCK_SIZE (16L * 1024 * 1024)
|
||||||
|
#define BAM_READ_MAX_THREAD 8
|
||||||
|
#define BAM_WRITE_MAX_THREAD 16
|
||||||
|
|
||||||
|
#define PROGRAM_NAME "FastBQSR"
|
||||||
|
|
||||||
|
namespace nsgv {
|
||||||
|
|
||||||
|
// 全局变量 for bqsr
|
||||||
|
extern BQSRArg gBqsrArg; // bqsr arguments
|
||||||
|
extern samFile* gInBamFp; // input BAM file pointer
|
||||||
|
extern sam_hdr_t* gInBamHeader; // input BAM header
|
||||||
|
extern vector<AuxVar> gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个
|
||||||
|
}; // namespace nsgv
|
||||||
|
|
@ -250,19 +250,20 @@ void ContextCovariate::GetStrandedClippedBytes(SamData& sd, StableArray<char>& c
|
||||||
* @param end the end position in the array (exclusive)
|
* @param end the end position in the array (exclusive)
|
||||||
* @return the key representing the dna sequence
|
* @return the key representing the dna sequence
|
||||||
*/
|
*/
|
||||||
int ContextCovariate::KeyFromContext(const StableArray<char>& dna, const int start, const int end) {
|
// template<typename Arr> // 好像是类的模板函数必须在头文件里实现
|
||||||
int key = end - start;
|
// int ContextCovariate::KeyFromContext(const Arr& dna, const int start, const int end) {
|
||||||
int bitOffset = LENGTH_BITS;
|
// int key = end - start;
|
||||||
for (int i = start; i < end; i++) {
|
// int bitOffset = LENGTH_BITS;
|
||||||
const int baseIndex = baseIndexMap[dna[i] & 0xff];
|
// for (int i = start; i < end; i++) {
|
||||||
if (baseIndex == -1) { // ignore non-ACGT bases
|
// const int baseIndex = baseIndexMap[dna[i] & 0xff];
|
||||||
return -1;
|
// if (baseIndex == -1) { // ignore non-ACGT bases
|
||||||
}
|
// return -1;
|
||||||
key |= (baseIndex << bitOffset);
|
// }
|
||||||
bitOffset += 2;
|
// key |= (baseIndex << bitOffset);
|
||||||
}
|
// bitOffset += 2;
|
||||||
return key;
|
// }
|
||||||
}
|
// return key;
|
||||||
|
// }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
* For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||||||
|
|
@ -334,6 +335,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const StableArray<char>& bas
|
||||||
void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||||
const int originalReadLength = sd.read_len;
|
const int originalReadLength = sd.read_len;
|
||||||
|
|
||||||
|
const char* qname = bam_get_qname(sd.bw->b);
|
||||||
// store the original bases and then write Ns over low quality ones
|
// store the original bases and then write Ns over low quality ones
|
||||||
auto &strandedClippedBases = sd.strandedClippedBases;
|
auto &strandedClippedBases = sd.strandedClippedBases;
|
||||||
strandedClippedBases.copy(sd.bases);
|
strandedClippedBases.copy(sd.bases);
|
||||||
|
|
@ -348,13 +350,13 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
|
||||||
// since the context covariate may not span the entire set of values in read covariates
|
// since the context covariate may not span the entire set of values in read covariates
|
||||||
// due to the clipping of the low quality bases
|
// due to the clipping of the low quality bases
|
||||||
// 这段代码应该不会执行,因为clip with N不会改变read长度
|
// 这段代码应该不会执行,因为clip with N不会改变read长度
|
||||||
if (readLengthAfterClipping != originalReadLength) {
|
// if (readLengthAfterClipping != originalReadLength) {
|
||||||
// don't bother zeroing out if we are going to overwrite the whole array
|
// // don't bother zeroing out if we are going to overwrite the whole array
|
||||||
for (int i = 0; i < originalReadLength; i++) {
|
// for (int i = 0; i < originalReadLength; i++) {
|
||||||
// this base has been clipped off, so zero out the covariate values here
|
// // this base has been clipped off, so zero out the covariate values here
|
||||||
CovariateUtils::SetContext(0, 0, 0, i, values);
|
// CovariateUtils::SetContext(0, 0, 0, i, values);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
|
||||||
const bool negativeStrand = sd.bw->GetReadNegativeStrandFlag();
|
const bool negativeStrand = sd.bw->GetReadNegativeStrandFlag();
|
||||||
// Note: duplicated the loop to avoid checking recordIndelValues on each iteration
|
// Note: duplicated the loop to avoid checking recordIndelValues on each iteration
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,7 @@ using std::vector;
|
||||||
|
|
||||||
// 协变量的值, 4个协变量
|
// 协变量的值, 4个协变量
|
||||||
struct CovariateValues {
|
struct CovariateValues {
|
||||||
int readGroup = 0;
|
int readGroup = 0; // 默认是read group 0
|
||||||
int baseQuality = 0;
|
int baseQuality = 0;
|
||||||
int context = -1;
|
int context = -1;
|
||||||
int cycle = -1;
|
int cycle = -1;
|
||||||
|
|
@ -61,6 +61,16 @@ struct EventType {
|
||||||
static EventTypeValue BASE_INSERTION;
|
static EventTypeValue BASE_INSERTION;
|
||||||
static EventTypeValue BASE_DELETION;
|
static EventTypeValue BASE_DELETION;
|
||||||
static vector<EventTypeValue> EVENTS;
|
static vector<EventTypeValue> EVENTS;
|
||||||
|
static int GetIndexForEventRep(char eventType) {
|
||||||
|
if (eventType == BASE_SUBSTITUTION.representation) {
|
||||||
|
return BASE_SUBSTITUTION.index;
|
||||||
|
} else if (eventType == BASE_INSERTION.representation) {
|
||||||
|
return BASE_DELETION.index;
|
||||||
|
} else if (eventType == BASE_DELETION.representation) {
|
||||||
|
return BASE_DELETION.index;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Read group协变量
|
// Read group协变量
|
||||||
|
|
@ -201,7 +211,22 @@ struct ContextCovariate {
|
||||||
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||||||
static void GetStrandedClippedBytes(SamData& ad, StableArray<char>& clippedBases, uint8_t lowQTail);
|
static void GetStrandedClippedBytes(SamData& ad, StableArray<char>& clippedBases, uint8_t lowQTail);
|
||||||
// Creates a int representation of a given dna string.
|
// Creates a int representation of a given dna string.
|
||||||
static int KeyFromContext(const StableArray<char>& dna, const int start, const int end);
|
template <typename Arr>
|
||||||
|
static int KeyFromContext(const Arr& dna, const int start, const int end) {
|
||||||
|
int key = end - start;
|
||||||
|
int bitOffset = LENGTH_BITS;
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
const int baseIndex = baseIndexMap[dna[i] & 0xff];
|
||||||
|
if (baseIndex == -1) { // ignore non-ACGT bases
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
key |= (baseIndex << bitOffset);
|
||||||
|
bitOffset += 2;
|
||||||
|
}
|
||||||
|
return key;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convenience overload: builds the key from the whole string, i.e. positions [0, dna.size()).
static int KeyFromContext(const string& dna) { return KeyFromContext(dna, 0, dna.size()); }
|
||||||
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||||||
static void GetReadContextAtEachPosition(const StableArray<char>& bases, const int contextSize, const int mask, StableArray<int>& keys);
|
static void GetReadContextAtEachPosition(const StableArray<char>& bases, const int contextSize, const int mask, StableArray<int>& keys);
|
||||||
|
|
||||||
|
|
@ -219,6 +244,7 @@ struct CycleCovariate {
|
||||||
|
|
||||||
static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }
|
static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }
|
||||||
|
|
||||||
|
// Convenience overload: forwards to the two-argument KeyFromCycle with MAXIMUM_CYCLE_VALUE as the second argument.
static int KeyFromCycle(const int cycle) { return KeyFromCycle(cycle, MAXIMUM_CYCLE_VALUE); }
|
||||||
/**
|
/**
|
||||||
* Encodes the cycle number as a key.
|
* Encodes the cycle number as a key.
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,71 @@
|
||||||
|
/*
|
||||||
|
Description: apply bqsr过程中对sam的一些修改函数
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2026/01/02
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <htslib/sam.h>
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
struct ReadUtils {
|
||||||
|
// sam/bam文件中的quality score要在原来的quality上加33,应该是映射到字符区,方便用字母表示
|
||||||
|
// htslib解析之后已经减掉了,就是真实的质量分数
|
||||||
|
static constexpr int QUALITY_SCORE_ADD_IN_FILE = 33;
|
||||||
|
/**
|
||||||
|
* Resets the quality scores of the reads to the orginal (pre-BQSR) ones.
|
||||||
|
*/
|
||||||
|
static void resetOriginalBaseQualities(bam1_t *b) {
|
||||||
|
uint8_t* oq = bam_aux_get(b, "OQ");
|
||||||
|
char* oqVal = nullptr;
|
||||||
|
if (oq)
|
||||||
|
oqVal = bam_aux2Z(oq);
|
||||||
|
int crg = 0;
|
||||||
|
if (oqVal == nullptr) {
|
||||||
|
} else {
|
||||||
|
uint8_t* quals = bam_get_qual(b);
|
||||||
|
for (int i = 0; i < b->core.l_qseq; ++i) {
|
||||||
|
quals[i] = oqVal[i] - QUALITY_SCORE_ADD_IN_FILE; // 要减掉33
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get read group id
|
||||||
|
static const char* getReadGroupId(bam1_t *b) {
|
||||||
|
uint8_t* rgStr = bam_aux_get(b, "RG");
|
||||||
|
char* rgVal = nullptr;
|
||||||
|
if (rgStr)
|
||||||
|
rgVal = bam_aux2Z(rgStr);
|
||||||
|
return rgVal;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 将当前quals保存在OQ tag中 (如果当前read不存在OQ tag的情况下)
|
||||||
|
static void setOriginalBaseQualsIfNoOQ(bam1_t *b) {
|
||||||
|
uint8_t* tagData = bam_aux_get(b, "OQ");
|
||||||
|
if (tagData)
|
||||||
|
return;
|
||||||
|
char* tagVal = nullptr;
|
||||||
|
const char* oldQual = (char*)bam_get_qual(b);
|
||||||
|
string qual(b->core.l_qseq + 1, '\0');
|
||||||
|
for (int i = 0; i < qual.size() - 1; ++i) {
|
||||||
|
qual[i] = oldQual[i] + QUALITY_SCORE_ADD_IN_FILE;
|
||||||
|
}
|
||||||
|
// bam_aux_append 最后一个字符必须是'\0'
|
||||||
|
if (bam_aux_append(b, "OQ", 'Z', qual.size(), (const uint8_t*)qual.c_str()) != 0) {
|
||||||
|
spdlog::error("Add OQ (original quality score) tag failed. ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 移除给定的tag
|
||||||
|
static void removeAttribute(bam1_t *b, const string &tag) {
|
||||||
|
uint8_t* tagData = bam_aux_get(b, "OQ");
|
||||||
|
if (!tagData)
|
||||||
|
return;
|
||||||
|
if (bam_aux_remove(b, tagData) == nullptr) {
|
||||||
|
spdlog::error("Remove tag {} failed. ", tag);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
@ -147,14 +147,18 @@ struct RecalDatum {
|
||||||
*/
|
*/
|
||||||
inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); }
|
inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); }
|
||||||
inline double getNumMismatches() const { return numMismatches / MULTIPLIER; }
|
inline double getNumMismatches() const { return numMismatches / MULTIPLIER; }
|
||||||
|
// Stores the mismatch count scaled by MULTIPLIER (getNumMismatches() divides by MULTIPLIER again).
inline void setNumMismatches(double val) { numMismatches = val * MULTIPLIER; }
|
||||||
inline uint64_t getNumObservations() const { return numObservations; }
|
inline uint64_t getNumObservations() const { return numObservations; }
|
||||||
|
// Overwrites the observation count.
inline void setNumObservations(uint64_t val) { numObservations = val; }
|
||||||
inline double getReportedQuality() const { return reportedQuality; }
|
inline double getReportedQuality() const { return reportedQuality; }
|
||||||
|
// Overwrites the reported quality.
inline void setReportedQuality(double val) { reportedQuality = val; }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes the empirical quality of the datum, using the reported quality as the prior.
|
* Computes the empirical quality of the datum, using the reported quality as the prior.
|
||||||
* @see #getEmpiricalQuality(double) below.
|
* @see #getEmpiricalQuality(double) below.
|
||||||
*/
|
*/
|
||||||
double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); }
|
double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); }
|
||||||
|
// Overwrites the empiricalQuality member directly.
// NOTE(review): presumably this is the value getEmpiricalQuality(...) caches — confirm.
inline void setEmpiricalQuality(double val) { empiricalQuality = val; }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes the empirical base quality (roughly (num errors)/(num observations)) from the counts stored in this datum.
|
* Computes the empirical base quality (roughly (num errors)/(num observations)) from the counts stored in this datum.
|
||||||
|
|
|
||||||
|
|
@ -8,19 +8,55 @@
|
||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "util/stable_array.h"
|
#include <spdlog/spdlog.h>
|
||||||
#include "util/sam_data.h"
|
#include <numeric>
|
||||||
|
|
||||||
#include "bqsr/aux_arg.h"
|
#include "bqsr/aux_arg.h"
|
||||||
#include "util/bam_wrap.h"
|
#include "util/bam_wrap.h"
|
||||||
#include "util/interval.h"
|
|
||||||
#include "util/vcf_parser.h"
|
|
||||||
#include "util/profiling.h"
|
|
||||||
#include "util/debug.h"
|
#include "util/debug.h"
|
||||||
|
#include "util/interval.h"
|
||||||
|
#include "util/profiling.h"
|
||||||
|
#include "util/sam_data.h"
|
||||||
|
#include "util/stable_array.h"
|
||||||
|
#include "util/vcf_parser.h"
|
||||||
|
|
||||||
struct RecalFuncs {
|
struct RecalFuncs {
|
||||||
static constexpr int MAX_SITES_INTERVAL = 100000;
|
//static constexpr int MAX_SITES_INTERVAL = 100000;
|
||||||
|
static constexpr int MAX_SITES_INTERVAL = 21500; // 经验值,这个数读取vcf和计算的性能最好
|
||||||
static constexpr uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
|
static constexpr uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
|
||||||
|
|
||||||
|
// 过滤掉bqsr过程不符合要求的bam数据
|
||||||
|
static bool bqsrReadFilterOut(const bam1_t* b) {
|
||||||
|
// 过滤掉unmapped的read
|
||||||
|
if (b->core.qual == 0) // mapping quality 0
|
||||||
|
return true;
|
||||||
|
if (b->core.qual == 255) // mapping quality not available
|
||||||
|
return true;
|
||||||
|
if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (b->core.flag & BAM_FSECONDARY) { // secondary alignment
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (b->core.flag & BAM_FDUP) { // secondary alignment
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (b->core.flag & BAM_FQCFAIL) { // Not passing quality controls
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 过滤掉apply bqsr过程不符合要求的bam数据
|
||||||
|
static bool applyBqsrReadFilterOut(const bam1_t* b) {
|
||||||
|
// 好像4.6.2的GATK版本的welformed filter啥也没过滤
|
||||||
|
// b->core.flag & BAM_FUNMAP ||
|
||||||
|
if ((b->core.tid == -1 || b->core.pos == -1) && !(b->core.flag & BAM_FMUNMAP)) { // unmapped
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// 设置某个位置是indel
|
// 设置某个位置是indel
|
||||||
static inline void updateIndel(StableArray<int>& isIndel, int index) {
|
static inline void updateIndel(StableArray<int>& isIndel, int index) {
|
||||||
if (index >= 0 && index < isIndel.size()) {
|
if (index >= 0 && index < isIndel.size()) {
|
||||||
|
|
@ -94,18 +130,16 @@ struct RecalFuncs {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取一行字符串
|
// 获取一行字符串
|
||||||
static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* line) {
|
static void get_line_from_buf(char* buf, int64_t total, int64_t* cur) {
|
||||||
line->clear();
|
if (*cur >= total) {
|
||||||
if (*cur >= total)
|
(*cur)++;
|
||||||
return;
|
return;
|
||||||
char b;
|
|
||||||
while (*cur < total && (b = buf[(*cur)++]) != '\n') {
|
|
||||||
line->push_back(b);
|
|
||||||
}
|
}
|
||||||
|
while (*cur < total && buf[(*cur)++] != '\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
||||||
static void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, sam_hdr_t* samHdr, StableArray<uint8_t>& knownSites, int thid) {
|
static void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, sam_hdr_t* samHdr, int sitesStride, StableArray<uint8_t>& knownSites, int thid) {
|
||||||
BamWrap* bw = sd.bw;
|
BamWrap* bw = sd.bw;
|
||||||
int tid = bw->contig_id();
|
int tid = bw->contig_id();
|
||||||
int64_t startPos = bw->start_pos(); // 闭区间,使用clip之前的read匹配的范围
|
int64_t startPos = bw->start_pos(); // 闭区间,使用clip之前的read匹配的范围
|
||||||
|
|
@ -113,7 +147,6 @@ struct RecalFuncs {
|
||||||
knownSites.resize_fill(sd.read_len, 0);
|
knownSites.resize_fill(sd.read_len, 0);
|
||||||
|
|
||||||
// update vcfs
|
// update vcfs
|
||||||
// int idx = 0;
|
|
||||||
PROF_START(TP_read_vcf);
|
PROF_START(TP_read_vcf);
|
||||||
for (auto& vcf : vcfs) {
|
for (auto& vcf : vcfs) {
|
||||||
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam,不需要读取
|
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam,不需要读取
|
||||||
|
|
@ -130,7 +163,7 @@ struct RecalFuncs {
|
||||||
vcf.knownSites.clear(); // 清空,因为后面会读入覆盖bam的所有vcf位点
|
vcf.knownSites.clear(); // 清空,因为后面会读入覆盖bam的所有vcf位点
|
||||||
// 读取新的interval
|
// 读取新的interval
|
||||||
int64_t fpos, flen;
|
int64_t fpos, flen;
|
||||||
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
|
endPos = std::max(startPos + sitesStride, endPos);
|
||||||
Interval readIntv(startPos, endPos);
|
Interval readIntv(startPos, endPos);
|
||||||
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
|
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
|
||||||
// fprintf(gf[thid * 2 + idx], "%s %d %ld %ld %ld\n", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid, fpos, flen);
|
// fprintf(gf[thid * 2 + idx], "%s %d %ld %ld %ld\n", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid, fpos, flen);
|
||||||
|
|
@ -143,10 +176,12 @@ struct RecalFuncs {
|
||||||
}
|
}
|
||||||
char* buf = vcf.buf;
|
char* buf = vcf.buf;
|
||||||
vcf.inStm.read(buf, flen);
|
vcf.inStm.read(buf, flen);
|
||||||
string line;
|
|
||||||
int64_t cur = 0;
|
int64_t cur = 0;
|
||||||
get_line_from_buf(buf, flen, &cur, &line);
|
int64_t start = 0;
|
||||||
while (line.size() > 0) {
|
get_line_from_buf(buf, flen, &cur);
|
||||||
|
while (cur > start + 1) {
|
||||||
|
const string line(buf + start, buf + cur - 1);
|
||||||
|
// spdlog::info("s: {}, e: {}, line: {}", start, cur, line);
|
||||||
stringstream ss_line(line);
|
stringstream ss_line(line);
|
||||||
string stid;
|
string stid;
|
||||||
int tid, pos;
|
int tid, pos;
|
||||||
|
|
@ -159,7 +194,8 @@ struct RecalFuncs {
|
||||||
if (varIntv.overlaps(readIntv)) {
|
if (varIntv.overlaps(readIntv)) {
|
||||||
vcf.knownSites.push_back(varIntv); // 闭区间
|
vcf.knownSites.push_back(varIntv); // 闭区间
|
||||||
}
|
}
|
||||||
get_line_from_buf(buf, flen, &cur, &line);
|
start = cur;
|
||||||
|
get_line_from_buf(buf, flen, &cur);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -195,7 +231,6 @@ struct RecalFuncs {
|
||||||
knownSites[i] = true;
|
knownSites[i] = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//idx += 1;
|
|
||||||
}
|
}
|
||||||
PROF_TP_END(TP_calc_skips);
|
PROF_TP_END(TP_calc_skips);
|
||||||
//fprintf(gf[0], "\n");
|
//fprintf(gf[0], "\n");
|
||||||
|
|
@ -241,4 +276,53 @@ struct RecalFuncs {
|
||||||
calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs);
|
calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// for apply bqsr
|
||||||
|
/**
|
||||||
|
* Quality score recalibration algorithm works as follows:
|
||||||
|
* - Start with the (approximate, or "estimated") reported quality score. (Approximation occurs when marginalizing/collapsing
|
||||||
|
* over the reported qualities for each read group).
|
||||||
|
* - Compute (retrieve?) the empirical quality score using the per-read group datum (i.e. counts). Call it y_1.
|
||||||
|
* - Use y_1 just computed as the prior for the empirical quality score for the datum for the 2-tuple ( read group, quality score). Call it y_2.
|
||||||
|
* - Use y_2 as the prior to compute the empirical quality for the 3-tuple ( read-group, quality-score, special covariate ). Call it y_3 for the
|
||||||
|
* context covariate. Similarly define y_4 for the cycle covariate. Let d_3 = y_3 - y_2, d_4 = y_4 - y_2.
|
||||||
|
* - (final recalibrated score) = y_2 + d_3 + d_4 = y_3 + y_4 - y_2.
|
||||||
|
*
|
||||||
|
* @param priorQualityScore the prior quality score (in log space). It is either the "estimated" or collapsed reported quality score
|
||||||
|
* for the read group, or the constant prior if given. This value has type double because of the "combine" (or collapse)
|
||||||
|
* operation that collapses the quality scores represented within the same read group.
|
||||||
|
* @param readGroupDatum the RecalDatum object for a particular read group at hand. May be null.
|
||||||
|
* @param qualityScoreDatum the RecalDatum object for a particular (read group, reported quality) tuple at hand. May be null.
|
||||||
|
* @param specialCovariateDatums the array of RecalDatum objects for the non-required covariates (cycle and context covariates by default).
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
static double hierarchicalBayesianQualityEstimate(double priorQualityScore, RecalDatum& readGroupDatum, RecalDatum& qualityScoreDatum,
|
||||||
|
RecalDatum& contextDatum, RecalDatum& cycleDatum) {
|
||||||
|
double empiricalQualityForReadGroup =
|
||||||
|
readGroupDatum.getNumObservations() == 0 ? priorQualityScore : readGroupDatum.getEmpiricalQuality(priorQualityScore);
|
||||||
|
double posteriorEmpiricalQualityForReportedQuality = qualityScoreDatum.getNumObservations() == 0
|
||||||
|
? empiricalQualityForReadGroup
|
||||||
|
: qualityScoreDatum.getEmpiricalQuality(empiricalQualityForReadGroup);
|
||||||
|
|
||||||
|
double deltaSpecialCovariates = 0.0;
|
||||||
|
// At this point we stop being iterative; the special covariates (context and cycle by default) are treated differently.
|
||||||
|
if (contextDatum.getNumObservations() > 0) {
|
||||||
|
// TODO: the prior is ignored if the empirical quality for the datum is already cached.
|
||||||
|
deltaSpecialCovariates +=
|
||||||
|
contextDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality;
|
||||||
|
}
|
||||||
|
if (cycleDatum.getNumObservations() > 0) {
|
||||||
|
// TODO: the prior is ignored if the empirical quality for the datum is already cached.
|
||||||
|
deltaSpecialCovariates +=
|
||||||
|
cycleDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality;
|
||||||
|
}
|
||||||
|
|
||||||
|
return posteriorEmpiricalQualityForReportedQuality + deltaSpecialCovariates;
|
||||||
|
}
|
||||||
|
|
||||||
|
// recalibrated quality is bound between 1 and MAX_QUAL
|
||||||
|
static uint8_t getBoundedIntegerQual(double recalibratedQualDouble) {
|
||||||
|
return QualityUtils::boundQual(MathUtils::fastRound(recalibratedQualDouble), RecalDatum::MAX_RECALIBRATED_Q_SCORE);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
@ -11,9 +11,10 @@
|
||||||
#include "covariate.h"
|
#include "covariate.h"
|
||||||
#include "nested_array.h"
|
#include "nested_array.h"
|
||||||
#include "recal_datum.h"
|
#include "recal_datum.h"
|
||||||
|
#include "qual_utils.h"
|
||||||
|
|
||||||
struct RecalTables {
|
struct RecalTables {
|
||||||
int qualDimension = 94; // MAX_SAM_QUAL_SCORE(93) + 1
|
int qualDimension = QualityUtils::MAX_SAM_QUAL_SCORE + 1; // MAX_SAM_QUAL_SCORE(93) + 1
|
||||||
int eventDimension = EventType::EVENT_SIZE;
|
int eventDimension = EventType::EVENT_SIZE;
|
||||||
int numReadGroups;
|
int numReadGroups;
|
||||||
|
|
||||||
|
|
@ -41,4 +42,22 @@ struct RecalTables {
|
||||||
contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1);
|
contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1);
|
||||||
cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1);
|
cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Records the number of read groups, then initializes the read-group table with
// dimensions (eventDimension, numReadGroups).
void initReadGroupTable(int _numReadGroups) {
    numReadGroups = _numReadGroups;
    const int readGroupDim = numReadGroups;
    readGroupTable.init(eventDimension, readGroupDim);
}
|
||||||
|
|
||||||
|
// 必须在调用initReadGroupTable之后,或者设置了numReadGroups之后
|
||||||
|
void initQualityScoreTable() {
|
||||||
|
qualityScoreTable.init(eventDimension, numReadGroups, qualDimension);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initializes the context table; its last dimension is ContextCovariate::MaximumKeyValue() + 1.
// Uses numReadGroups, so that member has to be set beforehand.
void initContextTable() {
    const auto contextKeyCount = ContextCovariate::MaximumKeyValue() + 1;
    contextTable.init(eventDimension, numReadGroups, qualDimension, contextKeyCount);
}
|
||||||
|
|
||||||
|
// Initializes the cycle table; its last dimension is CycleCovariate::MaximumKeyValue() + 1.
// Uses numReadGroups, so that member has to be set beforehand.
void initCycleTable() {
    const auto cycleKeyCount = CycleCovariate::MaximumKeyValue() + 1;
    cycleTable.init(eventDimension, numReadGroups, qualDimension, cycleKeyCount);
}
|
||||||
};
|
};
|
||||||
|
|
@ -11,6 +11,8 @@
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
#include "bqsr_args.h"
|
#include "bqsr_args.h"
|
||||||
#include "nested_array.h"
|
#include "nested_array.h"
|
||||||
|
|
@ -79,25 +81,6 @@ struct RecalUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// fprintf(gf[4], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
|
||||||
// _Foreach3D(qualityScoreTable, val, {
|
|
||||||
// if (val.numObservations > 0)
|
|
||||||
// fprintf(gf[4], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
|
||||||
// });
|
|
||||||
// fprintf(gf[4], "\n");
|
|
||||||
|
|
||||||
// fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
|
||||||
// for (auto& arr1 : contextTable.data) {
|
|
||||||
// for (size_t si = 0; si < arr1.size(); ++si) {
|
|
||||||
// for (auto &arr2 : arr1[si]) {
|
|
||||||
// for (auto& val : arr2) {
|
|
||||||
// if (val.numObservations > 0)
|
|
||||||
// fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// fprintf(gf[3], "\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 输出bqsr报告
|
// 输出bqsr报告
|
||||||
|
|
@ -110,7 +93,7 @@ struct RecalUtils {
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
// 输出version信息
|
// 输出version信息
|
||||||
fprintf(fpout, "%s\n", REPORT_HEADER_VERSION);
|
fprintf(fpout, "%s:%s\n", REPORT_HEADER_VERSION, REPORT_HEADER_MINOR_VERSION);
|
||||||
// 输出参数信息
|
// 输出参数信息
|
||||||
outputArgsTable(RAC, fpout);
|
outputArgsTable(RAC, fpout);
|
||||||
// 输出量化质量分数信息
|
// 输出量化质量分数信息
|
||||||
|
|
@ -268,4 +251,220 @@ struct RecalUtils {
|
||||||
});
|
});
|
||||||
table.write(fpout);
|
table.write(fpout);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 解析bqsr table文件
|
||||||
|
static void readRecalTables(const string& recalTableFn, BQSRArg& RAC, vector<uint8_t>& quantizedScore, RecalTables& recalTables) {
|
||||||
|
std::ifstream infs(recalTableFn, ifstream::in);
|
||||||
|
string line;
|
||||||
|
// 读取文件头
|
||||||
|
std::getline(infs, line);
|
||||||
|
spdlog::info("header: {}", line);
|
||||||
|
if (line.find(REPORT_HEADER_VERSION) != 0) {
|
||||||
|
spdlog::error("BQSR table version not supported! {} {}", line, __LINE__);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 读取参数表
|
||||||
|
readArgsTable(infs, RAC);
|
||||||
|
|
||||||
|
readQuantTable(infs, quantizedScore);
|
||||||
|
|
||||||
|
readReadGroupTable(infs, recalTables);
|
||||||
|
|
||||||
|
readQualityScoreTable(infs, recalTables);
|
||||||
|
|
||||||
|
readContextCycleTable(infs, recalTables);
|
||||||
|
|
||||||
|
infs.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Splits str on runs of whitespace.
 *
 * @param str text to split
 * @param dat container with clear() and push_back(); it is emptied first and then
 *            receives the tokens in order (leading/trailing whitespace yields no tokens)
 */
template <typename Arr>
static void split(const std::string& str, Arr& dat) {
    dat.clear();
    std::istringstream tokens(str);
    for (std::string word; tokens >> word;) {
        dat.push_back(word);
    }
}
|
||||||
|
|
||||||
|
/**
 * Splits str at every occurrence of delimiter.
 *
 * Empty fields between two consecutive delimiters are kept; a delimiter at the very end of
 * the string does not produce a trailing empty field, and an empty string yields no fields.
 *
 * @param str       text to split
 * @param delimiter separator character
 * @param dat       container with clear() and push_back(); emptied first, then filled in order
 */
template <typename Arr>
static void split(const std::string& str, const char delimiter, Arr& dat) {
    dat.clear();
    std::istringstream fields(str);
    for (std::string field; std::getline(fields, field, delimiter);) {
        dat.push_back(field);
    }
}
|
||||||
|
|
||||||
|
// 读取参数表
|
||||||
|
static void readArgsTable(std::ifstream& infs, BQSRArg& p) {
|
||||||
|
string line;
|
||||||
|
StableArray<string> dat;
|
||||||
|
std::getline(infs, line);
|
||||||
|
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||||
|
split(line, ':', dat);
|
||||||
|
int argRowCnt = std::stoi(dat[3]);
|
||||||
|
std::getline(infs, line); // 忽略表格说明的一行
|
||||||
|
std::getline(infs, line); // 忽略表头column说明
|
||||||
|
for (int i = 0; i < argRowCnt; ++i) {
|
||||||
|
std::getline(infs, line);
|
||||||
|
split(line, dat);
|
||||||
|
if ("covariate" == dat[0]) {
|
||||||
|
} else if ("no_standard_covs" == dat[0]) {
|
||||||
|
p.DO_NOT_USE_STANDARD_COVARIATES = ReportUtil::ToBool(dat[1]);
|
||||||
|
} else if ("run_without_dbsnp" == dat[0]) {
|
||||||
|
p.RUN_WITHOUT_DBSNP = ReportUtil::ToBool(dat[1]);
|
||||||
|
} else if ("solid_recal_mode" == dat[0]) {
|
||||||
|
p.SOLID_RECAL_MODE = ReportUtil::ParseString(dat[1]);
|
||||||
|
} else if ("solid_nocall_strategy" == dat[0]) {
|
||||||
|
p.SOLID_NOCALL_STRATEGY = ReportUtil::ParseString(dat[1]);
|
||||||
|
} else if ("mismatches_context_size" == dat[0]) {
|
||||||
|
p.MISMATCHES_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]);
|
||||||
|
} else if ("indels_context_size" == dat[0]) {
|
||||||
|
p.INDELS_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]);
|
||||||
|
} else if ("mismatches_default_quality" == dat[0]) {
|
||||||
|
p.MISMATCHES_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
|
||||||
|
// spdlog::info("int8_t : {}", (int)p.MISMATCHES_DEFAULT_QUALITY);
|
||||||
|
} else if ("deletions_default_quality" == dat[0]) {
|
||||||
|
p.DELETIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
|
||||||
|
} else if ("insertions_default_quality" == dat[0]) {
|
||||||
|
p.INSERTIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
|
||||||
|
} else if ("maximum_cycle_value" == dat[0]) {
|
||||||
|
p.MAXIMUM_CYCLE_VALUE = ReportUtil::ToInt(dat[1]);
|
||||||
|
} else if ("low_quality_tail" == dat[0]) {
|
||||||
|
p.LOW_QUAL_TAIL = ReportUtil::ToInt(dat[1]);
|
||||||
|
} else if ("default_platform" == dat[0]) {
|
||||||
|
p.DEFAULT_PLATFORM = ReportUtil::ParseString(dat[1]);
|
||||||
|
} else if ("force_platform" == dat[0]) {
|
||||||
|
p.FORCE_PLATFORM = ReportUtil::ParseString(dat[1]);
|
||||||
|
} else if ("quantizing_levels" == dat[0]) {
|
||||||
|
p.QUANTIZING_LEVELS = ReportUtil::ToInt(dat[1]);
|
||||||
|
} else if ("recalibration_report" == dat[0]) {
|
||||||
|
p.existingRecalibrationReport = ReportUtil::ParseString(dat[1]);
|
||||||
|
} else if ("binary_tag_name" == dat[0]) {
|
||||||
|
p.BINARY_TAG_NAME = ReportUtil::ParseString(dat[1]);
|
||||||
|
// spdlog::info("BINARY_TAG_NAME : {}", p.BINARY_TAG_NAME);
|
||||||
|
} else {
|
||||||
|
spdlog::error("unknown argument : {} {}", dat[0], dat[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 读取量化质量分数信息表
|
||||||
|
static void readQuantTable(std::ifstream& infs, vector<uint8_t>& quantizedScore) {
|
||||||
|
string line;
|
||||||
|
StableArray<string> dat;
|
||||||
|
std::getline(infs, line);
|
||||||
|
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||||
|
split(line, ':', dat);
|
||||||
|
int argRowCnt = std::stoi(dat[3]);
|
||||||
|
std::getline(infs, line); // 忽略表格说明的一行
|
||||||
|
std::getline(infs, line); // 忽略表头column说明
|
||||||
|
quantizedScore.resize(argRowCnt);
|
||||||
|
for (int i = 0; i < argRowCnt; ++i) {
|
||||||
|
std::getline(infs, line);
|
||||||
|
// spdlog::info("{}", line);
|
||||||
|
split(line, dat);
|
||||||
|
quantizedScore[ReportUtil::ToInt(dat[0])] = ReportUtil::ToInt(dat[2]);
|
||||||
|
// spdlog::info("{}, {}", ReportUtil::ToInt(dat[0]), quantizedScore[ReportUtil::ToInt(dat[0])]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 读取read group表
|
||||||
|
static void readReadGroupTable(std::ifstream& infs, RecalTables& recalTables) {
|
||||||
|
string line;
|
||||||
|
StableArray<string> dat;
|
||||||
|
std::getline(infs, line);
|
||||||
|
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||||
|
split(line, ':', dat);
|
||||||
|
int argRowCnt = std::stoi(dat[3]);
|
||||||
|
std::getline(infs, line); // 忽略表格说明的一行
|
||||||
|
std::getline(infs, line); // 忽略表头column说明
|
||||||
|
vector<vector<string>> dats(argRowCnt);
|
||||||
|
int rgId = 0;
|
||||||
|
for (int i = 0; i < argRowCnt; ++i) {
|
||||||
|
std::getline(infs, line);
|
||||||
|
split(line, dats[i]);
|
||||||
|
if (ReadGroupCovariate::RgToId.find(dats[i][0]) == ReadGroupCovariate::RgToId.end()) {
|
||||||
|
ReadGroupCovariate::RgToId[dats[i][0]] = rgId;
|
||||||
|
ReadGroupCovariate::IdToRg[rgId] = dats[i][0];
|
||||||
|
++rgId;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
recalTables.initReadGroupTable(rgId);
|
||||||
|
// spdlog::info("read group num: {}, {}", rgId, ReadGroupCovariate::IdToRg[0]);
|
||||||
|
for (int i = 0; i < dats.size(); ++i) {
|
||||||
|
int k2 = ReadGroupCovariate::RgToId[dats[i][0]];
|
||||||
|
int k1 = EventType::GetIndexForEventRep(dats[i][1][0]);
|
||||||
|
auto& datum = recalTables.readGroupTable(k1, k2);
|
||||||
|
datum.setEmpiricalQuality(ReportUtil::ToDouble(dats[i][2]));
|
||||||
|
datum.setReportedQuality(ReportUtil::ToDouble(dats[i][3]));
|
||||||
|
datum.setNumObservations(ReportUtil::ToUint64(dats[i][4]));
|
||||||
|
datum.setNumMismatches(ReportUtil::ToDouble(dats[i][5]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 读取质量分数表
|
||||||
|
static void readQualityScoreTable(std::ifstream& infs, RecalTables& recalTables) {
|
||||||
|
string line;
|
||||||
|
StableArray<string> dat;
|
||||||
|
std::getline(infs, line);
|
||||||
|
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||||
|
split(line, ':', dat);
|
||||||
|
int argRowCnt = std::stoi(dat[3]);
|
||||||
|
std::getline(infs, line); // 忽略表格说明的一行
|
||||||
|
std::getline(infs, line); // 忽略表头column说明
|
||||||
|
recalTables.initQualityScoreTable();
|
||||||
|
int rgId = 0;
|
||||||
|
for (int i = 0; i < argRowCnt; ++i) {
|
||||||
|
std::getline(infs, line);
|
||||||
|
split(line, dat);
|
||||||
|
int k2 = ReadGroupCovariate::RgToId[dat[0]];
|
||||||
|
int k3 = ReportUtil::ToInt(dat[1]);
|
||||||
|
int k1 = EventType::GetIndexForEventRep(dat[2][0]);
|
||||||
|
auto& datum = recalTables.qualityScoreTable(k1, k2, k3);
|
||||||
|
datum.setEmpiricalQuality(ReportUtil::ToDouble(dat[3]));
|
||||||
|
datum.setReportedQuality((double)k3);
|
||||||
|
datum.setNumObservations(ReportUtil::ToUint64(dat[4]));
|
||||||
|
datum.setNumMismatches(ReportUtil::ToDouble(dat[5]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 读取context和cycle表
|
||||||
|
static void readContextCycleTable(std::ifstream& infs, RecalTables& recalTables) {
|
||||||
|
string line;
|
||||||
|
StableArray<string> dat;
|
||||||
|
std::getline(infs, line);
|
||||||
|
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||||
|
split(line, ':', dat);
|
||||||
|
int argRowCnt = std::stoi(dat[3]);
|
||||||
|
std::getline(infs, line); // 忽略表格说明的一行
|
||||||
|
std::getline(infs, line); // 忽略表头column说明
|
||||||
|
recalTables.initContextTable();
|
||||||
|
recalTables.initCycleTable();
|
||||||
|
for (int i = 0; i < argRowCnt; ++i) {
|
||||||
|
std::getline(infs, line);
|
||||||
|
split(line, dat);
|
||||||
|
int k2 = ReadGroupCovariate::RgToId[dat[0]];
|
||||||
|
int k3 = ReportUtil::ToInt(dat[1]);
|
||||||
|
int k4 = 0;
|
||||||
|
int k1 = EventType::GetIndexForEventRep(dat[4][0]);
|
||||||
|
RecalDatum* dp;
|
||||||
|
if (dat[3] == "Cycle") {
|
||||||
|
k4 = CycleCovariate::KeyFromCycle(ReportUtil::ToInt(dat[2]));
|
||||||
|
dp = &recalTables.cycleTable(k1, k2, k3, k4);
|
||||||
|
} else if (dat[3] == "Context") {
|
||||||
|
k4 = ContextCovariate::KeyFromContext(dat[2]);
|
||||||
|
dp = &recalTables.contextTable(k1, k2, k3, k4);
|
||||||
|
} else {
|
||||||
|
spdlog::error("Unknown CovariateName {}", dat[3]);
|
||||||
|
}
|
||||||
|
dp->setEmpiricalQuality(ReportUtil::ToDouble(dat[5]));
|
||||||
|
dp->setReportedQuality((double)k3);
|
||||||
|
dp->setNumObservations(ReportUtil::ToUint64(dat[6]));
|
||||||
|
dp->setNumMismatches(ReportUtil::ToDouble(dat[7]));
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
206
src/main.cpp
206
src/main.cpp
|
|
@ -12,10 +12,7 @@
|
||||||
#include "fastbqsr_version.h"
|
#include "fastbqsr_version.h"
|
||||||
#include "bqsr/bqsr_args.h"
|
#include "bqsr/bqsr_args.h"
|
||||||
#include "util/profiling.h"
|
#include "util/profiling.h"
|
||||||
|
#include "bqsr/common_data.h"
|
||||||
namespace nsgv {
|
|
||||||
extern BQSRArg gBqsrArg;
|
|
||||||
};
|
|
||||||
|
|
||||||
int BaseRecalibrator();
|
int BaseRecalibrator();
|
||||||
int ApplyBQSR();
|
int ApplyBQSR();
|
||||||
|
|
@ -30,28 +27,18 @@ string get_current_time_str() {
|
||||||
return string(now);
|
return string(now);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main_BaseRecalibrator(int argc, char *argv[]) {
|
// 添加bqsr和apply bqsr的共同使用的参数
|
||||||
// init arg parser
|
static void addCommonArgs(argparse::ArgumentParser &program, const char *outputDesc) {
|
||||||
argparse::ArgumentParser program(nsgv::gBqsrArg.PROGRAM_RECORD_ID, FASTBQSR_VERSION, argparse::default_arguments::none);
|
|
||||||
program.add_description(
|
|
||||||
"First pass of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n"
|
|
||||||
"user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)");
|
|
||||||
|
|
||||||
program.add_argument("--input")
|
program.add_argument("--input")
|
||||||
.help("BAM/SAM/CRAM file containing reads This argument must be specified at least once.")
|
.help("BAM/SAM/CRAM file containing reads This argument must be specified at least once.")
|
||||||
.metavar("<INPUT>")
|
.metavar("<INPUT>")
|
||||||
.required();
|
.required();
|
||||||
|
|
||||||
program.add_argument("--output")
|
program.add_argument("--output")
|
||||||
.help("The output recalibration table file to create.")
|
.help(outputDesc)
|
||||||
.metavar("<OUTPUT>")
|
.metavar("<OUTPUT>")
|
||||||
.required();
|
.required();
|
||||||
|
|
||||||
program.add_argument("--reference")
|
|
||||||
.help("Reference sequence file.")
|
|
||||||
.metavar("<Reference>")
|
|
||||||
.required();
|
|
||||||
|
|
||||||
program.add_argument("--num-threads")
|
program.add_argument("--num-threads")
|
||||||
.help("Number of threads to use.")
|
.help("Number of threads to use.")
|
||||||
.scan<'i', int>()
|
.scan<'i', int>()
|
||||||
|
|
@ -59,6 +46,66 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
||||||
.nargs(1)
|
.nargs(1)
|
||||||
.metavar("<NUM_THREADS>");
|
.metavar("<NUM_THREADS>");
|
||||||
|
|
||||||
|
program.add_argument("--reference")
|
||||||
|
.help("Reference sequence file.")
|
||||||
|
.metavar("<Reference>")
|
||||||
|
.required();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 添加帮助和版本信息
|
||||||
|
static void addHelpVersion(argparse::ArgumentParser& program) {
|
||||||
|
program.add_argument("-h", "--help")
|
||||||
|
.action([&](const auto& /*unused*/) {
|
||||||
|
std::cout << program.help().str();
|
||||||
|
std::exit(0);
|
||||||
|
})
|
||||||
|
.default_value(false)
|
||||||
|
.help("shows help message and exits")
|
||||||
|
.implicit_value(true)
|
||||||
|
.nargs(0);
|
||||||
|
|
||||||
|
program.add_argument("-v", "--version")
|
||||||
|
.action([&](const auto& /*unused*/) {
|
||||||
|
std::cout << FASTBQSR_VERSION << std::endl;
|
||||||
|
std::exit(0);
|
||||||
|
})
|
||||||
|
.default_value(false)
|
||||||
|
.help("prints version information and exits")
|
||||||
|
.implicit_value(true)
|
||||||
|
.nargs(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 添加运行时间信息和命令行参数列表
|
||||||
|
static void addTimeCLI(int argc, char* argv[]) {
|
||||||
|
nsgv::gBqsrArg.START_TIME = get_current_time_str();
|
||||||
|
nsgv::gBqsrArg.CLI_STR = argv[0];
|
||||||
|
for (int i = 1; i < argc; ++i) {
|
||||||
|
nsgv::gBqsrArg.CLI_STR += " " + std::string(argv[i]);
|
||||||
|
}
|
||||||
|
// spdlog::info("cmd: {}", nsgv::gBqsrArg.CLI_STR);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 解析公共参数
|
||||||
|
static void parseCommonArgs(argparse::ArgumentParser& program) {
|
||||||
|
nsgv::gBqsrArg.INPUT_FILE = program.get("--input");
|
||||||
|
nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
|
||||||
|
nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
|
||||||
|
if (nsgv::gBqsrArg.NUM_THREADS < 1) {
|
||||||
|
spdlog::error("num-threads must be positive.");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
|
||||||
|
}
|
||||||
|
|
||||||
|
int main_BaseRecalibrator(int argc, char *argv[]) {
|
||||||
|
// init arg parser
|
||||||
|
argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none);
|
||||||
|
program.add_description(
|
||||||
|
"First phase of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n"
|
||||||
|
"user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)");
|
||||||
|
|
||||||
|
addCommonArgs(program, "The output recalibration table file to create.");
|
||||||
|
|
||||||
program.add_argument("--known-sites")
|
program.add_argument("--known-sites")
|
||||||
.help(
|
.help(
|
||||||
"One or more databases of known polymorphic sites used to exclude regions around known polymorphisms from "
|
"One or more databases of known polymorphic sites used to exclude regions around known polymorphisms from "
|
||||||
|
|
@ -66,6 +113,59 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
||||||
.metavar("<KnownSites>")
|
.metavar("<KnownSites>")
|
||||||
.nargs(argparse::nargs_pattern::any);
|
.nargs(argparse::nargs_pattern::any);
|
||||||
|
|
||||||
|
program.add_argument("--enable-baq")
|
||||||
|
.help("Whether to do BAQ correction.")
|
||||||
|
.default_value(false)
|
||||||
|
.implicit_value(true)
|
||||||
|
.hidden();
|
||||||
|
|
||||||
|
program.add_argument("--compute-indel-bqsr-tables")
|
||||||
|
.help("Whether to compute indel BQSR tables.")
|
||||||
|
.default_value(false)
|
||||||
|
.implicit_value(true)
|
||||||
|
.hidden();
|
||||||
|
|
||||||
|
// add help and version args
|
||||||
|
addHelpVersion(program);
|
||||||
|
|
||||||
|
try {
|
||||||
|
program.parse_args(argc, argv);
|
||||||
|
parseCommonArgs(program);
|
||||||
|
|
||||||
|
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
|
||||||
|
nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
|
||||||
|
nsgv::gBqsrArg.computeIndelBQSRTables = program.get<bool>("--compute-indel-bqsr-tables");
|
||||||
|
// spdlog::info("known sites vcf files:");
|
||||||
|
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
||||||
|
// spdlog::info(" {}", ks);
|
||||||
|
// }
|
||||||
|
} catch (const std::exception &err) {
|
||||||
|
spdlog::error(err.what());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("fast base recalibration phase-1 start");
|
||||||
|
BaseRecalibrator();
|
||||||
|
spdlog::info("fast base recalibration phase-1 end");
|
||||||
|
|
||||||
|
DisplayProfilingBQSR(nsgv::gBqsrArg.NUM_THREADS);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main_ApplyBQSR(int argc, char* argv[]) {
|
||||||
|
// init arg parser
|
||||||
|
argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none);
|
||||||
|
program.add_description(
|
||||||
|
"Second phase of the Base Quality Score Recalibration (BQSR) -- Apply the bqsr table to generate recalibrated bam file.)");
|
||||||
|
|
||||||
|
addCommonArgs(program, "The output bam/sam file to create.");
|
||||||
|
|
||||||
|
program.add_argument("--bqsr")
|
||||||
|
.help("Input recalibration table for BQSR.")
|
||||||
|
.metavar("<BQSR>")
|
||||||
|
.required();
|
||||||
|
|
||||||
program.add_argument("--create-index")
|
program.add_argument("--create-index")
|
||||||
.help("Whether to create an index when writing coordinate sorted BAM output.")
|
.help("Whether to create an index when writing coordinate sorted BAM output.")
|
||||||
.default_value(false)
|
.default_value(false)
|
||||||
|
|
@ -78,83 +178,37 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
||||||
.nargs(1)
|
.nargs(1)
|
||||||
.metavar("<IndexFormat>");
|
.metavar("<IndexFormat>");
|
||||||
|
|
||||||
program.add_argument("--enable-baq")
|
addHelpVersion(program);
|
||||||
.help("Whether to do BAQ correction.")
|
|
||||||
.default_value(false)
|
|
||||||
.implicit_value(true)
|
|
||||||
.hidden();
|
|
||||||
|
|
||||||
// add help and version args
|
|
||||||
program.add_argument("-h", "--help")
|
|
||||||
.action([&](const auto & /*unused*/) {
|
|
||||||
std::cout << program.help().str();
|
|
||||||
std::exit(0);
|
|
||||||
})
|
|
||||||
.default_value(false)
|
|
||||||
.help("shows help message and exits")
|
|
||||||
.implicit_value(true)
|
|
||||||
.nargs(0);
|
|
||||||
|
|
||||||
program.add_argument("-v", "--version")
|
|
||||||
.action([&](const auto & /*unused*/) {
|
|
||||||
std::cout << FASTBQSR_VERSION << std::endl;
|
|
||||||
std::exit(0);
|
|
||||||
})
|
|
||||||
.default_value(false)
|
|
||||||
.help("prints version information and exits")
|
|
||||||
.implicit_value(true)
|
|
||||||
.nargs(0);
|
|
||||||
|
|
||||||
// std::cout << program << std::endl;
|
|
||||||
|
|
||||||
nsgv::gBqsrArg.START_TIME = get_current_time_str();
|
|
||||||
nsgv::gBqsrArg.CLI_STR = argv[0];
|
|
||||||
for (int i = 1; i < argc; ++i) {
|
|
||||||
nsgv::gBqsrArg.CLI_STR += " " + std::string(argv[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
program.parse_args(argc, argv);
|
program.parse_args(argc, argv);
|
||||||
nsgv::gBqsrArg.INPUT_FILE = program.get("--input");
|
parseCommonArgs(program);
|
||||||
nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
|
nsgv::gBqsrArg.BQSR_FILE = program.get("--bqsr");
|
||||||
nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
|
|
||||||
if (nsgv::gBqsrArg.NUM_THREADS < 1) {
|
|
||||||
spdlog::error("num-threads must be positive.");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
|
nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
|
||||||
nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
|
string idxFormat = program.get("--index-format");
|
||||||
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
|
std::transform(idxFormat.begin(), idxFormat.end(), idxFormat.begin(), ::toupper);
|
||||||
nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
|
nsgv::gBqsrArg.INDEX_FORMAT = idxFormat == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI;
|
||||||
// spdlog::info("known sites vcf files:");
|
|
||||||
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
|
||||||
// spdlog::info(" {}", ks);
|
|
||||||
// }
|
|
||||||
|
|
||||||
nsgv::gBqsrArg.INDEX_FORMAT =
|
} catch (const std::exception& err) {
|
||||||
program.get("--index-format") == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI;
|
|
||||||
|
|
||||||
} catch (const std::exception &err) {
|
|
||||||
spdlog::error(err.what());
|
spdlog::error(err.what());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
spdlog::info("fast base recalibration phase-1 start");
|
spdlog::info("fast base recalibration phase-2 start");
|
||||||
BaseRecalibrator();
|
ApplyBQSR();
|
||||||
spdlog::info("fast base recalibration phase-1 end");
|
spdlog::info("fast base recalibration phase-2 end");
|
||||||
|
|
||||||
DisplayProfiling(nsgv::gBqsrArg.NUM_THREADS);
|
// DisplayProfilingApplyBQSR(nsgv::gBqsrArg.NUM_THREADS);
|
||||||
|
DisplayProfilingApplyBQSR(1);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main_ApplyBQSR(int argc, char* argv[]) { return 0; }
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
// init log
|
// init log
|
||||||
spdlog::set_default_logger(spdlog::stderr_color_st("fastbqsr"));
|
spdlog::set_default_logger(spdlog::stderr_color_st("fastbqsr"));
|
||||||
spdlog::cfg::load_env_levels();
|
spdlog::cfg::load_env_levels();
|
||||||
|
|
||||||
|
addTimeCLI(argc, argv);
|
||||||
string bqsr_prog = argv[1];
|
string bqsr_prog = argv[1];
|
||||||
if (bqsr_prog == "BaseRecalibrator") {
|
if (bqsr_prog == "BaseRecalibrator") {
|
||||||
return main_BaseRecalibrator(argc - 1, argv + 1);
|
return main_BaseRecalibrator(argc - 1, argv + 1);
|
||||||
|
|
|
||||||
|
|
@ -45,8 +45,7 @@ static int CalcThreadTime(uint64_t *a, int len, double *max, double *min, double
|
||||||
fprintf(stderr, "time %-15s: avg %0.2lfs min %0.2lfs max %0.2lfs\n", #tpname, avgTime, minTime, maxTime); \
|
fprintf(stderr, "time %-15s: avg %0.2lfs min %0.2lfs max %0.2lfs\n", #tpname, avgTime, minTime, maxTime); \
|
||||||
}
|
}
|
||||||
|
|
||||||
int DisplayProfiling(int nthread) {
|
int DisplayProfilingBQSR(int nthread) {
|
||||||
|
|
||||||
#ifdef SHOW_PERF
|
#ifdef SHOW_PERF
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
PRINT_GP(GP_read);
|
PRINT_GP(GP_read);
|
||||||
|
|
@ -80,6 +79,27 @@ int DisplayProfiling(int nthread) {
|
||||||
}
|
}
|
||||||
PRINT_GP(GP_whole_process);
|
PRINT_GP(GP_whole_process);
|
||||||
|
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DisplayProfilingApplyBQSR(int nthread) {
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
PRINT_GP(GP_read);
|
||||||
|
if (nthread == 1) {
|
||||||
|
PRINT_GP(GP_covariate);
|
||||||
|
} else {
|
||||||
|
PRINT_TP(TP_covariate);
|
||||||
|
PRINT_TP(TP_readgroup);
|
||||||
|
PRINT_TP(TP_qualityscore);
|
||||||
|
PRINT_TP(TP_context);
|
||||||
|
PRINT_TP(TP_cycle);
|
||||||
|
}
|
||||||
|
PRINT_GP(GP_whole_process);
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -79,7 +79,8 @@ enum {
|
||||||
|
|
||||||
uint64_t RealtimeMsec(void);
|
uint64_t RealtimeMsec(void);
|
||||||
|
|
||||||
int DisplayProfiling(int);
|
int DisplayProfilingBQSR(int);
|
||||||
|
int DisplayProfilingApplyBQSR(int);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -22,24 +22,46 @@ using std::string;
|
||||||
using std::stringstream;
|
using std::stringstream;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
||||||
#define REPORT_HEADER_VERSION "#:GATKReport.v1.1:5"
|
#define REPORT_HEADER_VERSION "#:GATKReport.v1.1"
|
||||||
|
#define REPORT_HEADER_MINOR_VERSION "5"
|
||||||
|
|
||||||
// Conversion helpers between report text fields and native values.
struct ReportUtil {
    // bool -> "true" / "false".
    static inline string ToString(const bool val) {
        if (val) {
            return "true";
        }
        return "false";
    }

    // char -> one-character string.
    static inline string ToString(const char val) { return string(1, val); }

    // string -> itself, with the empty string written as "null".
    static inline string ToString(const string& val) { return val.empty() ? "null" : val; }

    // double -> fixed-point text with `precise` digits after the decimal point.
    static inline string ToString(const double val, int precise) {
        stringstream out;
        out << std::fixed << std::setprecision(precise) << val;
        return out.str();
    }

    // Any other type is formatted with std::to_string.
    template <typename T>
    static inline string ToString(const T val) {
        return std::to_string(val);
    }

    // Text -> bool: "false", "False", "FALSE" and the empty string are false;
    // every other text is true.
    static inline bool ToBool(const string& val) {
        const bool isFalse = val.empty() || val == "false" || val == "False" || val == "FALSE";
        return !isFalse;
    }

    // Inverse of ToString(const string&): "null" becomes the empty string.
    static inline string ParseString(const string& val) { return val == "null" ? string() : val; }

    // Numeric parsers; thin wrappers over the std::sto* family.
    static inline int ToInt(const string& val) { return std::stoi(val); }

    static inline double ToDouble(const string& val) { return std::stod(val); }

    static inline int64_t ToInt64(const string& val) { return std::stoll(val); }

    static inline uint64_t ToUint64(const string& val) { return std::stoull(val); }
};
|
||||||
|
|
||||||
struct ReportTable {
|
struct ReportTable {
|
||||||
|
|
|
||||||
|
|
@ -54,7 +54,8 @@ struct SamData {
|
||||||
StableArray<uint8_t> base_quals; // 对应的质量分数
|
StableArray<uint8_t> base_quals; // 对应的质量分数
|
||||||
StableArray<uint8_t> ins_quals; // insert质量分数, BI (大部分应该都没有)
|
StableArray<uint8_t> ins_quals; // insert质量分数, BI (大部分应该都没有)
|
||||||
StableArray<uint8_t> del_quals; // delete质量分数, BD (大部分应该都没有)
|
StableArray<uint8_t> del_quals; // delete质量分数, BD (大部分应该都没有)
|
||||||
StableArray<Cigar> cigars;
|
StableArray<Cigar> cigars; // clip 之后的cigar
|
||||||
|
StableArray<uint8_t> recaled_quals; // 保存校正之后的质量分数
|
||||||
|
|
||||||
// 用作临时buffer
|
// 用作临时buffer
|
||||||
StableArray<char> strandedClippedBases; // for context covariate
|
StableArray<char> strandedClippedBases; // for context covariate
|
||||||
|
|
@ -87,6 +88,20 @@ struct SamData {
|
||||||
end_pos = 0;
|
end_pos = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 解析apply bqsr里用到的信息
|
||||||
|
void parseForApplyBQSR(BamWrap *_bw) {
|
||||||
|
bw = _bw;
|
||||||
|
read_len = bw->b->core.l_qseq;
|
||||||
|
bases.resize(read_len);
|
||||||
|
base_quals.resize(read_len);
|
||||||
|
uint8_t* seq = bam_get_seq(bw->b);
|
||||||
|
uint8_t* quals = bam_get_qual(bw->b);
|
||||||
|
for (int i = 0; i < read_len; ++i) {
|
||||||
|
bases[i] = BaseUtils::cBaseToChar[bam_seqi(seq, i)];
|
||||||
|
base_quals[i] = quals[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 初步解析bam
|
// 初步解析bam
|
||||||
void parseBasic(BamWrap *_bw) {
|
void parseBasic(BamWrap *_bw) {
|
||||||
bw = _bw;
|
bw = _bw;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue