实现了apply bqsr的单线程版本,结果还有点错误,继续调试

This commit is contained in:
zzh 2026-01-03 22:10:46 +08:00
parent 985875ebac
commit 94e06338cd
19 changed files with 1147 additions and 209 deletions

View File

@ -6,5 +6,5 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
# set(CMAKE_BUILD_TYPE Debug)
# set(CMAKE_BUILD_TYPE Release)
add_definitions(-DSHOW_PERF)
#add_definitions(-DSHOW_PERF=1)
# add_definitions(-DSHOW_PERF=1)
add_subdirectory(src)

View File

@ -2,4 +2,6 @@
对GATK bqsr部分进行并行优化
对应的GATK版本The Genome Analysis Toolkit (GATK) v4.6.2.0-17-g2a1f41b-SNAPSHOT
Implement the same functions as GATK for BaseRecalibrator and ApplyBQSR

View File

@ -0,0 +1,336 @@
// 一. 读取并解析recalibrate tables
// 二. 循环处理每一个read
// 1. 计算read的协变量covs
// 2. 根据read的协变量信息从recal tables获取对应的数据
// 3. 根据recal tables数据对read的每个碱基分别计算新的质量分数
// 4. 将计算后的质量分数赋值给read
#include <header.h> // in htslib
#include <htslib/sam.h>
#include <htslib/thread_pool.h>
#include "aux_arg.h"
#include "common_data.h"
#include "util/debug.h"
#include "util/profiling.h"
#include "recal_utils.h"
#include "recal_funcs.h"
#include "read_utils.h"
using std::vector;
namespace nsgv {
// Global variables for ApplyBQSR
samFile* gOutBamFp; // output file, SAM or BAM format
sam_hdr_t* gOutBamHeader; // header of the output file
RecalTables gRecalTables; // a single global copy of the recalibration tables is enough
vector<uint8_t> gQuantizedQuals; // third column of the quantized-info table read from the recalibration report
StableArray<bam1_t*> gRecalBams[2]; // BAM records whose quality scores have been recalibrated (double buffer)
}; // namespace nsgv
// Serial (single-threaded) ApplyBQSR.
//
// Reads the input BAM in rounds; for every read it
//   1. copies the record into the global recalibrated-record buffer,
//   2. optionally restores / stores the original qualities (OQ tag),
//   3. computes the per-base covariates,
//   4. looks up the matching recalibration data and rewrites each base quality,
//   5. writes the record to the output file.
//
// @param aux  per-thread auxiliary state (covariate matrix, SamData scratch, recal tables, header)
// @return 0 when the whole input was processed; unrecoverable errors terminate the process via exit(1)
int SerialApplyBQSR(AuxVar &aux) {
    BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
    inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::applyBqsrReadFilterOut);
    int64_t readNumSum = 0;
    int round = 0;
    PerReadCovariateMatrix& readCovariates = aux.readCovariates;
    // Recalibration tables (already parsed during global initialisation)
    auto& recalTables = *aux.bqsrTable;
    // Global array holding the recalibrated copies of the records
    auto& recalBams = nsgv::gRecalBams[0];
    auto& sd = aux.sd;
    const auto substitutionIdx = EventType::BASE_SUBSTITUTION.index;

    while (true) {
        ++round;
        // I. Read a batch of BAM records
        size_t readNum = 0;
        PROF_START(GP_read);
        if (inBamBuf.ReadStat() >= 0)
            readNum = inBamBuf.ReadBam();
        PROF_GP_END(GP_read);
        if (readNum < 1) {
            break;
        }
        auto bams = inBamBuf.GetBamArr();
        spdlog::info("{} reads processed in {} round", readNum, round);

        // Grow the pool of output records on demand; existing records are reused across rounds.
        if (recalBams.size() < bams.size()) {
            const size_t oldSize = recalBams.size();
            recalBams.resize(bams.size());
            for (size_t k = oldSize; k < recalBams.size(); ++k) {
                recalBams[k] = bam_init1();
                if (recalBams[k] == nullptr) {
                    spdlog::error("Allocate bam record failed");
                    exit(1);
                }
            }
        }

        // II. Process every read of the batch
        const int bamCount = static_cast<int>(bams.size());
        for (int i = 0; i < bamCount; ++i) {
            if (bam_copy1(recalBams[i], bams[i]->b) == NULL) {
                spdlog::error("Copy bam error");
                exit(1);
            }
            BamWrap* bw = bams[i];
            // From here on bw->b is the private copy, so it can be modified freely.
            // NOTE(review): the record bw->b pointed to before is no longer referenced here; confirm that
            // inBamBuf.ClearAll() neither leaks it nor frees recalBams[i].
            bw->b = recalBams[i];

            // 1. Optionally replace the current qualities with the original ones (OQ tag)
            if (nsgv::gBqsrArg.useOriginalBaseQualities)
                ReadUtils::resetOriginalBaseQualities(bw->b);
            // 2. Optionally store the current qualities in the OQ tag
            if (nsgv::gBqsrArg.emitOriginalQuals)
                ReadUtils::setOriginalBaseQualsIfNoOQ(bw->b);

            // Parse the read only after steps 1-2: both may rewrite the record (qualities, aux data),
            // and the covariates / pre-update qualities must reflect the record as it is now.
            sd.init();
            sd.parseForApplyBQSR(bw);
            sd.rid = i + aux.processedReads;

            // 3. Compute the covariates of the read
            PROF_START(GP_covariate);
            CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0);
            PROF_GP_END(GP_covariate);

            // Clear indel qualities
            ReadUtils::removeAttribute(bw->b, "BI");
            ReadUtils::removeAttribute(bw->b, "BD");

            // 4. Check that the read group of the read is present in the recalibration table
            const char* readGroupId = ReadUtils::getReadGroupId(bw->b);
            auto& covariatesForRead = readCovariates[substitutionIdx];
            uint8_t* recalibratedQuals = bam_get_qual(bw->b);
            auto& preUpdateQuals = sd.base_quals;
            int rgKey = -1;
            if (readGroupId != nullptr) {  // a read without RG tag is handled like an unknown read group
                const auto rgIt = ReadGroupCovariate::RgToId.find(std::string(readGroupId));
                if (rgIt != ReadGroupCovariate::RgToId.end())
                    rgKey = rgIt->second;
            }

            if (rgKey == -1) {
                if (!nsgv::gBqsrArg.allowMissingReadGroups) {
                    spdlog::error(
                        "Read group {} not found in the recalibration table. Use \"--allow-missing-read-group\" command line argument to ignore this "
                        "error.",
                        readGroupId != nullptr ? readGroupId : "(none)");
                    exit(1);
                }
                // Given the way the recalibration code is implemented below, we cannot recalibrate a read with a
                // read group that's not in the recal table: only quantize the existing qualities and skip step 5.
                for (int k = 0; k < sd.read_len; k++) {
                    // recalibratedQuals[k] = staticQuantizedMapping != null ? staticQuantizedMapping[preUpdateQuals[k]] : quantizedQuals.get(preUpdateQuals[k]);
                    recalibratedQuals[k] = nsgv::gQuantizedQuals[preUpdateQuals[k]];
                }
            } else {
                // 5. Compute the new quality of every base from the recalibration tables
                auto& readGroupDatum = recalTables.readGroupTable(substitutionIdx, rgKey);
                // Note: this loop is under very heavy use in applyBQSR. Keep it slim.
                for (int offset = 0; offset < sd.read_len; offset++) {  // recalibrate all bases in the read
                    // only recalibrate usable qualities (default: >= 6) (the original quality will come from the instrument -- reported quality)
                    if (recalibratedQuals[offset] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN) {
                        continue;
                    }
                    auto& covs = covariatesForRead[offset];
                    // Fetch the recalibration data matching the covariates of this base
                    auto& qualityScoreDatum = recalTables.qualityScoreTable(substitutionIdx, rgKey, covs.baseQuality);
                    auto& contextDatum = recalTables.contextTable(substitutionIdx, rgKey, covs.baseQuality, covs.context);
                    auto& cycleDatum = recalTables.cycleTable(substitutionIdx, rgKey, covs.baseQuality, covs.cycle);
                    // Compute the recalibrated quality score
                    double priorQualityScore =
                        nsgv::gBqsrArg.globalQScorePrior > 0.0 ? nsgv::gBqsrArg.globalQScorePrior : readGroupDatum.getReportedQuality();
                    double rawRecalibratedQualityScore =
                        RecalFuncs::hierarchicalBayesianQualityEstimate(priorQualityScore, readGroupDatum, qualityScoreDatum, contextDatum, cycleDatum);
                    uint8_t qualIdx = RecalFuncs::getBoundedIntegerQual(rawRecalibratedQualityScore);
                    uint8_t quantizedQualityScore = qualIdx;  // nsgv::gQuantizedQuals.at(qualIdx);
                    // TODO: as written the code quantizes *twice* if the static binning is enabled (first time to the dynamic bin). It should be
                    // quantized once.
                    // recalibratedQuals[offset] = staticQuantizedMapping == null ? quantizedQualityScore : staticQuantizedMapping[quantizedQualityScore];
                    recalibratedQuals[offset] = quantizedQualityScore;
                }
            }

            if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) {
                spdlog::error("failed writing sam record to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE.c_str());
                sam_close(nsgv::gInBamFp);
                sam_close(nsgv::gOutBamFp);
                exit(1);
            }
        }
        readNumSum += readNum;
        AuxVar::processedReads += readNum;
        inBamBuf.ClearAll();  // release the input buffer for the next round
    }
    spdlog::info("read count: {}", readNumSum);
    return 0;
}
// Parallel ApplyBQSR. Not implemented yet: this stub ignores auxArr and always returns 0.
int ParallelApplyBQSR(vector<AuxVar> &auxArr) {
return 0;
}
// Initialise all global state required before any read is processed:
// input/output BAM handles with their htslib thread pools, the output header and optional index,
// covariate helpers, the per-thread AuxVar entries, static lookup tables and the recalibration tables.
// Any unrecoverable failure is logged and terminates the process with exit(1).
static void globalInit() {
    open_debug_files();

    /* Input bam */
    nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr);
    if (!nsgv::gInBamFp) {
        spdlog::error("[{}] load sam/bam file failed.\n", __func__);
        exit(1);
    }
    hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
    nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp);  // header
    if (!nsgv::gInBamHeader) {
        // The header is dereferenced below, so a failed read must stop here.
        spdlog::error("[{}] failed to read sam header", __LINE__);
        sam_close(nsgv::gInBamFp);
        exit(1);
    }

    /* Read the bam with a dedicated thread pool */
    htsThreadPool htsPoolRead = {nullptr, 0};
    int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD);
    htsPoolRead.pool = hts_tpool_init(readThreadNum);
    if (!htsPoolRead.pool) {
        spdlog::error("[{}] failed to set up thread pool", __LINE__);
        sam_close(nsgv::gInBamFp);
        exit(1);
    }
    hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
    if (!nsgv::gInBamHeader->hrecs) {
        if (sam_hdr_fill_hrecs(nsgv::gInBamHeader) != 0) {
            spdlog::error("[{}] failed to read sam header", __LINE__);
            sam_close(nsgv::gInBamFp);
            exit(1);
        }
    }

    /* Output bam, written with its own thread pool */
    char modeout[12] = "wb";
    sam_open_mode(modeout + 1, nsgv::gBqsrArg.OUTPUT_FILE.c_str(), NULL);
    nsgv::gOutBamFp = sam_open(nsgv::gBqsrArg.OUTPUT_FILE.c_str(), modeout);
    if (!nsgv::gOutBamFp) {
        spdlog::error("[{}] create output sam/bam file failed.\n", __func__);
        exit(1);
    }
    nsgv::gOutBamHeader = sam_hdr_dup(nsgv::gInBamHeader);
    // Record this program in the output header
    sam_hdr_add_line(nsgv::gOutBamHeader, "PG", "ID", PROGRAM_NAME, "VN", FASTBQSR_VERSION, "CL", nsgv::gBqsrArg.CLI_STR.c_str(), NULL);
    // Reading and writing use separate pools with different thread counts
    htsThreadPool htsPoolWrite = {nullptr, 0};
    int writeThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_WRITE_MAX_THREAD);
    htsPoolWrite.pool = hts_tpool_init(writeThreadNum);
    if (!htsPoolWrite.pool) {
        spdlog::error("[{}] failed to set up thread pool", __LINE__);
        sam_close(nsgv::gOutBamFp);
        sam_close(nsgv::gInBamFp);
        exit(1);
    }
    hts_set_opt(nsgv::gOutBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
    hts_set_opt(nsgv::gOutBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
    if (sam_hdr_write(nsgv::gOutBamFp, nsgv::gOutBamHeader) != 0) {
        spdlog::error("failed writing header to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE);
        sam_close(nsgv::gOutBamFp);
        sam_close(nsgv::gInBamFp);
        exit(1);
    }

    /* Output index: none for SAM output or when index creation is disabled */
    nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".bai";  // min_shift = 0 selects the BAI format
    if ("sam" == Utils::getFileExtension(nsgv::gBqsrArg.OUTPUT_FILE) || !nsgv::gBqsrArg.CREATE_INDEX) {
        nsgv::gBqsrArg.INDEX_FILE = "";
    }
    if (!nsgv::gBqsrArg.INDEX_FILE.empty()) {
        int index_min_shift = 0;
        if (nsgv::gBqsrArg.INDEX_FORMAT == nsbqsr::IndexFormat::CSI) {
            nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".csi";
            index_min_shift = 14;
        }
        // Pass the selected min_shift so that a ".csi" file really gets a CSI index.
        if (sam_idx_init(nsgv::gOutBamFp, nsgv::gOutBamHeader, index_min_shift, nsgv::gBqsrArg.INDEX_FILE.c_str()) < 0) {
            spdlog::error("failed to open index \"{}\" for writing", nsgv::gBqsrArg.INDEX_FILE);
            sam_close(nsgv::gOutBamFp);
            sam_close(nsgv::gInBamFp);
            exit(1);
        }
    }

    // 1. Covariate related initialisation
    ContextCovariate::InitContextCovariate(nsgv::gBqsrArg);
    CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg);

    // Initialise the per-thread AuxVar entries. This has to follow the covariate initialisation
    // because MaximumKeyValue is needed.
    nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS);
    for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) {
        nsgv::gAuxVars[i].header = nsgv::gInBamHeader;
        nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str());
        if (nsgv::gAuxVars[i].faidx == 0)
            error("[%s] fail to load the fasta index.\n", __func__);
        // Every thread shares the single global set of recalibration tables (read-only while applying)
        nsgv::gAuxVars[i].bqsrTable = &nsgv::gRecalTables;
        CovariateUtils::InitPerReadCovMat(nsgv::gAuxVars[i].readCovariates);
    }

    // 0. Static lookup tables
    RecalDatum::StaticInit();
    QualityUtils::StaticInit();
    MathUtils::StaticInit();
    BaseUtils::StaticInit();
    // Event types that have to be computed
    RecalUtils::initEventTypes(nsgv::gBqsrArg.computeIndelBQSRTables);

    // 2. Read and parse the recalibration tables. The read-group mapping is expected to come from
    //    the recalibration table rather than from the BAM header.
    RecalUtils::readRecalTables(nsgv::gBqsrArg.BQSR_FILE, nsgv::gBqsrArg, nsgv::gQuantizedQuals, nsgv::gRecalTables);
}
// 全局资源释放
static void globalDestroy() {
close_debug_files();
if (!nsgv::gBqsrArg.INDEX_FILE.empty() && sam_idx_save(nsgv::gOutBamFp) < 0) {
spdlog::error("writing index failed");
sam_close(nsgv::gOutBamFp);
sam_close(nsgv::gInBamFp);
exit(1);
}
/* 关闭文件,收尾清理 */
sam_close(nsgv::gOutBamFp);
sam_close(nsgv::gInBamFp);
}
// Entry point of the ApplyBQSR stage: initialise the global state, recalibrate every read with the
// serial implementation, release the global state and return the status of the processing step.
int ApplyBQSR() {
    PROF_START(GP_whole_process);
    globalInit();
    // Only the serial path is wired up for now.
    // if (nsgv::gBqsrArg.NUM_THREADS == 1)
    const int status = SerialApplyBQSR(nsgv::gAuxVars[0]);
    // else
    //     status = ParallelApplyBQSR(nsgv::gAuxVars);
    globalDestroy();
    PROF_GP_END(GP_whole_process);
    return status;
}

View File

@ -43,11 +43,14 @@ struct AuxVar {
vector<VCFParser> vcfArr; // 从vcf中获取已知位点
PerReadCovariateMatrix readCovariates; // 每个read对应的协变量矩阵
RecalTables recalTables; // 每个线程对应一个recal tables
RecalTables recalTables; // 每个线程对应一个recal tables计算table的时候需要每个线程都有一个输出bam的时候不需要因为只读取不写入
SamData sd;
StableArray<int> isSNP, isIns, isDel; // 该位置是否是SNP, indel位置0不是1是
StableArray<uint8_t> baqArray;
StableArray<double> snpErrors, insErrors, delErrors;
StableArray<uint8_t> skips; // 该位置是否是已知位点
// only for apply bqsr
RecalTables* bqsrTable; // 保留一份就够
};

View File

@ -11,6 +11,8 @@ Date : 2025/10/10
#include <string>
#include <vector>
#include "qual_utils.h"
using std::string;
using std::vector;
@ -36,19 +38,19 @@ struct BQSRArg {
/* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
bool CREATE_INDEX = true;
string INDEX_FILE;
nsbqsr::IndexFormat INDEX_FORMAT = nsbqsr::IndexFormat::BAI;
/* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/
bool ADD_PG_TAG_TO_READS = true;
//
// 命令行字符串
string CLI_STR;
//
// 开始运行时间
string START_TIME;
string PROGRAM_RECORD_ID = "FastBQSR";
// reference file
string REFERENCE_FILE;
@ -137,14 +139,6 @@ struct BQSRArg {
*/
double BAQGOP = 40;
/**
* This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in
* the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since base callers use these
* values to indicate random or bad bases. For example, Illumina writes Q2 bases when the machine has really gone wrong.
* This would be fine in and of itself, but when you select a subset of these reads based on their ability to align to the
* reference and their dinucleotide effect, your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream.
*/
int PRESERVE_QSCORES_LESS_THAN = 6;
/**
* enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了用不到了
@ -152,7 +146,7 @@ struct BQSRArg {
bool enableBAQ = false;
/**
* compute-indel-bqsr-tables, compute indel BQSR tables"
* compute-indel-bqsr-tables, compute indel BQSR tables" hidden
*/
bool computeIndelBQSRTables = false;
@ -162,16 +156,93 @@ struct BQSRArg {
//
// --------------------------------------------------------------------------------------------------------------
/**
* This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which
* are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ
* tag is present for a read, the standard qual score will be used.
*/
bool useOriginalBaseQualities = false;
/**
* If reads are missing some or all base quality scores, this value will be used for all base quality scores.
* By default this is set to -1 to disable default base quality assignment.
*/
int8_t defaultBaseQualities = -1;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// args for apply bqsr
string BQSR_FILE;
/**
* Turns on the base quantization module. It requires a recalibration report.
*
* A value of 0 here means "do not quantize".
* Any value greater than zero will be used to recalculate the quantization using that many levels.
* Negative values mean that we should quantize using the recalibration report's quantization level.
*/
// @Argument(fullName = "quantize-quals", doc = "Quantize quality scores to a given number of levels", optional = true)
int quantizationLevels = 0;
/**
* Static quantized quals are entirely separate from the quantize_qual option which uses dynamic binning.
* The two types of binning should not be used together.
*
* For example, the Warp germline pipeline uses the static bins { 10, 20, 30, 40 }
*/
//@Advanced @Argument(fullName = static-quantized-quals,
// doc = "Use static quantized quality scores to a given number of levels (with -" +
// StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME + ")", optional = true, mutex = "quantize-quals")
vector<int> staticQuantizationQuals;
/**
* Round down quantized only works with the static_quantized_quals option, and should not be used with
* the dynamic binning option provided by quantize_quals. When roundDown = false, rounding is done in
* probability space to the nearest bin. When roundDown = true, the value is rounded to the nearest bin
* that is smaller than the current bin.
*/
// @Advanced @Argument(fullName = "round-down-quantized", doc = "Round quals down to nearest quantized qual", optional = true,
// mutex = "quantize-quals") public
bool roundDown = false;
/**
* The tool is capable of writing out the original quality scores of each read in the recalibrated output file
* under the "OQ" tag. By default, this behavior is disabled because emitting original qualities results in a
* significant increase of the file size. Use this flag to turn on emission of original qualities.
*/
//@Argument(fullName = "emit-original-quals", doc = "Emit original base qualities under the OQ tag",
// optional = true)
bool emitOriginalQuals = false;
/**
* If specified, the value of this argument will be used as a flat prior for all mismatching quality scores instead
* of the reported quality score (assigned by the sequencer).
*/
// @Argument(fullName = "global-qscore-prior", doc = "Global Qscore Bayesian prior to use for BQSR",
// optional = true)
double globalQScorePrior = -1.0;
/**
* If set to true, do not throw an error upon encountering a read with a read group that's not in the recalibration table.
* Instead, simply set the quantized original base qualities as the recalibrated base qualities.
*/
// @Argument(fullName = allow-missing-read-group, doc = "Do not throw an error when encountering a read group not in the recal table",
// optional = true)
bool allowMissingReadGroups = false;
/**
* This flag tells GATK not to modify quality scores less than this value. Instead they will be written out
* unmodified in the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since
* base callers use these values to indicate random or bad bases. For example, Illumina writes Q2 bases when the
* machine has really gone wrong. This would be fine in and of itself, but when you select a subset of these reads
* based on their ability to align to the reference and their dinucleotide effect, your Q2 bin can be elevated to
* Q8 or Q10, leading to issues downstream.
*/
// @Argument(fullName = "preserve-qscores-less-than", doc = "Don't recalibrate bases with quality scores less than this threshold", optional = true,
// minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE)
int PRESERVE_QSCORES_LESS_THAN = QualityUtils::MIN_USABLE_Q_SCORE;
/**
* This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which
* are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ
* tag is present for a read, the standard quality score will be used.
*/
// @Argument(fullName = use-original-qualities, shortName = "OQ", doc = "Use the base quality scores from the OQ tag",
// optional = true)
bool useOriginalBaseQualities = false;
};

View File

@ -1,13 +1,12 @@
/*
Description:
bambambam
Description: bqsr
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/10/23
Date : 2025/11/23
*/
#include <header.h>
#include <header.h> // in htslib
#include <htslib/faidx.h>
#include <htslib/kstring.h>
#include <htslib/sam.h>
@ -44,11 +43,10 @@ Date : 2023/10/23
#include "util/stable_array.h"
#include "util/utils.h"
#include "util/vcf_parser.h"
#include "common_data.h"
using std::deque;
#define BAM_BLOCK_SIZE 16L * 1024 * 1024 // 16M
namespace nsgv {
// 全局变量 for bqsr
@ -58,29 +56,6 @@ sam_hdr_t* gInBamHeader; // input BAM header
vector<AuxVar> gAuxVars; // auxiliary variables保存一些文件数据等每个线程对应一个
}; // namespace nsgv
// Read filter for the BQSR (table building) pass.
// Returns true when the record must be skipped: mapping quality 0 or 255 (not available),
// unmapped / unplaced, secondary alignment, duplicate, or failing vendor quality checks.
bool bqsrReadFilterOut(const bam1_t *b) {
    const auto& core = b->core;
    const bool unusableMapq = (core.qual == 0) || (core.qual == 255);
    const bool unplaced = (core.tid == -1) || (core.pos == -1);
    // unmapped | secondary alignment | duplicate | not passing quality controls
    constexpr int kRejectedFlags = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL;
    return unusableMapq || unplaced || (core.flag & kRejectedFlags) != 0;
}
// 数据总结
void collapseQualityScoreTableToReadGroupTable(Array2D<RecalDatum> &byReadGroupTable, Array3D<RecalDatum> &byQualTable) {
// 遍历quality table
@ -137,7 +112,7 @@ static void printRecalTables(const RecalTables& rt) {
// 串行bqsr
int SerialBQSR(AuxVar &aux) {
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut);
int64_t readNumSum = 0;
int round = 0;
@ -227,7 +202,7 @@ int SerialBQSR(AuxVar &aux) {
// 8. 计算这条read对应的协变量
PROF_START(GP_covariate);
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, 0);
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0);
PROF_GP_END(GP_covariate);
// fprintf(gf[4], "%ld %d\n", sd.rid, sd.read_len);
@ -251,7 +226,7 @@ int SerialBQSR(AuxVar &aux) {
// 9. 计算这条read需要跳过的位置
PROF_START(GP_read_vcf);
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, 0);
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, RecalFuncs::MAX_SITES_INTERVAL, skips, 0);
for (int ii = 0; ii < sd.read_len; ++ii) {
skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) ||
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
@ -336,6 +311,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
int stopIdx = std::min((size_t)(idx + 1) * blockReadNums, bams.size());
#endif
// spdlog::info("tid {}, index {}, steal {}", tid, idx, steal);
// spdlog::info("interval span: {}", bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos());
int sitesStride = bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos();
// sitesStride = sitesStride >= RecalFuncs::MAX_SITES_INTERVAL ? sitesStride : RecalFuncs::MAX_SITES_INTERVAL;
sitesStride = RecalFuncs::MAX_SITES_INTERVAL;
aux.threadProcessedReads += stopIdx - startIdx;
for (int i = startIdx; i < stopIdx; ++i) {
// spdlog::info("Thread {} processing read idx: {}", tid, i);
@ -364,10 +343,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
if (!baqCalculated) continue;
PROF_START(TP_covariate);
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, thid);
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, thid);
PROF_TP_END(TP_covariate);
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, thid);
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, sitesStride, skips, thid);
for (int ii = 0; ii < sd.read_len; ++ii) {
skips[ii] =
skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
@ -390,7 +369,7 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
// 并行bqsr
int ParallelBQSR(vector<AuxVar>& auxArr) {
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut);
int64_t readNumSum = 0;
int round = 0;
@ -484,8 +463,8 @@ static void globalInit() {
/* 并行读取bam数据 */
htsThreadPool htsPoolRead = {NULL, 0}; //
int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, 8);
htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS);
int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD);
htsPoolRead.pool = hts_tpool_init(readThreadNum);
if (!htsPoolRead.pool ) {
spdlog::error("[{}] failed to set up thread pool", __LINE__);
sam_close(nsgv::gInBamFp);
@ -549,9 +528,10 @@ static void globalInit() {
// Release global resources: close the debug files and the input BAM handle.
static void globalDestroy() {
close_debug_files();
sam_close(nsgv::gInBamFp);
}
// entrance of mark BQSR
// entrance of BQSR phase-1
int BaseRecalibrator() {
int ret = 0;
@ -562,7 +542,6 @@ int BaseRecalibrator() {
else
ret = ParallelBQSR(nsgv::gAuxVars); // 并行处理数据生成recal table
globalDestroy();
sam_close(nsgv::gInBamFp);
PROF_GP_END(GP_whole_process);
return ret;

View File

@ -0,0 +1,30 @@
/*
Description:
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2026/01/01
*/
#pragma once
#include "bqsr_args.h"
#include <htslib/sam.h>
#include "aux_arg.h"
#include "fastbqsr_version.h"
// Parenthesised so the macro behaves as a single value inside larger expressions.
#define BAM_BLOCK_SIZE (16L * 1024 * 1024)  // 16M
#define BAM_READ_MAX_THREAD 8
#define BAM_WRITE_MAX_THREAD 16
#define PROGRAM_NAME "FastBQSR"
namespace nsgv {
// Global variables for BQSR
extern BQSRArg gBqsrArg; // bqsr arguments
extern samFile* gInBamFp; // input BAM file pointer
extern sam_hdr_t* gInBamHeader; // input BAM header
extern vector<AuxVar> gAuxVars; // auxiliary variables holding file data etc., one entry per thread
}; // namespace nsgv

View File

@ -250,19 +250,20 @@ void ContextCovariate::GetStrandedClippedBytes(SamData& sd, StableArray<char>& c
* @param end the end position in the array (exclusive)
* @return the key representing the dna sequence
*/
int ContextCovariate::KeyFromContext(const StableArray<char>& dna, const int start, const int end) {
int key = end - start;
int bitOffset = LENGTH_BITS;
for (int i = start; i < end; i++) {
const int baseIndex = baseIndexMap[dna[i] & 0xff];
if (baseIndex == -1) { // ignore non-ACGT bases
return -1;
}
key |= (baseIndex << bitOffset);
bitOffset += 2;
}
return key;
}
// template<typename Arr> // 好像是类的模板函数必须在头文件里实现
// int ContextCovariate::KeyFromContext(const Arr& dna, const int start, const int end) {
// int key = end - start;
// int bitOffset = LENGTH_BITS;
// for (int i = start; i < end; i++) {
// const int baseIndex = baseIndexMap[dna[i] & 0xff];
// if (baseIndex == -1) { // ignore non-ACGT bases
// return -1;
// }
// key |= (baseIndex << bitOffset);
// bitOffset += 2;
// }
// return key;
// }
/**
* For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
@ -334,6 +335,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const StableArray<char>& bas
void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
const int originalReadLength = sd.read_len;
const char* qname = bam_get_qname(sd.bw->b);
// store the original bases and then write Ns over low quality ones
auto &strandedClippedBases = sd.strandedClippedBases;
strandedClippedBases.copy(sd.bases);
@ -348,13 +350,13 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
// since the context covariate may not span the entire set of values in read covariates
// due to the clipping of the low quality bases
// 这段代码应该不会执行因为clip with N不会改变read长度
if (readLengthAfterClipping != originalReadLength) {
// don't bother zeroing out if we are going to overwrite the whole array
for (int i = 0; i < originalReadLength; i++) {
// this base has been clipped off, so zero out the covariate values here
CovariateUtils::SetContext(0, 0, 0, i, values);
}
}
// if (readLengthAfterClipping != originalReadLength) {
// // don't bother zeroing out if we are going to overwrite the whole array
// for (int i = 0; i < originalReadLength; i++) {
// // this base has been clipped off, so zero out the covariate values here
// CovariateUtils::SetContext(0, 0, 0, i, values);
// }
// }
const bool negativeStrand = sd.bw->GetReadNegativeStrandFlag();
// Note: duplicated the loop to avoid checking recordIndelValues on each iteration

View File

@ -27,7 +27,7 @@ using std::vector;
// 协变量的值, 4个协变量
struct CovariateValues {
int readGroup = 0;
int readGroup = 0; // 默认是read group 0
int baseQuality = 0;
int context = -1;
int cycle = -1;
@ -61,6 +61,16 @@ struct EventType {
static EventTypeValue BASE_INSERTION;
static EventTypeValue BASE_DELETION;
static vector<EventTypeValue> EVENTS;
// Maps the one-character representation of an event type to the index of that event type.
// Unknown representations map to 0.
static int GetIndexForEventRep(char eventType) {
    if (eventType == BASE_SUBSTITUTION.representation) {
        return BASE_SUBSTITUTION.index;
    }
    if (eventType == BASE_INSERTION.representation) {
        return BASE_INSERTION.index;  // was BASE_DELETION.index (copy-paste slip)
    }
    if (eventType == BASE_DELETION.representation) {
        return BASE_DELETION.index;
    }
    return 0;
}
};
// Read group协变量
@ -201,7 +211,22 @@ struct ContextCovariate {
// 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N
static void GetStrandedClippedBytes(SamData& ad, StableArray<char>& clippedBases, uint8_t lowQTail);
// Creates a int representation of a given dna string.
static int KeyFromContext(const StableArray<char>& dna, const int start, const int end);
template <typename Arr>
static int KeyFromContext(const Arr& dna, const int start, const int end) {
int key = end - start;
int bitOffset = LENGTH_BITS;
for (int i = start; i < end; i++) {
const int baseIndex = baseIndexMap[dna[i] & 0xff];
if (baseIndex == -1) { // ignore non-ACGT bases
return -1;
}
key |= (baseIndex << bitOffset);
bitOffset += 2;
}
return key;
}
// Convenience overload: encodes the whole string, i.e. the range [0, dna.size()).
static int KeyFromContext(const string& dna) { return KeyFromContext(dna, 0, dna.size()); }
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
static void GetReadContextAtEachPosition(const StableArray<char>& bases, const int contextSize, const int mask, StableArray<int>& keys);
@ -219,6 +244,7 @@ struct CycleCovariate {
static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }
static int KeyFromCycle(const int cycle) { return KeyFromCycle(cycle, MAXIMUM_CYCLE_VALUE); }
/**
* Encodes the cycle number as a key.
*/

View File

@ -0,0 +1,71 @@
/*
Description: apply bqsrsam
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2026/01/02
*/
#pragma once
#include <cstring>
#include <string>

#include <htslib/sam.h>
#include <spdlog/spdlog.h>
struct ReadUtils {
// sam/bam文件中的quality score要在原来的quality上加33应该是映射到字符区方便用字母表示
// htslib解析之后已经减掉了就是真实的质量分数
static constexpr int QUALITY_SCORE_ADD_IN_FILE = 33;
/**
* Resets the quality scores of the reads to the orginal (pre-BQSR) ones.
*/
static void resetOriginalBaseQualities(bam1_t *b) {
uint8_t* oq = bam_aux_get(b, "OQ");
char* oqVal = nullptr;
if (oq)
oqVal = bam_aux2Z(oq);
int crg = 0;
if (oqVal == nullptr) {
} else {
uint8_t* quals = bam_get_qual(b);
for (int i = 0; i < b->core.l_qseq; ++i) {
quals[i] = oqVal[i] - QUALITY_SCORE_ADD_IN_FILE; // 要减掉33
}
}
}
// get read group id
static const char* getReadGroupId(bam1_t *b) {
uint8_t* rgStr = bam_aux_get(b, "RG");
char* rgVal = nullptr;
if (rgStr)
rgVal = bam_aux2Z(rgStr);
return rgVal;
}
// 将当前quals保存在OQ tag中 如果当前read不存在OQ tag的情况下
static void setOriginalBaseQualsIfNoOQ(bam1_t *b) {
uint8_t* tagData = bam_aux_get(b, "OQ");
if (tagData)
return;
char* tagVal = nullptr;
const char* oldQual = (char*)bam_get_qual(b);
string qual(b->core.l_qseq + 1, '\0');
for (int i = 0; i < qual.size() - 1; ++i) {
qual[i] = oldQual[i] + QUALITY_SCORE_ADD_IN_FILE;
}
// bam_aux_append 最后一个字符必须是'\0'
if (bam_aux_append(b, "OQ", 'Z', qual.size(), (const uint8_t*)qual.c_str()) != 0) {
spdlog::error("Add OQ (original quality score) tag failed. ");
}
}
// 移除给定的tag
static void removeAttribute(bam1_t *b, const string &tag) {
uint8_t* tagData = bam_aux_get(b, "OQ");
if (!tagData)
return;
if (bam_aux_remove(b, tagData) == nullptr) {
spdlog::error("Remove tag {} failed. ", tag);
}
}
};

View File

@ -147,14 +147,18 @@ struct RecalDatum {
*/
inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); }
// numMismatches is stored scaled by MULTIPLIER; the getter divides and the setter multiplies,
// so callers always see and supply the unscaled value.
inline double getNumMismatches() const { return numMismatches / MULTIPLIER; }
inline void setNumMismatches(double val) { numMismatches = val * MULTIPLIER; }
// Plain accessors for the observation count.
inline uint64_t getNumObservations() const { return numObservations; }
inline void setNumObservations(uint64_t val) { numObservations = val; }
// Plain accessors for the reported quality.
inline double getReportedQuality() const { return reportedQuality; }
inline void setReportedQuality(double val) { reportedQuality = val; }
/**
* Computes the empirical quality of the datum, using the reported quality as the prior.
* @see #getEmpiricalQuality(double) below.
*/
double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); }
// Directly overwrites the stored empirical quality with the given value.
inline void setEmpiricalQuality(double val) { empiricalQuality = val; }
/**
* Computes the empirical base quality (roughly (num errors)/(num observations)) from the counts stored in this datum.

View File

@ -8,19 +8,55 @@
*/
#pragma once
#include "util/stable_array.h"
#include "util/sam_data.h"
#include <spdlog/spdlog.h>
#include <numeric>
#include "bqsr/aux_arg.h"
#include "util/bam_wrap.h"
#include "util/interval.h"
#include "util/vcf_parser.h"
#include "util/profiling.h"
#include "util/debug.h"
#include "util/interval.h"
#include "util/profiling.h"
#include "util/sam_data.h"
#include "util/stable_array.h"
#include "util/vcf_parser.h"
struct RecalFuncs {
static constexpr int MAX_SITES_INTERVAL = 100000;
//static constexpr int MAX_SITES_INTERVAL = 100000;
static constexpr int MAX_SITES_INTERVAL = 21500; // 经验值这个数读取vcf和计算的性能最好
static constexpr uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
/**
 * Read filter for the BaseRecalibrator pass.
 *
 * @param b BAM record to test.
 * @return true when the read must be skipped: mapping quality is 0 or 255, the read
 *         has no reference id or position (tid/pos == -1), or any of the unmapped,
 *         secondary, duplicate or QC-fail flags is set.
 */
static bool bqsrReadFilterOut(const bam1_t* b) {
    const auto& core = b->core;
    // Mapping quality 0, or 255 ("not available").
    if (core.qual == 0 || core.qual == 255)
        return true;
    // No placement on the reference.
    if (core.tid == -1 || core.pos == -1)
        return true;
    // Unmapped, secondary alignment, duplicate, or failing quality controls.
    const int rejectMask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL;
    return (core.flag & rejectMask) != 0;
}
/**
 * Read filter for the ApplyBQSR pass.
 *
 * The original note says GATK 4.6.2's wellformed filter appeared to filter nothing,
 * so only one narrow case is rejected here.
 *
 * NOTE(review): the test uses BAM_FMUNMAP (mate unmapped), while the original comment
 * labelled the case "unmapped" and a BAM_FUNMAP check was left commented out.
 * Confirm which flag is intended.
 *
 * @param b BAM record to test.
 * @return true when the read has no reference id or position (tid/pos == -1) and its
 *         BAM_FMUNMAP flag is not set.
 */
static bool applyBqsrReadFilterOut(const bam1_t* b) {
    const bool noPlacement = (b->core.tid == -1 || b->core.pos == -1);
    const bool mateUnmappedFlag = (b->core.flag & BAM_FMUNMAP) != 0;
    return noPlacement && !mateUnmappedFlag;
}
// 设置某个位置是indel
static inline void updateIndel(StableArray<int>& isIndel, int index) {
if (index >= 0 && index < isIndel.size()) {
@ -94,18 +130,16 @@ struct RecalFuncs {
}
// 获取一行字符串
static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* line) {
line->clear();
if (*cur >= total)
static void get_line_from_buf(char* buf, int64_t total, int64_t* cur) {
if (*cur >= total) {
(*cur)++;
return;
char b;
while (*cur < total && (b = buf[(*cur)++]) != '\n') {
line->push_back(b);
}
while (*cur < total && buf[(*cur)++] != '\n');
}
// 计算与read有交叉的已知位点信息 应该要判断一下是按照read的范围去读取vcf还是按照一个batch read的范围去读取
static void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, sam_hdr_t* samHdr, StableArray<uint8_t>& knownSites, int thid) {
static void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, sam_hdr_t* samHdr, int sitesStride, StableArray<uint8_t>& knownSites, int thid) {
BamWrap* bw = sd.bw;
int tid = bw->contig_id();
int64_t startPos = bw->start_pos(); // 闭区间使用clip之前的read匹配的范围
@ -113,7 +147,6 @@ struct RecalFuncs {
knownSites.resize_fill(sd.read_len, 0);
// update vcfs
// int idx = 0;
PROF_START(TP_read_vcf);
for (auto& vcf : vcfs) {
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam不需要读取
@ -130,7 +163,7 @@ struct RecalFuncs {
vcf.knownSites.clear(); // 清空因为后面会读入覆盖bam的所有vcf位点
// 读取新的interval
int64_t fpos, flen;
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
endPos = std::max(startPos + sitesStride, endPos);
Interval readIntv(startPos, endPos);
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
// fprintf(gf[thid * 2 + idx], "%s %d %ld %ld %ld\n", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid, fpos, flen);
@ -143,10 +176,12 @@ struct RecalFuncs {
}
char* buf = vcf.buf;
vcf.inStm.read(buf, flen);
string line;
int64_t cur = 0;
get_line_from_buf(buf, flen, &cur, &line);
while (line.size() > 0) {
int64_t start = 0;
get_line_from_buf(buf, flen, &cur);
while (cur > start + 1) {
const string line(buf + start, buf + cur - 1);
// spdlog::info("s: {}, e: {}, line: {}", start, cur, line);
stringstream ss_line(line);
string stid;
int tid, pos;
@ -159,7 +194,8 @@ struct RecalFuncs {
if (varIntv.overlaps(readIntv)) {
vcf.knownSites.push_back(varIntv); // 闭区间
}
get_line_from_buf(buf, flen, &cur, &line);
start = cur;
get_line_from_buf(buf, flen, &cur);
}
}
}
@ -195,7 +231,6 @@ struct RecalFuncs {
knownSites[i] = true;
}
}
//idx += 1;
}
PROF_TP_END(TP_calc_skips);
//fprintf(gf[0], "\n");
@ -241,4 +276,53 @@ struct RecalFuncs {
calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs);
}
}
// for apply bqsr
/**
* Quality score recalibration algorithm works as follows:
* - Start with the (approximate, or "estimated") reported quality score. (Approximation occurs when marginalizing/collapsing
* over the reported qualities for each read group).
* - Compute (retrieve?) the empirical quality score using the per-read group datum (i.e. counts). Call it y_1.
* - Use y_1 just computed as the prior for the empirical quality score for the datum for the 2-tuple ( read group, quality score). Call it y_2.
* - Use y_2 as the prior to compute the empirical quality for the 3-tuple ( read-group, quality-score, special covariate ). Call it y_3 for the
* context covariate. Similarly define y_4 for the cycle covariate. Let d_3 = y_3 - y_2, d_4 = y_4 - y_2.
* - (final recalibrated score) = y_2 + d_3 + d_4 = y_3 + y_4 - y_2.
*
* @param priorQualityScore the prior quality score (in log space). It is either the "estimated" or collapsed reported quality score
* for the read group, or the constant prior if given. This value has type double because of the "combine" (or collapse)
* operation that collapses the quality scores represented within the same read group.
* @param readGroupDatum the RecalDatum object for a particular read group at hand. May be null.
* @param qualityScoreDatum the RecalDatum object for a particular (read group, reported quality) tuple at hand. May be null.
* @param specialCovariateDatums the array of RecalDatum objects for the non-required covariates (cycle and context covariates by default).
* @return
*/
static double hierarchicalBayesianQualityEstimate(double priorQualityScore, RecalDatum& readGroupDatum, RecalDatum& qualityScoreDatum,
RecalDatum& contextDatum, RecalDatum& cycleDatum) {
double empiricalQualityForReadGroup =
readGroupDatum.getNumObservations() == 0 ? priorQualityScore : readGroupDatum.getEmpiricalQuality(priorQualityScore);
double posteriorEmpiricalQualityForReportedQuality = qualityScoreDatum.getNumObservations() == 0
? empiricalQualityForReadGroup
: qualityScoreDatum.getEmpiricalQuality(empiricalQualityForReadGroup);
double deltaSpecialCovariates = 0.0;
// At this point we stop being iterative; the special covariates (context and cycle by default) are treated differently.
if (contextDatum.getNumObservations() > 0) {
// TODO: the prior is ignored if the empirical quality for the datum is already cached.
deltaSpecialCovariates +=
contextDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality;
}
if (cycleDatum.getNumObservations() > 0) {
// TODO: the prior is ignored if the empirical quality for the datum is already cached.
deltaSpecialCovariates +=
cycleDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality;
}
return posteriorEmpiricalQualityForReportedQuality + deltaSpecialCovariates;
}
// Round the recalibrated quality with MathUtils::fastRound, then bound it with
// QualityUtils::boundQual using RecalDatum::MAX_RECALIBRATED_Q_SCORE as the cap
// (the original comment states the result lies between 1 and MAX_QUAL).
static uint8_t getBoundedIntegerQual(double recalibratedQualDouble) {
    const auto roundedQual = MathUtils::fastRound(recalibratedQualDouble);
    return QualityUtils::boundQual(roundedQual, RecalDatum::MAX_RECALIBRATED_Q_SCORE);
}
};

View File

@ -11,9 +11,10 @@
#include "covariate.h"
#include "nested_array.h"
#include "recal_datum.h"
#include "qual_utils.h"
struct RecalTables {
int qualDimension = 94; // MAX_SAM_QUAL_SCORE(93) + 1
int qualDimension = QualityUtils::MAX_SAM_QUAL_SCORE + 1; // MAX_SAM_QUAL_SCORE(93) + 1
int eventDimension = EventType::EVENT_SIZE;
int numReadGroups;
@ -41,4 +42,22 @@ struct RecalTables {
contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1);
cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1);
}
// Record the number of read groups and size the read-group table as
// (eventDimension x numReadGroups).
void initReadGroupTable(int _numReadGroups) {
numReadGroups = _numReadGroups;
readGroupTable.init(eventDimension, numReadGroups);
}
// The three initializers below read numReadGroups, so they must be called after
// initReadGroupTable (or after numReadGroups has been set some other way).
// Size the quality-score table as (eventDimension x numReadGroups x qualDimension).
void initQualityScoreTable() {
qualityScoreTable.init(eventDimension, numReadGroups, qualDimension);
}
// Size the context table; the last dimension is ContextCovariate::MaximumKeyValue() + 1.
void initContextTable() {
contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1);
}
// Size the cycle table; the last dimension is CycleCovariate::MaximumKeyValue() + 1.
void initCycleTable() {
cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1);
}
};

View File

@ -11,6 +11,8 @@
#include <spdlog/spdlog.h>
#include <cstdio>
#include <iostream>
#include <fstream>
#include "bqsr_args.h"
#include "nested_array.h"
@ -79,25 +81,6 @@ struct RecalUtils {
}
}
}
// fprintf(gf[4], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
// _Foreach3D(qualityScoreTable, val, {
// if (val.numObservations > 0)
// fprintf(gf[4], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
// });
// fprintf(gf[4], "\n");
// fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
// for (auto& arr1 : contextTable.data) {
// for (size_t si = 0; si < arr1.size(); ++si) {
// for (auto &arr2 : arr1[si]) {
// for (auto& val : arr2) {
// if (val.numObservations > 0)
// fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
// }
// }
// }
// }
// fprintf(gf[3], "\n");
}
// 输出bqsr报告
@ -110,7 +93,7 @@ struct RecalUtils {
exit(1);
}
// 输出version信息
fprintf(fpout, "%s\n", REPORT_HEADER_VERSION);
fprintf(fpout, "%s:%s\n", REPORT_HEADER_VERSION, REPORT_HEADER_MINOR_VERSION);
// 输出参数信息
outputArgsTable(RAC, fpout);
// 输出量化质量分数信息
@ -268,4 +251,220 @@ struct RecalUtils {
});
table.write(fpout);
}
// 解析bqsr table文件
static void readRecalTables(const string& recalTableFn, BQSRArg& RAC, vector<uint8_t>& quantizedScore, RecalTables& recalTables) {
std::ifstream infs(recalTableFn, ifstream::in);
string line;
// 读取文件头
std::getline(infs, line);
spdlog::info("header: {}", line);
if (line.find(REPORT_HEADER_VERSION) != 0) {
spdlog::error("BQSR table version not supported! {} {}", line, __LINE__);
exit(1);
}
// 读取参数表
readArgsTable(infs, RAC);
readQuantTable(infs, quantizedScore);
readReadGroupTable(infs, recalTables);
readQualityScoreTable(infs, recalTables);
readContextCycleTable(infs, recalTables);
infs.close();
}
// Split `str` on runs of whitespace. `dat` is cleared first and then receives the
// tokens in order; an empty or all-whitespace string yields an empty `dat`.
template<typename Arr>
static void split(const string& str, Arr& dat) {
    dat.clear();
    std::istringstream tokens(str);
    for (string word; tokens >> word;) {
        dat.push_back(word);
    }
}
// Split `str` at every occurrence of `delimiter`. `dat` is cleared first. Empty fields
// between delimiters are kept, but a field that would follow a trailing delimiter is
// not produced ("a,b," gives {"a","b"}), and an empty string yields an empty `dat`.
template <typename Arr>
static void split(const string& str, const char delimiter, Arr& dat) {
    dat.clear();
    string field;
    string::size_type begin = 0;
    while (begin < str.size()) {
        const string::size_type end = str.find(delimiter, begin);
        if (end == string::npos) {
            // Last field: runs to the end of the string.
            field.assign(str, begin, string::npos);
            dat.push_back(field);
            break;
        }
        field.assign(str, begin, end - begin);
        dat.push_back(field);
        begin = end + 1;
    }
}
// 读取参数表
static void readArgsTable(std::ifstream& infs, BQSRArg& p) {
string line;
StableArray<string> dat;
std::getline(infs, line);
while (line.empty() && std::getline(infs, line)); // 跳过空行
split(line, ':', dat);
int argRowCnt = std::stoi(dat[3]);
std::getline(infs, line); // 忽略表格说明的一行
std::getline(infs, line); // 忽略表头column说明
for (int i = 0; i < argRowCnt; ++i) {
std::getline(infs, line);
split(line, dat);
if ("covariate" == dat[0]) {
} else if ("no_standard_covs" == dat[0]) {
p.DO_NOT_USE_STANDARD_COVARIATES = ReportUtil::ToBool(dat[1]);
} else if ("run_without_dbsnp" == dat[0]) {
p.RUN_WITHOUT_DBSNP = ReportUtil::ToBool(dat[1]);
} else if ("solid_recal_mode" == dat[0]) {
p.SOLID_RECAL_MODE = ReportUtil::ParseString(dat[1]);
} else if ("solid_nocall_strategy" == dat[0]) {
p.SOLID_NOCALL_STRATEGY = ReportUtil::ParseString(dat[1]);
} else if ("mismatches_context_size" == dat[0]) {
p.MISMATCHES_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]);
} else if ("indels_context_size" == dat[0]) {
p.INDELS_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]);
} else if ("mismatches_default_quality" == dat[0]) {
p.MISMATCHES_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
// spdlog::info("int8_t : {}", (int)p.MISMATCHES_DEFAULT_QUALITY);
} else if ("deletions_default_quality" == dat[0]) {
p.DELETIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
} else if ("insertions_default_quality" == dat[0]) {
p.INSERTIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
} else if ("maximum_cycle_value" == dat[0]) {
p.MAXIMUM_CYCLE_VALUE = ReportUtil::ToInt(dat[1]);
} else if ("low_quality_tail" == dat[0]) {
p.LOW_QUAL_TAIL = ReportUtil::ToInt(dat[1]);
} else if ("default_platform" == dat[0]) {
p.DEFAULT_PLATFORM = ReportUtil::ParseString(dat[1]);
} else if ("force_platform" == dat[0]) {
p.FORCE_PLATFORM = ReportUtil::ParseString(dat[1]);
} else if ("quantizing_levels" == dat[0]) {
p.QUANTIZING_LEVELS = ReportUtil::ToInt(dat[1]);
} else if ("recalibration_report" == dat[0]) {
p.existingRecalibrationReport = ReportUtil::ParseString(dat[1]);
} else if ("binary_tag_name" == dat[0]) {
p.BINARY_TAG_NAME = ReportUtil::ParseString(dat[1]);
// spdlog::info("BINARY_TAG_NAME : {}", p.BINARY_TAG_NAME);
} else {
spdlog::error("unknown argument : {} {}", dat[0], dat[1]);
}
}
}
// 读取量化质量分数信息表
static void readQuantTable(std::ifstream& infs, vector<uint8_t>& quantizedScore) {
string line;
StableArray<string> dat;
std::getline(infs, line);
while (line.empty() && std::getline(infs, line)); // 跳过空行
split(line, ':', dat);
int argRowCnt = std::stoi(dat[3]);
std::getline(infs, line); // 忽略表格说明的一行
std::getline(infs, line); // 忽略表头column说明
quantizedScore.resize(argRowCnt);
for (int i = 0; i < argRowCnt; ++i) {
std::getline(infs, line);
// spdlog::info("{}", line);
split(line, dat);
quantizedScore[ReportUtil::ToInt(dat[0])] = ReportUtil::ToInt(dat[2]);
// spdlog::info("{}, {}", ReportUtil::ToInt(dat[0]), quantizedScore[ReportUtil::ToInt(dat[0])]);
}
}
// 读取read group表
static void readReadGroupTable(std::ifstream& infs, RecalTables& recalTables) {
string line;
StableArray<string> dat;
std::getline(infs, line);
while (line.empty() && std::getline(infs, line)); // 跳过空行
split(line, ':', dat);
int argRowCnt = std::stoi(dat[3]);
std::getline(infs, line); // 忽略表格说明的一行
std::getline(infs, line); // 忽略表头column说明
vector<vector<string>> dats(argRowCnt);
int rgId = 0;
for (int i = 0; i < argRowCnt; ++i) {
std::getline(infs, line);
split(line, dats[i]);
if (ReadGroupCovariate::RgToId.find(dats[i][0]) == ReadGroupCovariate::RgToId.end()) {
ReadGroupCovariate::RgToId[dats[i][0]] = rgId;
ReadGroupCovariate::IdToRg[rgId] = dats[i][0];
++rgId;
}
}
recalTables.initReadGroupTable(rgId);
// spdlog::info("read group num: {}, {}", rgId, ReadGroupCovariate::IdToRg[0]);
for (int i = 0; i < dats.size(); ++i) {
int k2 = ReadGroupCovariate::RgToId[dats[i][0]];
int k1 = EventType::GetIndexForEventRep(dats[i][1][0]);
auto& datum = recalTables.readGroupTable(k1, k2);
datum.setEmpiricalQuality(ReportUtil::ToDouble(dats[i][2]));
datum.setReportedQuality(ReportUtil::ToDouble(dats[i][3]));
datum.setNumObservations(ReportUtil::ToUint64(dats[i][4]));
datum.setNumMismatches(ReportUtil::ToDouble(dats[i][5]));
}
}
// 读取质量分数表
static void readQualityScoreTable(std::ifstream& infs, RecalTables& recalTables) {
string line;
StableArray<string> dat;
std::getline(infs, line);
while (line.empty() && std::getline(infs, line)); // 跳过空行
split(line, ':', dat);
int argRowCnt = std::stoi(dat[3]);
std::getline(infs, line); // 忽略表格说明的一行
std::getline(infs, line); // 忽略表头column说明
recalTables.initQualityScoreTable();
int rgId = 0;
for (int i = 0; i < argRowCnt; ++i) {
std::getline(infs, line);
split(line, dat);
int k2 = ReadGroupCovariate::RgToId[dat[0]];
int k3 = ReportUtil::ToInt(dat[1]);
int k1 = EventType::GetIndexForEventRep(dat[2][0]);
auto& datum = recalTables.qualityScoreTable(k1, k2, k3);
datum.setEmpiricalQuality(ReportUtil::ToDouble(dat[3]));
datum.setReportedQuality((double)k3);
datum.setNumObservations(ReportUtil::ToUint64(dat[4]));
datum.setNumMismatches(ReportUtil::ToDouble(dat[5]));
}
}
// 读取context和cycle表
static void readContextCycleTable(std::ifstream& infs, RecalTables& recalTables) {
string line;
StableArray<string> dat;
std::getline(infs, line);
while (line.empty() && std::getline(infs, line)); // 跳过空行
split(line, ':', dat);
int argRowCnt = std::stoi(dat[3]);
std::getline(infs, line); // 忽略表格说明的一行
std::getline(infs, line); // 忽略表头column说明
recalTables.initContextTable();
recalTables.initCycleTable();
for (int i = 0; i < argRowCnt; ++i) {
std::getline(infs, line);
split(line, dat);
int k2 = ReadGroupCovariate::RgToId[dat[0]];
int k3 = ReportUtil::ToInt(dat[1]);
int k4 = 0;
int k1 = EventType::GetIndexForEventRep(dat[4][0]);
RecalDatum* dp;
if (dat[3] == "Cycle") {
k4 = CycleCovariate::KeyFromCycle(ReportUtil::ToInt(dat[2]));
dp = &recalTables.cycleTable(k1, k2, k3, k4);
} else if (dat[3] == "Context") {
k4 = ContextCovariate::KeyFromContext(dat[2]);
dp = &recalTables.contextTable(k1, k2, k3, k4);
} else {
spdlog::error("Unknown CovariateName {}", dat[3]);
}
dp->setEmpiricalQuality(ReportUtil::ToDouble(dat[5]));
dp->setReportedQuality((double)k3);
dp->setNumObservations(ReportUtil::ToUint64(dat[6]));
dp->setNumMismatches(ReportUtil::ToDouble(dat[7]));
}
}
};

View File

@ -12,10 +12,7 @@
#include "fastbqsr_version.h"
#include "bqsr/bqsr_args.h"
#include "util/profiling.h"
namespace nsgv {
extern BQSRArg gBqsrArg;
};
#include "bqsr/common_data.h"
int BaseRecalibrator();
int ApplyBQSR();
@ -30,28 +27,18 @@ string get_current_time_str() {
return string(now);
}
int main_BaseRecalibrator(int argc, char *argv[]) {
// init arg parser
argparse::ArgumentParser program(nsgv::gBqsrArg.PROGRAM_RECORD_ID, FASTBQSR_VERSION, argparse::default_arguments::none);
program.add_description(
"First pass of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n"
"user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)");
// 添加bqsr和apply bqsr的共同使用的参数
static void addCommonArgs(argparse::ArgumentParser &program, const char *outputDesc) {
program.add_argument("--input")
.help("BAM/SAM/CRAM file containing reads This argument must be specified at least once.")
.metavar("<INPUT>")
.required();
program.add_argument("--output")
.help("The output recalibration table file to create.")
.help(outputDesc)
.metavar("<OUTPUT>")
.required();
program.add_argument("--reference")
.help("Reference sequence file.")
.metavar("<Reference>")
.required();
program.add_argument("--num-threads")
.help("Number of threads to use.")
.scan<'i', int>()
@ -59,6 +46,66 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
.nargs(1)
.metavar("<NUM_THREADS>");
program.add_argument("--reference")
.help("Reference sequence file.")
.metavar("<Reference>")
.required();
}
// Register the -h/--help and -v/--version options on `program`.
// Both are zero-argument flags whose action prints and then terminates the process
// with exit code 0. The lambdas capture `program` by reference.
static void addHelpVersion(argparse::ArgumentParser& program) {
// -h / --help: print the parser's help text to stdout and exit.
program.add_argument("-h", "--help")
.action([&](const auto& /*unused*/) {
std::cout << program.help().str();
std::exit(0);
})
.default_value(false)
.help("shows help message and exits")
.implicit_value(true)
.nargs(0);
// -v / --version: print FASTBQSR_VERSION to stdout and exit.
program.add_argument("-v", "--version")
.action([&](const auto& /*unused*/) {
std::cout << FASTBQSR_VERSION << std::endl;
std::exit(0);
})
.default_value(false)
.help("prints version information and exits")
.implicit_value(true)
.nargs(0);
}
// Record the start time and the full command line in the global BQSR arguments:
// START_TIME gets the current time string, CLI_STR gets argv[0] followed by every
// remaining argument, each preceded by a single space.
static void addTimeCLI(int argc, char* argv[]) {
    auto& args = nsgv::gBqsrArg;
    args.START_TIME = get_current_time_str();
    args.CLI_STR = argv[0];
    for (int idx = 1; idx < argc; ++idx) {
        args.CLI_STR.append(" ").append(argv[idx]);
    }
}
// Copy the options shared by both sub-commands (--input, --output, --num-threads,
// --reference) from the parsed command line into the global BQSR arguments.
// Logs an error and exits with status 1 when the thread count is below 1.
static void parseCommonArgs(argparse::ArgumentParser& program) {
    auto& args = nsgv::gBqsrArg;
    args.INPUT_FILE = program.get("--input");
    args.OUTPUT_FILE = program.get("--output");

    args.NUM_THREADS = program.get<int>("--num-threads");
    if (args.NUM_THREADS < 1) {
        spdlog::error("num-threads must be positive.");
        exit(1);
    }

    args.REFERENCE_FILE = program.get<string>("--reference");
}
int main_BaseRecalibrator(int argc, char *argv[]) {
// init arg parser
argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none);
program.add_description(
"First phase of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n"
"user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)");
addCommonArgs(program, "The output recalibration table file to create.");
program.add_argument("--known-sites")
.help(
"One or more databases of known polymorphic sites used to exclude regions around known polymorphisms from "
@ -66,6 +113,59 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
.metavar("<KnownSites>")
.nargs(argparse::nargs_pattern::any);
program.add_argument("--enable-baq")
.help("Whether to do BAQ correction.")
.default_value(false)
.implicit_value(true)
.hidden();
program.add_argument("--compute-indel-bqsr-tables")
.help("Whether to compute indel BQSR tables.")
.default_value(false)
.implicit_value(true)
.hidden();
// add help and version args
addHelpVersion(program);
try {
program.parse_args(argc, argv);
parseCommonArgs(program);
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
nsgv::gBqsrArg.computeIndelBQSRTables = program.get<bool>("--compute-indel-bqsr-tables");
// spdlog::info("known sites vcf files:");
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
// spdlog::info(" {}", ks);
// }
} catch (const std::exception &err) {
spdlog::error(err.what());
return 1;
}
spdlog::info("fast base recalibration phase-1 start");
BaseRecalibrator();
spdlog::info("fast base recalibration phase-1 end");
DisplayProfilingBQSR(nsgv::gBqsrArg.NUM_THREADS);
return 0;
}
int main_ApplyBQSR(int argc, char* argv[]) {
// init arg parser
argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none);
program.add_description(
"Second phase of the Base Quality Score Recalibration (BQSR) -- Apply the bqsr table to generate recalibrated bam file.)");
addCommonArgs(program, "The output bam/sam file to create.");
program.add_argument("--bqsr")
.help("Input recalibration table for BQSR.")
.metavar("<BQSR>")
.required();
program.add_argument("--create-index")
.help("Whether to create an index when writing coordinate sorted BAM output.")
.default_value(false)
@ -78,83 +178,37 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
.nargs(1)
.metavar("<IndexFormat>");
program.add_argument("--enable-baq")
.help("Whether to do BAQ correction.")
.default_value(false)
.implicit_value(true)
.hidden();
// add help and version args
program.add_argument("-h", "--help")
.action([&](const auto & /*unused*/) {
std::cout << program.help().str();
std::exit(0);
})
.default_value(false)
.help("shows help message and exits")
.implicit_value(true)
.nargs(0);
program.add_argument("-v", "--version")
.action([&](const auto & /*unused*/) {
std::cout << FASTBQSR_VERSION << std::endl;
std::exit(0);
})
.default_value(false)
.help("prints version information and exits")
.implicit_value(true)
.nargs(0);
// std::cout << program << std::endl;
nsgv::gBqsrArg.START_TIME = get_current_time_str();
nsgv::gBqsrArg.CLI_STR = argv[0];
for (int i = 1; i < argc; ++i) {
nsgv::gBqsrArg.CLI_STR += " " + std::string(argv[i]);
}
addHelpVersion(program);
try {
program.parse_args(argc, argv);
nsgv::gBqsrArg.INPUT_FILE = program.get("--input");
nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
if (nsgv::gBqsrArg.NUM_THREADS < 1) {
spdlog::error("num-threads must be positive.");
exit(1);
}
parseCommonArgs(program);
nsgv::gBqsrArg.BQSR_FILE = program.get("--bqsr");
nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
// spdlog::info("known sites vcf files:");
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
// spdlog::info(" {}", ks);
// }
string idxFormat = program.get("--index-format");
std::transform(idxFormat.begin(), idxFormat.end(), idxFormat.begin(), ::toupper);
nsgv::gBqsrArg.INDEX_FORMAT = idxFormat == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI;
nsgv::gBqsrArg.INDEX_FORMAT =
program.get("--index-format") == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI;
} catch (const std::exception &err) {
} catch (const std::exception& err) {
spdlog::error(err.what());
return 1;
}
spdlog::info("fast base recalibration phase-1 start");
BaseRecalibrator();
spdlog::info("fast base recalibration phase-1 end");
spdlog::info("fast base recalibration phase-2 start");
ApplyBQSR();
spdlog::info("fast base recalibration phase-2 end");
DisplayProfiling(nsgv::gBqsrArg.NUM_THREADS);
// DisplayProfilingApplyBQSR(nsgv::gBqsrArg.NUM_THREADS);
DisplayProfilingApplyBQSR(1);
return 0;
}
int main_ApplyBQSR(int argc, char* argv[]) { return 0; }
int main(int argc, char *argv[]) {
// init log
spdlog::set_default_logger(spdlog::stderr_color_st("fastbqsr"));
spdlog::cfg::load_env_levels();
addTimeCLI(argc, argv);
string bqsr_prog = argv[1];
if (bqsr_prog == "BaseRecalibrator") {
return main_BaseRecalibrator(argc - 1, argv + 1);

View File

@ -45,8 +45,7 @@ static int CalcThreadTime(uint64_t *a, int len, double *max, double *min, double
fprintf(stderr, "time %-15s: avg %0.2lfs min %0.2lfs max %0.2lfs\n", #tpname, avgTime, minTime, maxTime); \
}
int DisplayProfiling(int nthread) {
int DisplayProfilingBQSR(int nthread) {
#ifdef SHOW_PERF
fprintf(stderr, "\n");
PRINT_GP(GP_read);
@ -80,6 +79,27 @@ int DisplayProfiling(int nthread) {
}
PRINT_GP(GP_whole_process);
fprintf(stderr, "\n");
#endif
return 0;
}
int DisplayProfilingApplyBQSR(int nthread) {
#ifdef SHOW_PERF
fprintf(stderr, "\n");
PRINT_GP(GP_read);
if (nthread == 1) {
PRINT_GP(GP_covariate);
} else {
PRINT_TP(TP_covariate);
PRINT_TP(TP_readgroup);
PRINT_TP(TP_qualityscore);
PRINT_TP(TP_context);
PRINT_TP(TP_cycle);
}
PRINT_GP(GP_whole_process);
fprintf(stderr, "\n");
#endif

View File

@ -79,7 +79,8 @@ enum {
uint64_t RealtimeMsec(void);
int DisplayProfiling(int);
int DisplayProfilingBQSR(int);
int DisplayProfilingApplyBQSR(int);
#ifdef __cplusplus
}

View File

@ -22,24 +22,46 @@ using std::string;
using std::stringstream;
using std::vector;
#define REPORT_HEADER_VERSION "#:GATKReport.v1.1:5"
#define REPORT_HEADER_VERSION "#:GATKReport.v1.1"
#define REPORT_HEADER_MINOR_VERSION "5"
struct ReportUtil {
static string ToString(const bool val) { return val ? "true" : "false"; }
static string ToString(const char val) {
static inline string ToString(const bool val) { return val ? "true" : "false"; }
static inline string ToString(const char val) {
string s = "";
s += val;
// spdlog::info("char: {}, str: {}", val, s);
return s;
}
static string ToString(const string& val) { return val == "" ? "null" : val; }
static string ToString(const double val, int precise) {
static inline string ToString(const string& val) { return val == "" ? "null" : val; }
static inline string ToString(const double val, int precise) {
stringstream ss;
ss << std::fixed << std::setprecision(precise) << val;
return ss.str();
}
template<typename T>
static string ToString(const T val) { return std::to_string(val); }
static inline string ToString(const T val) { return std::to_string(val); }
// Convert a report value to bool: the empty string and the spellings "false",
// "False" and "FALSE" are false; every other string is true.
static inline bool ToBool(const string &val) {
    const bool spelledFalse = val.empty() || val == "false" || val == "False" || val == "FALSE";
    return !spelledFalse;
}
// Convert a report value to a string: the literal "null" becomes the empty string,
// anything else is returned unchanged.
static inline string ParseString(const string &val) {
    return val == "null" ? string() : val;
}
// Numeric conversions for report values. Each one forwards to the matching
// std::sto* function, so std::invalid_argument / std::out_of_range propagate to the
// caller when the text cannot be converted.
static inline int ToInt(const string& val) { return std::stoi(val); }
static inline double ToDouble(const string& val) { return std::stod(val); }
static inline int64_t ToInt64(const string& val) { return std::stoll(val); }
static inline uint64_t ToUint64(const string& val) { return std::stoull(val); }
};
struct ReportTable {

View File

@ -54,7 +54,8 @@ struct SamData {
StableArray<uint8_t> base_quals; // 对应的质量分数
StableArray<uint8_t> ins_quals; // insert质量分数, BI (大部分应该都没有)
StableArray<uint8_t> del_quals; // delete质量分数, BD (大部分应该都没有)
StableArray<Cigar> cigars;
StableArray<Cigar> cigars; // clip 之后的cigar
StableArray<uint8_t> recaled_quals; // 保存校正之后的质量分数
// 用作临时buffer
StableArray<char> strandedClippedBases; // for context covariate
@ -87,6 +88,20 @@ struct SamData {
end_pos = 0;
}
// Extract what ApplyBQSR needs from a record: remember the wrapper, take the read
// length from l_qseq, and fill `bases` (decoded through BaseUtils::cBaseToChar) and
// `base_quals` (copied as-is from the record's quality array).
void parseForApplyBQSR(BamWrap *_bw) {
    bw = _bw;
    auto* rec = bw->b;
    read_len = rec->core.l_qseq;
    bases.resize(read_len);
    base_quals.resize(read_len);

    const uint8_t* packedSeq = bam_get_seq(rec);
    const uint8_t* recQuals = bam_get_qual(rec);
    for (int pos = 0; pos < read_len; ++pos) {
        bases[pos] = BaseUtils::cBaseToChar[bam_seqi(packedSeq, pos)];
        base_quals[pos] = recQuals[pos];
    }
}
// 初步解析bam
void parseBasic(BamWrap *_bw) {
bw = _bw;