实现了apply bqsr的单线程版本，结果还有点错误，继续调试

2026-01-03 22:10:46 +08:00 · 2026-01-03 22:10:46 +08:00 · 94e06338cd
parent 985875ebac
commit 94e06338cd
19 changed files with 1147 additions and 209 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -6,5 +6,5 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 # set(CMAKE_BUILD_TYPE Debug)
 # set(CMAKE_BUILD_TYPE Release)
 add_definitions(-DSHOW_PERF)
-#add_definitions(-DSHOW_PERF=1)
+# add_definitions(-DSHOW_PERF=1)
 add_subdirectory(src)
--- a/README.md
+++ b/README.md
@ -2,4 +2,6 @@

 对GATK bqsr部分进行并行优化

+对应的GATK版本：The Genome Analysis Toolkit (GATK) v4.6.2.0-17-g2a1f41b-SNAPSHOT
+
 Implement the same functions as GATK for BaseRecalibrator and ApplyBQSR
--- a/src/bqsr/apply_bqsr_entry.cpp
+++ b/src/bqsr/apply_bqsr_entry.cpp
@ -0,0 +1,336 @@
+// 一. 读取并解析recalibrate tables
+
+// 二. 循环处理每一个read
+
+// 1. 计算read的协变量covs
+
+// 2. 根据read的协变量信息从recal tables获取对应的数据
+
+// 3. 根据recal tables数据对read的每个碱基分别计算新的质量分数
+
+// 4. 将计算后的质量分数赋值给read
+
+#include <header.h>  // in htslib
+#include <htslib/sam.h>
+#include <htslib/thread_pool.h>
+
+#include "aux_arg.h"
+#include "common_data.h"
+#include "util/debug.h"
+#include "util/profiling.h"
+#include "recal_utils.h"
+#include "recal_funcs.h"
+#include "read_utils.h"
+
+using std::vector;
+
+namespace nsgv {
+// 全局变量 for apply bqsr
+samFile* gOutBamFp;        // 输出文件, sam或者bam格式
+sam_hdr_t* gOutBamHeader;  // 输出文件的header
+RecalTables gRecalTables;  // 保留一个全局的recalibrate tables就行了
+vector<uint8_t> gQuantizedQuals; // 读取quantized info table信息得到的，第三列数据
+StableArray<bam1_t*> gRecalBams[2];  // 保存已经校正过质量分数的bam数据，双buffer
+};  // namespace nsgv
+
+// 串行apply bqsr
+int SerialApplyBQSR(AuxVar &aux) {
+    BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
+    inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::applyBqsrReadFilterOut);
+    int64_t readNumSum = 0;
+    int round = 0;
+
+    PerReadCovariateMatrix& readCovariates = aux.readCovariates;
+    // 一. 读取并解析recalibrate tables
+    auto& recalTables = *aux.bqsrTable;
+
+    // 全局的校正后的bam数组
+    auto& recalBams = nsgv::gRecalBams[0];
+
+    auto& sd = aux.sd;
+
+    while (true) {
+        ++round;
+        // 一. 读取bam数据
+        size_t readNum = 0;
+        PROF_START(GP_read);
+        if (inBamBuf.ReadStat() >= 0)
+            readNum = inBamBuf.ReadBam();
+        PROF_GP_END(GP_read);
+        if (readNum < 1) {
+            break;
+        }
+        auto bams = inBamBuf.GetBamArr();
+        spdlog::info("{} reads processed in {} round", readNum, round);
+
+        if (recalBams.size() < bams.size()) {
+            int start = recalBams.size();
+            recalBams.resize(bams.size());
+            for (int i = start; i < recalBams.size(); ++i) {
+                recalBams[i] = bam_init1();
+            }
+        }
+
+        // 二. 遍历每个bam（read）记录，进行处理
+        for (int i = 0; i < bams.size(); ++i) {
+            if (bam_copy1(recalBams[i], bams[i]->b) == NULL) {
+                spdlog::error("Copy bam error");
+                exit(1);
+            }
+            BamWrap* bw = bams[i];
+            bw->b = recalBams[i]; // 注意这里的赋值，然后就可以对b进行更改了
+            sd.init();
+            sd.parseForApplyBQSR(bw);
+            sd.rid = i + aux.processedReads;
+
+            // 1. 是否使用original quality来代替当前的quality
+            if (nsgv::gBqsrArg.useOriginalBaseQualities)
+                ReadUtils::resetOriginalBaseQualities(bw->b);
+
+            // 2. 是否将当前的quality保存在tag OQ中
+            if (nsgv::gBqsrArg.emitOriginalQuals)
+                ReadUtils::setOriginalBaseQualsIfNoOQ(bw->b);
+
+            // 3. 计算read的协变量covs
+            PROF_START(GP_covariate);
+            CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0);
+            PROF_GP_END(GP_covariate);
+
+            // clear indel qualities
+            ReadUtils::removeAttribute(bw->b, "BI");
+            ReadUtils::removeAttribute(bw->b, "BD");
+
+            // 4. 检查read的readGroup tag是否包含在bqsr table里
+            const char* readGroupId = ReadUtils::getReadGroupId(bw->b);
+            auto& covaritesForRead = readCovariates[EventType::BASE_SUBSTITUTION.index];
+            uint8_t* recalibratedQuals = bam_get_qual(bw->b);
+            auto& preUpdateQuals = sd.base_quals;
+            int rgKey = -1;
+            if (ReadGroupCovariate::RgToId.find(std::string(readGroupId)) != ReadGroupCovariate::RgToId.end())
+                rgKey = ReadGroupCovariate::RgToId[std::string(readGroupId)];
+            if (rgKey == -1) {
+                if (nsgv::gBqsrArg.allowMissingReadGroups) {
+                    // Given the way the recalibration code is implemented below, we cannot recalibrate a read with a
+                    // read group that's not in the recal table.
+                    for (int i = 0; i < sd.read_len; i++) {
+                        //recalibratedQuals[i] = staticQuantizedMapping != null ? staticQuantizedMapping[preUpdateQuals[i]] : quantizedQuals.get(preUpdateQuals[i]);
+                        recalibratedQuals[i] = nsgv::gQuantizedQuals[preUpdateQuals[i]];
+                    }
+                } else {
+                    spdlog::error(
+                        "Read group {} not found in the recalibration table. Use \"--allow-missing-read-group\" command line argument to ignore this "
+                        "error.",
+                        readGroupId);
+                    exit(1);
+                }
+            }
+
+            // 5. 根据recal tables数据对read的每个碱基分别计算新的质量分数
+            auto& readGroupDatum = recalTables.readGroupTable(EventType::BASE_SUBSTITUTION.index, rgKey);
+            // Note: this loop is under very heavy use in applyBQSR. Keep it slim.
+            for (int offset = 0; offset < sd.read_len; offset++) {  // recalibrate all bases in the read
+                // only recalibrate usable qualities (default: >= 6) (the original quality will come from the instrument -- reported quality)
+                if (recalibratedQuals[offset] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN) {
+                    continue;
+                }
+                auto& covs = readCovariates[EventType::BASE_SUBSTITUTION.index][offset];
+                // 根据read的协变量数据获取对应的bqsr数据
+                auto& qualityScoreDatum = recalTables.qualityScoreTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality);
+                auto& contextDatum = recalTables.contextTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.context);
+                auto& cycleDatum = recalTables.cycleTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.cycle);
+                // 计算校正后的质量分数
+                double priorQualityScore =
+                    nsgv::gBqsrArg.globalQScorePrior > 0.0 ? nsgv::gBqsrArg.globalQScorePrior : readGroupDatum.getReportedQuality();
+                double rawRecalibratedQualityScore =
+                    RecalFuncs::hierarchicalBayesianQualityEstimate(priorQualityScore, readGroupDatum, qualityScoreDatum, contextDatum, cycleDatum);
+                uint8_t qualIdx = RecalFuncs::getBoundedIntegerQual(rawRecalibratedQualityScore);
+                uint8_t quantizedQualityScore = qualIdx;  // nsgv::gQuantizedQuals.at(qualIdx);
+                // TODO: as written the code quantizes *twice* if the static binning is enabled (first time to the dynamic bin). It should be
+                // quantized once.
+                // recalibratedQuals[offset] = staticQuantizedMapping == null ? quantizedQualityScore : staticQuantizedMapping[quantizedQualityScore];
+                recalibratedQuals[offset] = quantizedQualityScore;
+            }
+
+            if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) {
+                spdlog::error("failed writing sam record to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE.c_str());
+                sam_close(nsgv::gInBamFp);
+                sam_close(nsgv::gOutBamFp);
+                exit(1);
+            }
+            //break;
+        }
+        // break;
+
+        readNumSum += readNum;
+        AuxVar::processedReads += readNum;
+        inBamBuf.ClearAll();  // 清空bam buf
+    }
+
+    spdlog::info("read count: {}", readNumSum);
+
+    return 0;
+}
+
+// 并行 apply bqsr
+int ParallelApplyBQSR(vector<AuxVar> &auxArr) {
+
+    return 0;
+}
+
+// 在进行数据处理之前，初始化一些全局数据
+static void globalInit() {
+    open_debug_files();
+    /* bam */
+    nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr);
+    if (!nsgv::gInBamFp) {
+        spdlog::error("[{}] load sam/bam file failed.\n", __func__);
+        exit(1);
+    }
+    hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
+    nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp);  // header
+
+    /* 并行读取bam数据  */
+    htsThreadPool htsPoolRead = {NULL, 0};  //
+    int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD);
+    htsPoolRead.pool = hts_tpool_init(readThreadNum);
+    if (!htsPoolRead.pool) {
+        spdlog::error("[{}] failed to set up thread pool", __LINE__);
+        sam_close(nsgv::gInBamFp);
+        exit(1);
+    }
+    hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
+
+    if (!nsgv::gInBamHeader->hrecs) {
+        if (sam_hdr_fill_hrecs(nsgv::gInBamHeader) != 0) {
+            spdlog::error("[{}] failed to read sam header", __LINE__);
+            sam_close(nsgv::gInBamFp);
+            exit(1);
+        }
+    }
+    /* 并行写入bam文件 */
+    char modeout[12] = "wb";
+    sam_open_mode(modeout + 1, nsgv::gBqsrArg.OUTPUT_FILE.c_str(), NULL);
+    nsgv::gOutBamFp = sam_open(nsgv::gBqsrArg.OUTPUT_FILE.c_str(), modeout);
+    if (!nsgv::gOutBamFp) {
+        spdlog::error("[{}] create output sam/bam file failed.\n", __func__);
+        exit(1);
+    }
+    nsgv::gOutBamHeader = sam_hdr_dup(nsgv::gInBamHeader);
+    // 修改输出文件的header
+    sam_hdr_add_line(nsgv::gOutBamHeader, "PG", "ID", PROGRAM_NAME, "VN", FASTBQSR_VERSION, "CL", nsgv::gBqsrArg.CLI_STR.c_str(), NULL);
+    // 用不同的线程数量处理输出文件
+    htsThreadPool htsPoolWrite = {NULL, 0};  // 读写用不同的线程池
+    int writeThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_WRITE_MAX_THREAD);
+    htsPoolWrite.pool = hts_tpool_init(writeThreadNum);
+    if (!htsPoolWrite.pool) {
+        spdlog::error("[{}] failed to set up thread pool", __LINE__);
+        sam_close(nsgv::gInBamFp);
+        exit(1);
+    }
+    hts_set_opt(nsgv::gOutBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
+    hts_set_opt(nsgv::gOutBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
+
+    if (sam_hdr_write(nsgv::gOutBamFp, nsgv::gOutBamHeader) != 0) {
+        spdlog::error("failed writing header to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE);
+        sam_close(nsgv::gOutBamFp);
+        sam_close(nsgv::gInBamFp);
+        exit(1);
+    }
+
+    // 输出index文件
+    nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".bai";  // min_shift = 0 是bai格式
+    if ("sam" == Utils::getFileExtension(nsgv::gBqsrArg.OUTPUT_FILE) || !nsgv::gBqsrArg.CREATE_INDEX) {
+        nsgv::gBqsrArg.INDEX_FILE = "";
+    }
+    if (!nsgv::gBqsrArg.INDEX_FILE.empty()) {
+        int index_min_shift = 0;
+        if (nsgv::gBqsrArg.INDEX_FORMAT == nsbqsr::IndexFormat::CSI) {
+            nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".csi";
+            index_min_shift = 14;
+        }
+        if (sam_idx_init(nsgv::gOutBamFp, nsgv::gOutBamHeader, 0 /*csi 14*/, nsgv::gBqsrArg.INDEX_FILE.c_str()) < 0) {
+            spdlog::error("failed to open index \"{}\" for writing", nsgv::gBqsrArg.INDEX_FILE);
+            sam_close(nsgv::gOutBamFp);
+            sam_close(nsgv::gInBamFp);
+            exit(1);
+        }
+    }
+
+    // 1. 协变量数据相关初始化
+    ContextCovariate::InitContextCovariate(nsgv::gBqsrArg);
+    CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg);
+
+    // 注意初始化顺序，这个必须在协变量初始化之后，因为需要用到MaximumKeyValue
+    // nsgv::gRecalTables.init(nsgv::gInBamHeader->hrecs->nrg);
+
+    // 初始化AuxVar
+    nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS);
+    for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) {
+        nsgv::gAuxVars[i].header = nsgv::gInBamHeader;
+        nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str());
+        if (nsgv::gAuxVars[i].faidx == 0)
+            error("[%s] fail to load the fasta index.\n", __func__);
+        // 注意初始化顺序，这个必须在协变量初始化之后，因为需要用到MaximumKeyValue
+        nsgv::gAuxVars[i].bqsrTable = &nsgv::gRecalTables;
+        CovariateUtils::InitPerReadCovMat(nsgv::gAuxVars[i].readCovariates);
+    }
+
+    // 0. 初始化一些全局数据
+    RecalDatum::StaticInit();
+    QualityUtils::StaticInit();
+    MathUtils::StaticInit();
+    BaseUtils::StaticInit();
+
+    // 初始化需要计算的event types
+    RecalUtils::initEventTypes(nsgv::gBqsrArg.computeIndelBQSRTables);
+
+    // 2. 读取bam的read group
+    // if (nsgv::gInBamHeader->hrecs->nrg == 0) {
+    //     spdlog::error("No RG tag found in the header!");
+    //     exit(1);
+    // }
+
+    /*
+    // 应该是从bqsr table里读取read group信息
+    for (int i = 0; i < nsgv::gInBamHeader->hrecs->nrg; ++i) {
+        // spdlog::info("rg: {}", nsgv::gInBamHeader->hrecs->rg[i].name);
+        ReadGroupCovariate::RgToId[nsgv::gInBamHeader->hrecs->rg[i].name] = i;
+        ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
+    }
+    */
+
+    // 读取并解析recalibrate tables
+    RecalUtils::readRecalTables(nsgv::gBqsrArg.BQSR_FILE, nsgv::gBqsrArg, nsgv::gQuantizedQuals, nsgv::gRecalTables);
+}
+
+// 全局资源释放
+static void globalDestroy() { 
+    close_debug_files();
+
+    if (!nsgv::gBqsrArg.INDEX_FILE.empty() && sam_idx_save(nsgv::gOutBamFp) < 0) {
+        spdlog::error("writing index failed");
+        sam_close(nsgv::gOutBamFp);
+        sam_close(nsgv::gInBamFp);
+        exit(1);
+    }
+
+    /* 关闭文件，收尾清理 */
+    sam_close(nsgv::gOutBamFp);
+    sam_close(nsgv::gInBamFp);
+}
+
+// entrance of BQSR phase-1
+int ApplyBQSR() {
+    int ret = 0;
+
+    PROF_START(GP_whole_process);
+    globalInit();
+//    if (nsgv::gBqsrArg.NUM_THREADS == 1)
+        ret = SerialApplyBQSR(nsgv::gAuxVars[0]);  // 串行处理数据，生成recal bams
+//    else
+//        ret = ParallelApplyBQSR(nsgv::gAuxVars);  // 并行处理数据，生成recal bams
+    globalDestroy();
+    PROF_GP_END(GP_whole_process);
+
+    return ret;
+}
--- a/src/bqsr/aux_arg.h
+++ b/src/bqsr/aux_arg.h
@ -43,11 +43,14 @@ struct AuxVar {
    vector<VCFParser> vcfArr;  // 从vcf中获取已知位点

    PerReadCovariateMatrix readCovariates;  // 每个read对应的协变量矩阵
-    RecalTables recalTables;  // 每个线程对应一个recal tables
+    RecalTables recalTables;  // 每个线程对应一个recal tables，计算table的时候需要每个线程都有一个，输出bam的时候不需要，因为只读取，不写入

    SamData sd;
    StableArray<int> isSNP, isIns, isDel;  // 该位置是否是SNP, indel位置，0不是，1是
    StableArray<uint8_t> baqArray;
    StableArray<double> snpErrors, insErrors, delErrors;
    StableArray<uint8_t> skips;  // 该位置是否是已知位点
+
+    // only for apply bqsr
+    RecalTables* bqsrTable; // 保留一份就够
 };
--- a/src/bqsr/bqsr_args.h
+++ b/src/bqsr/bqsr_args.h
@ -11,6 +11,8 @@ Date : 2025/10/10
 #include <string>
 #include <vector>

+#include "qual_utils.h"
+
 using std::string;
 using std::vector;

@ -36,19 +38,19 @@ struct BQSRArg {
    /* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
    bool CREATE_INDEX = true;

+    string INDEX_FILE;
+
    nsbqsr::IndexFormat INDEX_FORMAT = nsbqsr::IndexFormat::BAI;

    /* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/
    bool ADD_PG_TAG_TO_READS = true;

-    //
+    // 命令行字符串
    string CLI_STR;

-    //
+    // 开始运行时间
    string START_TIME;

-    string PROGRAM_RECORD_ID = "FastBQSR";
-
    // reference file
    string REFERENCE_FILE;

@ -137,14 +139,6 @@ struct BQSRArg {
     */
    double BAQGOP = 40;

-    /**
-     * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in
-     * the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since base callers use these
-     * values to indicate random or bad bases. For example, Illumina writes Q2 bases when the machine has really gone wrong.
-     * This would be fine in and of itself, but when you select a subset of these reads based on their ability to align to the
-     * reference and their dinucleotide effect, your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream.
-     */
-    int PRESERVE_QSCORES_LESS_THAN = 6;

    /**
     * enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了，用不到了？
@ -152,7 +146,7 @@ struct BQSRArg {
    bool enableBAQ = false;

    /**
-     * compute-indel-bqsr-tables, compute indel BQSR tables"
+     * compute-indel-bqsr-tables, compute indel BQSR tables" hidden
     */
    bool computeIndelBQSRTables = false;

@ -162,16 +156,93 @@ struct BQSRArg {
    //
    // --------------------------------------------------------------------------------------------------------------

-    /**
-     * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which
-     * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ
-     * tag is present for a read, the standard qual score will be used.
-     */
-    bool useOriginalBaseQualities = false;

    /**
     * If reads are missing some or all base quality scores, this value will be used for all base quality scores.
     * By default this is set to -1 to disable default base quality assignment.
     */
    int8_t defaultBaseQualities = -1;
+
+
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // args for apply bqsr
+    string BQSR_FILE;
+
+    /**
+     * Turns on the base quantization module. It requires a recalibration report.
+     *
+     * A value of 0 here means "do not quantize".
+     * Any value greater than zero will be used to recalculate the quantization using that many levels.
+     * Negative values mean that we should quantize using the recalibration report's quantization level.
+     */
+    // @Argument(fullName = "quantize-quals", doc = "Quantize quality scores to a given number of levels", optional = true) 
+    int quantizationLevels = 0;
+
+    /**
+     * Static quantized quals are entirely separate from the quantize_qual option which uses dynamic binning.
+     * The two types of binning should not be used together.
+     *
+     * For example, the Warp germline pipeline uses the static bins { 10, 20, 30, 40 }
+     */
+    //@Advanced @Argument(fullName = static-quantized-quals,
+    // doc = "Use static quantized quality scores to a given number of levels (with -" +
+    // StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME + ")", optional = true, mutex = "quantize-quals")
+    vector<int> staticQuantizationQuals;
+
+    /**
+     * Round down quantized only works with the static_quantized_quals option, and should not be used with
+     * the dynamic binning option provided by quantize_quals.  When roundDown = false, rounding is done in
+     * probability space to the nearest bin.  When roundDown = true, the value is rounded to the nearest bin
+     * that is smaller than the current bin.
+     */
+    // @Advanced @Argument(fullName = "round-down-quantized", doc = "Round quals down to nearest quantized qual", optional = true,
+    // mutex = "quantize-quals") public 
+    bool roundDown = false;
+
+    /**
+     * The tool is capable of writing out the original quality scores of each read in the recalibrated output file
+     * under the "OQ" tag. By default, this behavior is disabled because emitting original qualities results in a
+     * significant increase of the file size. Use this flag to turn on emission of original qualities.
+     */
+    //@Argument(fullName = "emit-original-quals", doc = "Emit original base qualities under the OQ tag",
+    // optional = true)
+    bool emitOriginalQuals = false;
+
+    /**
+     * If specified, the value of this argument will be used as a flat prior for all mismatching quality scores instead
+     * of the reported quality score (assigned by the sequencer).
+     */
+    // @Argument(fullName = "global-qscore-prior", doc = "Global Qscore Bayesian prior to use for BQSR",
+    // optional = true)
+    double globalQScorePrior = -1.0;
+
+    /**
+     * If set to true, do not throw an error upon encountering a read with a read group that's not in the recalibration table.
+     * Instead, simply set the quantized original base qualities as the recalibrated base qualities.
+     */
+    // @Argument(fullName = allow-missing-read-group, doc = "Do not throw an error when encountering a read group not in the recal table",
+    // optional = true)
+    bool allowMissingReadGroups = false;
+
+    /**
+     * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out
+     * unmodified in the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since
+     * base callers use these values to indicate random or bad bases. For example, Illumina writes Q2 bases when the
+     * machine has really gone wrong. This would be fine in and of itself, but when you select a subset of these reads
+     * based on their ability to align to the reference and their dinucleotide effect, your Q2 bin can be elevated to
+     * Q8 or Q10, leading to issues downstream.
+     */
+    // @Argument(fullName = "preserve-qscores-less-than", doc = "Don't recalibrate bases with quality scores less than this threshold", optional = true,
+    // minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) 
+    int PRESERVE_QSCORES_LESS_THAN = QualityUtils::MIN_USABLE_Q_SCORE;
+
+    /**
+     * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which
+     * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ
+     * tag is present for a read, the standard quality score will be used.
+     */
+    // @Argument(fullName = use-original-qualities, shortName = "OQ", doc = "Use the base quality scores from the OQ tag",
+    // optional = true) 
+    bool useOriginalBaseQualities = false;
 };
--- a/src/bqsr/bqsr_entry.cpp
+++ b/src/bqsr/bqsr_entry.cpp
@ -1,13 +1,12 @@
 /*
-Description:
-bam，bam，bam
+Description: bqsr的第一阶段，生成协变量统计表的程序入口

 Copyright : All right reserved by ICT

 Author : Zhang Zhonghai
-Date : 2023/10/23
+Date : 2025/11/23
 */
-#include <header.h>
+#include <header.h> // in htslib
 #include <htslib/faidx.h>
 #include <htslib/kstring.h>
 #include <htslib/sam.h>
@ -44,11 +43,10 @@ Date : 2023/10/23
 #include "util/stable_array.h"
 #include "util/utils.h"
 #include "util/vcf_parser.h"
+#include "common_data.h"

 using std::deque;

-#define BAM_BLOCK_SIZE 16L * 1024 * 1024 // 16M
-
 namespace nsgv {

 // 全局变量 for bqsr
@ -58,29 +56,6 @@ sam_hdr_t* gInBamHeader;  // input BAM header
 vector<AuxVar> gAuxVars;  // auxiliary variables，保存一些文件，数据等，每个线程对应一个
 };  // namespace nsgv

-
-// 过滤掉bqsr过程不符合要求的bam数据
-bool bqsrReadFilterOut(const bam1_t *b) {
-    // 过滤掉unmapped的read
-    if (b->core.qual == 0) // mapping quality 0
-        return true;
-    if (b->core.qual == 255) // mapping quality not available
-        return true;
-    if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped
-        return true;
-    }
-    if (b->core.flag & BAM_FSECONDARY) { // secondary alignment
-        return true;
-    }
-    if (b->core.flag & BAM_FDUP) {  // secondary alignment
-        return true;
-    }
-    if (b->core.flag & BAM_FQCFAIL) {  // Not passing quality controls
-        return true;
-    }
-    return false;
-}
-
 // 数据总结
 void collapseQualityScoreTableToReadGroupTable(Array2D<RecalDatum> &byReadGroupTable, Array3D<RecalDatum> &byQualTable) {
    // 遍历quality table
@ -137,7 +112,7 @@ static void printRecalTables(const RecalTables& rt) {
 // 串行bqsr
 int SerialBQSR(AuxVar &aux) {
    BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
-    inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
+    inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut);
    int64_t readNumSum = 0;
    int round = 0;

@ -227,7 +202,7 @@ int SerialBQSR(AuxVar &aux) {

            // 8. 计算这条read对应的协变量
            PROF_START(GP_covariate);
-            CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, 0);
+            CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0);
            PROF_GP_END(GP_covariate);

            // fprintf(gf[4], "%ld %d\n", sd.rid, sd.read_len);
@ -251,7 +226,7 @@ int SerialBQSR(AuxVar &aux) {

            // 9. 计算这条read需要跳过的位置
            PROF_START(GP_read_vcf);
-            RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, 0);
+            RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, RecalFuncs::MAX_SITES_INTERVAL, skips, 0);
            for (int ii = 0; ii < sd.read_len; ++ii) {
                skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) ||
                            sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
@ -336,6 +311,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
    int stopIdx = std::min((size_t)(idx + 1) * blockReadNums, bams.size());
 #endif
    // spdlog::info("tid {}, index {}, steal {}", tid, idx, steal);
+    // spdlog::info("interval span: {}", bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos());
+    int sitesStride = bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos();
+    // sitesStride = sitesStride >= RecalFuncs::MAX_SITES_INTERVAL ? sitesStride : RecalFuncs::MAX_SITES_INTERVAL;
+    sitesStride = RecalFuncs::MAX_SITES_INTERVAL;
    aux.threadProcessedReads += stopIdx - startIdx;
    for (int i = startIdx; i < stopIdx; ++i) {
        // spdlog::info("Thread {} processing read idx: {}", tid, i);
@ -364,10 +343,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
        if (!baqCalculated) continue;

        PROF_START(TP_covariate);
-        CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, thid);
+        CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, thid);
        PROF_TP_END(TP_covariate);

-        RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, thid);
+        RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, sitesStride, skips, thid);
        for (int ii = 0; ii < sd.read_len; ++ii) {
            skips[ii] =
                skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
@ -390,7 +369,7 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
 // 并行bqsr
 int ParallelBQSR(vector<AuxVar>& auxArr) {
    BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
-    inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
+    inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut);
    int64_t readNumSum = 0;
    int round = 0;

@ -484,8 +463,8 @@ static void globalInit() {

    /* 并行读取bam数据  */
    htsThreadPool htsPoolRead = {NULL, 0};   //
-    int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, 8);
-    htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS);
+    int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD);
+    htsPoolRead.pool = hts_tpool_init(readThreadNum);
    if (!htsPoolRead.pool ) {
        spdlog::error("[{}] failed to set up thread pool", __LINE__);
        sam_close(nsgv::gInBamFp);
@ -549,9 +528,10 @@ static void globalInit() {
 // 全局资源释放
 static void globalDestroy() {
    close_debug_files();
+    sam_close(nsgv::gInBamFp);
 }

-// entrance of mark BQSR
+// entrance of BQSR phase-1
 int BaseRecalibrator() {
    int ret = 0;

@ -562,7 +542,6 @@ int BaseRecalibrator() {
    else
        ret = ParallelBQSR(nsgv::gAuxVars);  // 并行处理数据，生成recal table
    globalDestroy();
-    sam_close(nsgv::gInBamFp);
    PROF_GP_END(GP_whole_process);

    return ret;
--- a/src/bqsr/common_data.h
+++ b/src/bqsr/common_data.h
@ -0,0 +1,30 @@
+/*
+     Description: 共用的一些宏定义，全局变量等
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2026/01/01
+*/
+
+#pragma once
+
+#include "bqsr_args.h"
+#include <htslib/sam.h>
+#include "aux_arg.h"
+#include "fastbqsr_version.h"
+
+#define BAM_BLOCK_SIZE 16L * 1024 * 1024  // 16M
+#define BAM_READ_MAX_THREAD 8
+#define BAM_WRITE_MAX_THREAD 16
+
+#define PROGRAM_NAME "FastBQSR"
+
+namespace nsgv {
+
+// 全局变量 for bqsr
+extern BQSRArg gBqsrArg;         // bqsr arguments
+extern samFile* gInBamFp;        // input BAM file pointer
+extern sam_hdr_t* gInBamHeader;  // input BAM header
+extern vector<AuxVar> gAuxVars;  // auxiliary variables，保存一些文件，数据等，每个线程对应一个
+};  // namespace nsgv
--- a/src/bqsr/covariate.cpp
+++ b/src/bqsr/covariate.cpp
@ -250,19 +250,20 @@ void ContextCovariate::GetStrandedClippedBytes(SamData& sd, StableArray<char>& c
 * @param end    the end position in the array (exclusive)
 * @return the key representing the dna sequence
 */
-int ContextCovariate::KeyFromContext(const StableArray<char>& dna, const int start, const int end) {
-    int key = end - start;
-    int bitOffset = LENGTH_BITS;
-    for (int i = start; i < end; i++) {
-        const int baseIndex = baseIndexMap[dna[i] & 0xff];
-        if (baseIndex == -1) {  // ignore non-ACGT bases
-            return -1;
-        }
-        key |= (baseIndex << bitOffset);
-        bitOffset += 2;
-    }
-    return key;
-}
+// template<typename Arr> // 好像是类的模板函数必须在头文件里实现
+// int ContextCovariate::KeyFromContext(const Arr& dna, const int start, const int end) {
+//     int key = end - start;
+//     int bitOffset = LENGTH_BITS;
+//     for (int i = start; i < end; i++) {
+//         const int baseIndex = baseIndexMap[dna[i] & 0xff];
+//         if (baseIndex == -1) {  // ignore non-ACGT bases
+//             return -1;
+//         }
+//         key |= (baseIndex << bitOffset);
+//         bitOffset += 2;
+//     }
+//     return key;
+// }

 /**
 * For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
@ -334,6 +335,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const StableArray<char>& bas
 void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int originalReadLength = sd.read_len;

+    const char* qname = bam_get_qname(sd.bw->b);
    // store the original bases and then write Ns over low quality ones
    auto &strandedClippedBases = sd.strandedClippedBases;
    strandedClippedBases.copy(sd.bases);
@ -348,13 +350,13 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
    // since the context covariate may not span the entire set of values in read covariates
    // due to the clipping of the low quality bases
    // 这段代码应该不会执行，因为clip with N不会改变read长度
-    if (readLengthAfterClipping != originalReadLength) {
-        // don't bother zeroing out if we are going to overwrite the whole array
-        for (int i = 0; i < originalReadLength; i++) {
-            // this base has been clipped off, so zero out the covariate values here
-            CovariateUtils::SetContext(0, 0, 0, i, values);
-        }
-    }
+    // if (readLengthAfterClipping != originalReadLength) {
+    //     // don't bother zeroing out if we are going to overwrite the whole array
+    //     for (int i = 0; i < originalReadLength; i++) {
+    //         // this base has been clipped off, so zero out the covariate values here
+    //         CovariateUtils::SetContext(0, 0, 0, i, values);
+    //     }
+    // }

    const bool negativeStrand = sd.bw->GetReadNegativeStrandFlag();
    // Note: duplicated the loop to avoid checking recordIndelValues on each iteration
--- a/src/bqsr/covariate.h
+++ b/src/bqsr/covariate.h
@ -27,7 +27,7 @@ using std::vector;

 // 协变量的值, 4个协变量
 struct CovariateValues {
-    int readGroup = 0;
+    int readGroup = 0; // 默认是read group 0
    int baseQuality = 0;
    int context = -1;
    int cycle = -1;
@ -61,6 +61,16 @@ struct EventType {
    static EventTypeValue BASE_INSERTION;
    static EventTypeValue BASE_DELETION;
    static vector<EventTypeValue> EVENTS;
+    static int GetIndexForEventRep(char eventType) {
+        if (eventType == BASE_SUBSTITUTION.representation) {
+            return BASE_SUBSTITUTION.index;
+        } else if (eventType == BASE_INSERTION.representation) {
+            return BASE_DELETION.index;
+        } else if (eventType == BASE_DELETION.representation) {
+            return BASE_DELETION.index;
+        }
+        return 0;
+    }
 };

 // Read group协变量
@ -201,7 +211,22 @@ struct ContextCovariate {
    // 获取去除低质量分数碱基之后的read碱基序列（将低质量分数的碱基变成N）
    static void GetStrandedClippedBytes(SamData& ad, StableArray<char>& clippedBases, uint8_t lowQTail);
    // Creates a int representation of a given dna string.
-    static int KeyFromContext(const StableArray<char>& dna, const int start, const int end);
+    template <typename Arr>
+    static int KeyFromContext(const Arr& dna, const int start, const int end) {
+        int key = end - start;
+        int bitOffset = LENGTH_BITS;
+        for (int i = start; i < end; i++) {
+            const int baseIndex = baseIndexMap[dna[i] & 0xff];
+            if (baseIndex == -1) {  // ignore non-ACGT bases
+                return -1;
+            }
+            key |= (baseIndex << bitOffset);
+            bitOffset += 2;
+        }
+        return key;
+    }
+
+    static int KeyFromContext(const string& dna) { return KeyFromContext(dna, 0, dna.size()); }
    // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
    static void GetReadContextAtEachPosition(const StableArray<char>& bases, const int contextSize, const int mask, StableArray<int>& keys);

@ -219,6 +244,7 @@ struct CycleCovariate {

    static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }

+    static int KeyFromCycle(const int cycle) { return KeyFromCycle(cycle, MAXIMUM_CYCLE_VALUE); }
    /**
     * Encodes the cycle number as a key.
     */
--- a/src/bqsr/read_utils.h
+++ b/src/bqsr/read_utils.h
@ -0,0 +1,71 @@
+/*
+     Description: apply bqsr过程中对sam的一些修改函数
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2026/01/02
+*/
+#pragma once
+
+#include <htslib/sam.h>
+#include <spdlog/spdlog.h>
+
+struct ReadUtils {
+    // sam/bam文件中的quality score要在原来的quality上加33，应该是映射到字符区，方便用字母表示
+    // htslib解析之后已经减掉了，就是真实的质量分数
+    static constexpr int QUALITY_SCORE_ADD_IN_FILE = 33;
+    /**
+     * Resets the quality scores of the reads to the orginal (pre-BQSR) ones.
+     */
+    static void resetOriginalBaseQualities(bam1_t *b) {
+        uint8_t* oq = bam_aux_get(b, "OQ");
+        char* oqVal = nullptr;
+        if (oq)
+            oqVal = bam_aux2Z(oq);
+        int crg = 0;
+        if (oqVal == nullptr) {
+        } else {
+            uint8_t* quals = bam_get_qual(b);
+            for (int i = 0; i < b->core.l_qseq; ++i) {
+                quals[i] = oqVal[i] - QUALITY_SCORE_ADD_IN_FILE; // 要减掉33
+            }
+        }
+    }
+
+    // get read group id
+    static const char* getReadGroupId(bam1_t *b) {
+        uint8_t* rgStr = bam_aux_get(b, "RG");
+        char* rgVal = nullptr;
+        if (rgStr)
+            rgVal = bam_aux2Z(rgStr);
+        return rgVal;
+    }
+
+    // 将当前quals保存在OQ tag中 （如果当前read不存在OQ tag的情况下）
+    static void setOriginalBaseQualsIfNoOQ(bam1_t *b) {
+        uint8_t* tagData = bam_aux_get(b, "OQ");
+        if (tagData)
+            return;
+        char* tagVal = nullptr;
+        const char* oldQual = (char*)bam_get_qual(b);
+        string qual(b->core.l_qseq + 1, '\0');
+        for (int i = 0; i < qual.size() - 1; ++i) {
+            qual[i] = oldQual[i] + QUALITY_SCORE_ADD_IN_FILE;
+        }
+        // bam_aux_append 最后一个字符必须是'\0'
+        if (bam_aux_append(b, "OQ", 'Z', qual.size(), (const uint8_t*)qual.c_str()) != 0) {
+            spdlog::error("Add OQ (original quality score) tag failed. ");
+        }
+    }
+
+    // 移除给定的tag
+    static void removeAttribute(bam1_t *b, const string &tag) {
+        uint8_t* tagData = bam_aux_get(b, "OQ");
+        if (!tagData)
+            return;
+        if (bam_aux_remove(b, tagData) == nullptr) {
+            spdlog::error("Remove tag {} failed. ", tag);
+        }
+    }
+};
--- a/src/bqsr/recal_datum.h
+++ b/src/bqsr/recal_datum.h
@ -147,14 +147,18 @@ struct RecalDatum {
     */
    inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); }
    inline double getNumMismatches() const { return numMismatches / MULTIPLIER; }
+    inline void setNumMismatches(double val) { numMismatches = val * MULTIPLIER; }
    inline uint64_t getNumObservations() const { return numObservations; }
+    inline void setNumObservations(uint64_t val) { numObservations = val; }
    inline double getReportedQuality() const { return reportedQuality; }
+    inline void setReportedQuality(double val) { reportedQuality = val; }

    /**
     * Computes the empirical quality of the datum, using the reported quality as the prior.
     * @see #getEmpiricalQuality(double) below.
     */
    double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); }
+    inline void setEmpiricalQuality(double val) { empiricalQuality = val; }

    /**
     * Computes the empirical base quality (roughly (num errors)/(num observations)) from the counts stored in this datum.
--- a/src/bqsr/recal_funcs.h
+++ b/src/bqsr/recal_funcs.h
@ -8,19 +8,55 @@
 */
 #pragma once

-#include "util/stable_array.h"
-#include "util/sam_data.h"
+#include <spdlog/spdlog.h>
+#include <numeric>
+
 #include "bqsr/aux_arg.h"
 #include "util/bam_wrap.h"
-#include "util/interval.h"
-#include "util/vcf_parser.h"
-#include "util/profiling.h"
 #include "util/debug.h"
+#include "util/interval.h"
+#include "util/profiling.h"
+#include "util/sam_data.h"
+#include "util/stable_array.h"
+#include "util/vcf_parser.h"

 struct RecalFuncs {
-    static constexpr int MAX_SITES_INTERVAL = 100000;
+    //static constexpr int MAX_SITES_INTERVAL = 100000;
+    static constexpr int MAX_SITES_INTERVAL = 21500; // 经验值，这个数读取vcf和计算的性能最好
    static constexpr uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';

+    // 过滤掉bqsr过程不符合要求的bam数据
+    static bool bqsrReadFilterOut(const bam1_t* b) {
+        // 过滤掉unmapped的read
+        if (b->core.qual == 0)  // mapping quality 0
+            return true;
+        if (b->core.qual == 255)  // mapping quality not available
+            return true;
+        if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) {  // unmapped
+            return true;
+        }
+        if (b->core.flag & BAM_FSECONDARY) {  // secondary alignment
+            return true;
+        }
+        if (b->core.flag & BAM_FDUP) {  // secondary alignment
+            return true;
+        }
+        if (b->core.flag & BAM_FQCFAIL) {  // Not passing quality controls
+            return true;
+        }
+        return false;
+    }
+
+    // 过滤掉apply bqsr过程不符合要求的bam数据
+    static bool applyBqsrReadFilterOut(const bam1_t* b) {
+        // 好像4.6.2的GATK版本的welformed filter啥也没过滤
+        // b->core.flag & BAM_FUNMAP ||
+        if ((b->core.tid == -1 || b->core.pos == -1) && !(b->core.flag & BAM_FMUNMAP)) {  // unmapped
+            return true;
+        }
+        return false;
+    }
+
    // 设置某个位置是indel
    static inline void updateIndel(StableArray<int>& isIndel, int index) {
        if (index >= 0 && index < isIndel.size()) {
@ -94,18 +130,16 @@ struct RecalFuncs {
    }

    // 获取一行字符串
-    static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* line) {
-        line->clear();
-        if (*cur >= total)
+    static void get_line_from_buf(char* buf, int64_t total, int64_t* cur) {
+        if (*cur >= total) {
+            (*cur)++;
            return;
-        char b;
-        while (*cur < total && (b = buf[(*cur)++]) != '\n') {
-            line->push_back(b);
        }
+        while (*cur < total && buf[(*cur)++] != '\n');
    }

    // 计算与read有交叉的已知位点信息， 应该要判断一下，是按照read的范围去读取vcf，还是按照一个batch read的范围去读取
-    static void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, sam_hdr_t* samHdr, StableArray<uint8_t>& knownSites, int thid) {
+    static void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, sam_hdr_t* samHdr, int sitesStride, StableArray<uint8_t>& knownSites, int thid) {
        BamWrap* bw = sd.bw;
        int tid = bw->contig_id();
        int64_t startPos = bw->start_pos();  // 闭区间，使用clip之前的read匹配的范围
@ -113,7 +147,6 @@ struct RecalFuncs {
        knownSites.resize_fill(sd.read_len, 0);

        // update vcfs
-        // int idx = 0;
        PROF_START(TP_read_vcf);
        for (auto& vcf : vcfs) {
            if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam，不需要读取
@ -130,7 +163,7 @@ struct RecalFuncs {
            vcf.knownSites.clear(); // 清空，因为后面会读入覆盖bam的所有vcf位点
            // 读取新的interval
            int64_t fpos, flen;
-            endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
+            endPos = std::max(startPos + sitesStride, endPos);
            Interval readIntv(startPos, endPos);
            vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
            // fprintf(gf[thid * 2 + idx], "%s %d %ld %ld %ld\n", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid, fpos, flen);
@ -143,10 +176,12 @@ struct RecalFuncs {
                }
                char* buf = vcf.buf;
                vcf.inStm.read(buf, flen);
-                string line;
                int64_t cur = 0;
-                get_line_from_buf(buf, flen, &cur, &line);
-                while (line.size() > 0) {
+                int64_t start = 0;
+                get_line_from_buf(buf, flen, &cur);
+                while (cur > start + 1) {
+                    const string line(buf + start, buf + cur - 1);
+                    // spdlog::info("s: {}, e: {}, line: {}", start, cur, line);
                    stringstream ss_line(line);
                    string stid;
                    int tid, pos;
@ -159,7 +194,8 @@ struct RecalFuncs {
                    if (varIntv.overlaps(readIntv)) {
                        vcf.knownSites.push_back(varIntv);  // 闭区间
                    }
-                    get_line_from_buf(buf, flen, &cur, &line);
+                    start = cur;
+                    get_line_from_buf(buf, flen, &cur);
                }
            }
        }
@ -195,7 +231,6 @@ struct RecalFuncs {
                    knownSites[i] = true;
                }
            }
-            //idx += 1;
        }
        PROF_TP_END(TP_calc_skips);
        //fprintf(gf[0], "\n");
@ -241,4 +276,53 @@ struct RecalFuncs {
            calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs);
        }
    }
+
+
+    // for apply bqsr
+    /**
+     * Quality score recalibration algorithm works as follows:
+     * - Start with the (approximate, or "estimated") reported quality score. (Approximation occurs when marginalizing/collapsing
+     * over the reported qualities for each read group).
+     * - Compute (retrieve?) the empirical quality score using the per-read group datum (i.e. counts). Call it y_1.
+     * - Use y_1 just computed as the prior for the empirical quality score for the datum for the 2-tuple ( read group, quality score). Call it y_2.
+     * - Use y_2 as the prior to compute the empirical quality for the 3-tuple ( read-group, quality-score, special covariate ). Call it y_3 for the
+     * context covariate. Similarly define y_4 for the cycle covariate. Let d_3 = y_3 - y_2, d_4 = y_4 - y_2.
+     * - (final recalibrated score) = y_2 + d_3 + d_4 = y_3 + y_4 - y_2.
+     *
+     * @param priorQualityScore the prior quality score (in log space). It is either the "estimated" or collapsed reported quality score
+     *                          for the read group, or the constant prior if given. This value has type double because of the "combine" (or collapse)
+     * operation that collapses the quality scores represented within the same read group.
+     * @param readGroupDatum the RecalDatum object for a particular read group at hand. May be null.
+     * @param qualityScoreDatum the RecalDatum object for a particular (read group, reported quality) tuple at hand. May be null.
+     * @param specialCovariateDatums the array of RecalDatum objects for the non-required covariates (cycle and context covariates by default).
+     * @return
+     */
+    static double hierarchicalBayesianQualityEstimate(double priorQualityScore, RecalDatum& readGroupDatum, RecalDatum& qualityScoreDatum,
+                                               RecalDatum& contextDatum, RecalDatum& cycleDatum) {
+        double empiricalQualityForReadGroup =
+            readGroupDatum.getNumObservations() == 0 ? priorQualityScore : readGroupDatum.getEmpiricalQuality(priorQualityScore);
+        double posteriorEmpiricalQualityForReportedQuality = qualityScoreDatum.getNumObservations() == 0
+                                                                 ? empiricalQualityForReadGroup
+                                                                 : qualityScoreDatum.getEmpiricalQuality(empiricalQualityForReadGroup);
+
+        double deltaSpecialCovariates = 0.0;
+        // At this point we stop being iterative; the special covariates (context and cycle by default) are treated differently.
+        if (contextDatum.getNumObservations() > 0) {
+            // TODO: the prior is ignored if the empirical quality for the datum is already cached.
+            deltaSpecialCovariates +=
+                contextDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality;
+        }
+        if (cycleDatum.getNumObservations() > 0) {
+            // TODO: the prior is ignored if the empirical quality for the datum is already cached.
+            deltaSpecialCovariates +=
+                cycleDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality;
+        }
+
+        return posteriorEmpiricalQualityForReportedQuality + deltaSpecialCovariates;
+    }
+
+    // recalibrated quality is bound between 1 and MAX_QUAL
+    static uint8_t getBoundedIntegerQual(double recalibratedQualDouble) {
+        return QualityUtils::boundQual(MathUtils::fastRound(recalibratedQualDouble), RecalDatum::MAX_RECALIBRATED_Q_SCORE);
+    }
 };
--- a/src/bqsr/recal_tables.h
+++ b/src/bqsr/recal_tables.h
@ -11,9 +11,10 @@
 #include "covariate.h"
 #include "nested_array.h"
 #include "recal_datum.h"
+#include "qual_utils.h"

 struct RecalTables {
-    int qualDimension = 94;  // MAX_SAM_QUAL_SCORE(93) + 1
+    int qualDimension = QualityUtils::MAX_SAM_QUAL_SCORE + 1;  // MAX_SAM_QUAL_SCORE(93) + 1
    int eventDimension = EventType::EVENT_SIZE;
    int numReadGroups;

@ -41,4 +42,22 @@ struct RecalTables {
        contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1);
        cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1);
    }
+
+    void initReadGroupTable(int _numReadGroups) {
+        numReadGroups = _numReadGroups;
+        readGroupTable.init(eventDimension, numReadGroups);
+    }
+
+    // 必须在调用initReadGroupTable之后，或者设置了numReadGroups之后
+    void initQualityScoreTable() {
+        qualityScoreTable.init(eventDimension, numReadGroups, qualDimension);
+    }
+
+    void initContextTable() {
+        contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1);
+    }
+
+    void initCycleTable() {
+        cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1);
+    }
 };
--- a/src/bqsr/recal_utils.h
+++ b/src/bqsr/recal_utils.h
@ -11,6 +11,8 @@
 #include <spdlog/spdlog.h>

 #include <cstdio>
+#include <iostream>
+#include <fstream>

 #include "bqsr_args.h"
 #include "nested_array.h"
@ -79,25 +81,6 @@ struct RecalUtils {
                }
            }
        }
-        // fprintf(gf[4], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
-        // _Foreach3D(qualityScoreTable, val, {
-        //     if (val.numObservations > 0)
-        //         fprintf(gf[4], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
-        // });
-        // fprintf(gf[4], "\n");
-
-        // fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
-        // for (auto& arr1 : contextTable.data) {
-        //     for (size_t si = 0; si < arr1.size(); ++si) {
-        //         for (auto &arr2 : arr1[si]) {
-        //             for (auto& val : arr2) {
-        //                 if (val.numObservations > 0)
-        //                     fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
-        //             }
-        //         }
-        //     }
-        // }
-        // fprintf(gf[3], "\n");
    }

    // 输出bqsr报告
@ -110,7 +93,7 @@ struct RecalUtils {
            exit(1);
        }
        // 输出version信息
-        fprintf(fpout, "%s\n", REPORT_HEADER_VERSION);
+        fprintf(fpout, "%s:%s\n", REPORT_HEADER_VERSION, REPORT_HEADER_MINOR_VERSION);
        // 输出参数信息
        outputArgsTable(RAC, fpout);
        // 输出量化质量分数信息
@ -268,4 +251,220 @@ struct RecalUtils {
        });
        table.write(fpout);
    }
+
+    // 解析bqsr table文件
+    static void readRecalTables(const string& recalTableFn, BQSRArg& RAC, vector<uint8_t>& quantizedScore, RecalTables& recalTables) {
+        std::ifstream infs(recalTableFn, ifstream::in);
+        string line;
+        // 读取文件头
+        std::getline(infs, line);
+        spdlog::info("header: {}", line);
+        if (line.find(REPORT_HEADER_VERSION) != 0) {
+            spdlog::error("BQSR table version not supported! {} {}", line, __LINE__);
+            exit(1);
+        }
+
+        // 读取参数表
+        readArgsTable(infs, RAC);
+
+        readQuantTable(infs, quantizedScore);
+
+        readReadGroupTable(infs, recalTables);
+
+        readQualityScoreTable(infs, recalTables);
+
+        readContextCycleTable(infs, recalTables);
+
+        infs.close();
+    }
+
+    template<typename Arr>
+    static void split(const string& str, Arr& dat) {
+        dat.clear();
+        std::istringstream iss(str);
+        string item;
+        while (iss >> item) {
+            dat.push_back(item);
+        }
+    }
+
+    template <typename Arr>
+    static void split(const string& str, const char delimiter, Arr& dat) {
+        dat.clear();
+        std::istringstream iss(str);
+        string token;
+        while (std::getline(iss, token, delimiter)) {
+            dat.push_back(token);
+        }
+    }
+
+    // 读取参数表
+    static void readArgsTable(std::ifstream& infs, BQSRArg& p) {
+        string line;
+        StableArray<string> dat;
+        std::getline(infs, line);
+        while (line.empty() && std::getline(infs, line));  // 跳过空行
+        split(line, ':', dat);
+        int argRowCnt = std::stoi(dat[3]);
+        std::getline(infs, line); // 忽略表格说明的一行
+        std::getline(infs, line); // 忽略表头column说明
+        for (int i = 0; i < argRowCnt; ++i) {
+            std::getline(infs, line);
+            split(line, dat);
+            if ("covariate" == dat[0]) {
+            } else if ("no_standard_covs" == dat[0]) {
+                p.DO_NOT_USE_STANDARD_COVARIATES = ReportUtil::ToBool(dat[1]);
+            } else if ("run_without_dbsnp" == dat[0]) {
+                p.RUN_WITHOUT_DBSNP = ReportUtil::ToBool(dat[1]);
+            } else if ("solid_recal_mode" == dat[0]) {
+                p.SOLID_RECAL_MODE = ReportUtil::ParseString(dat[1]);
+            } else if ("solid_nocall_strategy" == dat[0]) {
+                p.SOLID_NOCALL_STRATEGY = ReportUtil::ParseString(dat[1]);
+            } else if ("mismatches_context_size" == dat[0]) {
+                p.MISMATCHES_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]);
+            } else if ("indels_context_size" == dat[0]) {
+                p.INDELS_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]);
+            } else if ("mismatches_default_quality" == dat[0]) {
+                p.MISMATCHES_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
+                // spdlog::info("int8_t : {}", (int)p.MISMATCHES_DEFAULT_QUALITY);
+            } else if ("deletions_default_quality" == dat[0]) {
+                p.DELETIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
+            } else if ("insertions_default_quality" == dat[0]) {
+                p.INSERTIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
+            } else if ("maximum_cycle_value" == dat[0]) {
+                p.MAXIMUM_CYCLE_VALUE = ReportUtil::ToInt(dat[1]);
+            } else if ("low_quality_tail" == dat[0]) {
+                p.LOW_QUAL_TAIL = ReportUtil::ToInt(dat[1]);
+            } else if ("default_platform" == dat[0]) {
+                p.DEFAULT_PLATFORM = ReportUtil::ParseString(dat[1]);
+            } else if ("force_platform" == dat[0]) {
+                p.FORCE_PLATFORM = ReportUtil::ParseString(dat[1]);
+            } else if ("quantizing_levels" == dat[0]) {
+                p.QUANTIZING_LEVELS = ReportUtil::ToInt(dat[1]);
+            } else if ("recalibration_report" == dat[0]) {
+                p.existingRecalibrationReport = ReportUtil::ParseString(dat[1]);
+            } else if ("binary_tag_name" == dat[0]) {
+                p.BINARY_TAG_NAME = ReportUtil::ParseString(dat[1]);
+                // spdlog::info("BINARY_TAG_NAME : {}", p.BINARY_TAG_NAME);
+            } else {
+                spdlog::error("unknown argument : {} {}", dat[0], dat[1]);
+            }
+        }
+    }
+
+    // 读取量化质量分数信息表
+    static void readQuantTable(std::ifstream& infs, vector<uint8_t>& quantizedScore) {
+        string line;
+        StableArray<string> dat;
+        std::getline(infs, line);
+        while (line.empty() && std::getline(infs, line)); // 跳过空行
+        split(line, ':', dat);
+        int argRowCnt = std::stoi(dat[3]);
+        std::getline(infs, line);  // 忽略表格说明的一行
+        std::getline(infs, line);  // 忽略表头column说明
+        quantizedScore.resize(argRowCnt);
+        for (int i = 0; i < argRowCnt; ++i) {
+            std::getline(infs, line);
+            // spdlog::info("{}", line);
+            split(line, dat);
+            quantizedScore[ReportUtil::ToInt(dat[0])] = ReportUtil::ToInt(dat[2]);
+            // spdlog::info("{}, {}", ReportUtil::ToInt(dat[0]), quantizedScore[ReportUtil::ToInt(dat[0])]);
+        }
+    }
+
+    // 读取read group表
+    static void readReadGroupTable(std::ifstream& infs, RecalTables& recalTables) {
+        string line;
+        StableArray<string> dat;
+        std::getline(infs, line);
+        while (line.empty() && std::getline(infs, line));  // 跳过空行
+        split(line, ':', dat);
+        int argRowCnt = std::stoi(dat[3]);
+        std::getline(infs, line);  // 忽略表格说明的一行
+        std::getline(infs, line);  // 忽略表头column说明
+        vector<vector<string>> dats(argRowCnt);
+        int rgId = 0;
+        for (int i = 0; i < argRowCnt; ++i) {
+            std::getline(infs, line);
+            split(line, dats[i]);
+            if (ReadGroupCovariate::RgToId.find(dats[i][0]) == ReadGroupCovariate::RgToId.end()) {
+                ReadGroupCovariate::RgToId[dats[i][0]] = rgId;
+                ReadGroupCovariate::IdToRg[rgId] = dats[i][0];
+                ++rgId;
+            }
+        }
+        recalTables.initReadGroupTable(rgId);
+        // spdlog::info("read group num: {}, {}", rgId, ReadGroupCovariate::IdToRg[0]);
+        for (int i = 0; i < dats.size(); ++i) {
+            int k2 = ReadGroupCovariate::RgToId[dats[i][0]];
+            int k1 = EventType::GetIndexForEventRep(dats[i][1][0]);
+            auto& datum = recalTables.readGroupTable(k1, k2);
+            datum.setEmpiricalQuality(ReportUtil::ToDouble(dats[i][2]));
+            datum.setReportedQuality(ReportUtil::ToDouble(dats[i][3]));
+            datum.setNumObservations(ReportUtil::ToUint64(dats[i][4]));
+            datum.setNumMismatches(ReportUtil::ToDouble(dats[i][5]));
+        }
+    }
+
+    // 读取质量分数表
+    static void readQualityScoreTable(std::ifstream& infs, RecalTables& recalTables) {
+        string line;
+        StableArray<string> dat;
+        std::getline(infs, line);
+        while (line.empty() && std::getline(infs, line));  // 跳过空行
+        split(line, ':', dat);
+        int argRowCnt = std::stoi(dat[3]);
+        std::getline(infs, line);  // 忽略表格说明的一行
+        std::getline(infs, line);  // 忽略表头column说明
+        recalTables.initQualityScoreTable();
+        int rgId = 0;
+        for (int i = 0; i < argRowCnt; ++i) {
+            std::getline(infs, line);
+            split(line, dat);
+            int k2 = ReadGroupCovariate::RgToId[dat[0]];
+            int k3 = ReportUtil::ToInt(dat[1]);
+            int k1 = EventType::GetIndexForEventRep(dat[2][0]);
+            auto& datum = recalTables.qualityScoreTable(k1, k2, k3);
+            datum.setEmpiricalQuality(ReportUtil::ToDouble(dat[3]));
+            datum.setReportedQuality((double)k3);
+            datum.setNumObservations(ReportUtil::ToUint64(dat[4]));
+            datum.setNumMismatches(ReportUtil::ToDouble(dat[5]));
+        }
+    }
+
+    // 读取context和cycle表
+    static void readContextCycleTable(std::ifstream& infs, RecalTables& recalTables) {
+        string line;
+        StableArray<string> dat;
+        std::getline(infs, line);
+        while (line.empty() && std::getline(infs, line));  // 跳过空行
+        split(line, ':', dat);
+        int argRowCnt = std::stoi(dat[3]);
+        std::getline(infs, line);  // 忽略表格说明的一行
+        std::getline(infs, line);  // 忽略表头column说明
+        recalTables.initContextTable();
+        recalTables.initCycleTable();
+        for (int i = 0; i < argRowCnt; ++i) {
+            std::getline(infs, line);
+            split(line, dat);
+            int k2 = ReadGroupCovariate::RgToId[dat[0]];
+            int k3 = ReportUtil::ToInt(dat[1]);
+            int k4 = 0;
+            int k1 = EventType::GetIndexForEventRep(dat[4][0]);
+            RecalDatum* dp;
+            if (dat[3] == "Cycle") {
+                k4 = CycleCovariate::KeyFromCycle(ReportUtil::ToInt(dat[2]));
+                dp = &recalTables.cycleTable(k1, k2, k3, k4);
+            } else if (dat[3] == "Context") {
+                k4 = ContextCovariate::KeyFromContext(dat[2]);
+                dp = &recalTables.contextTable(k1, k2, k3, k4);
+            } else {
+                spdlog::error("Unknown CovariateName {}", dat[3]);
+            }
+            dp->setEmpiricalQuality(ReportUtil::ToDouble(dat[5]));
+            dp->setReportedQuality((double)k3);
+            dp->setNumObservations(ReportUtil::ToUint64(dat[6]));
+            dp->setNumMismatches(ReportUtil::ToDouble(dat[7]));
+        }
+    }
 };
--- a/src/main.cpp
+++ b/src/main.cpp
@ -12,10 +12,7 @@
 #include "fastbqsr_version.h"
 #include "bqsr/bqsr_args.h"
 #include "util/profiling.h"
-
-namespace nsgv {
-extern BQSRArg gBqsrArg;
-};
+#include "bqsr/common_data.h"

 int BaseRecalibrator();
 int ApplyBQSR();
@ -30,28 +27,18 @@ string get_current_time_str() {
    return string(now);
 }

-int main_BaseRecalibrator(int argc, char *argv[]) {
-    // init arg parser
-    argparse::ArgumentParser program(nsgv::gBqsrArg.PROGRAM_RECORD_ID, FASTBQSR_VERSION, argparse::default_arguments::none);
-    program.add_description(
-        "First pass of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n"
-        "user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)");
-
+// 添加bqsr和apply bqsr的共同使用的参数
+static void addCommonArgs(argparse::ArgumentParser &program, const char *outputDesc) {
    program.add_argument("--input")
        .help("BAM/SAM/CRAM file containing reads  This argument must be specified at least once.")
        .metavar("<INPUT>")
        .required();

    program.add_argument("--output")
-        .help("The output recalibration table file to create.")
+        .help(outputDesc)
        .metavar("<OUTPUT>")
        .required();

-    program.add_argument("--reference")
-        .help("Reference sequence file.")
-        .metavar("<Reference>")
-        .required();
-
    program.add_argument("--num-threads")
        .help("Number of threads to use.")
        .scan<'i', int>()
@ -59,6 +46,66 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
        .nargs(1)
        .metavar("<NUM_THREADS>");

+    program.add_argument("--reference")
+        .help("Reference sequence file.")
+        .metavar("<Reference>")
+        .required();
+}
+
+// 添加帮助和版本信息
+static void addHelpVersion(argparse::ArgumentParser& program) {
+    program.add_argument("-h", "--help")
+        .action([&](const auto& /*unused*/) {
+            std::cout << program.help().str();
+            std::exit(0);
+        })
+        .default_value(false)
+        .help("shows help message and exits")
+        .implicit_value(true)
+        .nargs(0);
+
+    program.add_argument("-v", "--version")
+        .action([&](const auto& /*unused*/) {
+            std::cout << FASTBQSR_VERSION << std::endl;
+            std::exit(0);
+        })
+        .default_value(false)
+        .help("prints version information and exits")
+        .implicit_value(true)
+        .nargs(0);
+}
+
+// 添加运行时间信息和命令行参数列表
+static void addTimeCLI(int argc, char* argv[]) {
+    nsgv::gBqsrArg.START_TIME = get_current_time_str();
+    nsgv::gBqsrArg.CLI_STR = argv[0];
+    for (int i = 1; i < argc; ++i) {
+        nsgv::gBqsrArg.CLI_STR += " " + std::string(argv[i]);
+    }
+    // spdlog::info("cmd: {}", nsgv::gBqsrArg.CLI_STR);
+}
+
+// 解析公共参数
+static void parseCommonArgs(argparse::ArgumentParser& program) {
+    nsgv::gBqsrArg.INPUT_FILE = program.get("--input");
+    nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
+    nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
+    if (nsgv::gBqsrArg.NUM_THREADS < 1) {
+        spdlog::error("num-threads must be positive.");
+        exit(1);
+    }
+    nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
+}
+
+int main_BaseRecalibrator(int argc, char *argv[]) {
+    // init arg parser
+    argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none);
+    program.add_description(
+        "First phase of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n"
+        "user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)");
+
+    addCommonArgs(program, "The output recalibration table file to create.");
+
    program.add_argument("--known-sites")
        .help(
            "One or more databases of known polymorphic sites used to exclude regions around known polymorphisms from "
@ -66,6 +113,59 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
        .metavar("<KnownSites>")
        .nargs(argparse::nargs_pattern::any);

+    program.add_argument("--enable-baq")
+        .help("Whether to do BAQ correction.")
+        .default_value(false)
+        .implicit_value(true)
+        .hidden();
+
+    program.add_argument("--compute-indel-bqsr-tables")
+        .help("Whether to compute indel BQSR tables.")
+        .default_value(false)
+        .implicit_value(true)
+        .hidden();
+
+        // add help and version args
+    addHelpVersion(program);
+
+    try {
+        program.parse_args(argc, argv);
+        parseCommonArgs(program);
+
+        nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
+        nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
+        nsgv::gBqsrArg.computeIndelBQSRTables = program.get<bool>("--compute-indel-bqsr-tables");
+        // spdlog::info("known sites vcf files:");
+        // for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
+        //     spdlog::info("  {}", ks);
+        // }
+    } catch (const std::exception &err) {
+        spdlog::error(err.what());
+        return 1;
+    }
+
+    spdlog::info("fast base recalibration phase-1 start");
+    BaseRecalibrator();
+    spdlog::info("fast base recalibration phase-1 end");
+
+    DisplayProfilingBQSR(nsgv::gBqsrArg.NUM_THREADS);
+
+    return 0;
+}
+
+int main_ApplyBQSR(int argc, char* argv[]) {
+    // init arg parser
+    argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none);
+    program.add_description(
+        "Second phase of the Base Quality Score Recalibration (BQSR) -- Apply the bqsr table to generate recalibrated bam file.)");
+    
+    addCommonArgs(program, "The output bam/sam file to create.");
+
+    program.add_argument("--bqsr")
+        .help("Input recalibration table for BQSR.")
+        .metavar("<BQSR>")
+        .required();
+
    program.add_argument("--create-index")
        .help("Whether to create an index when writing coordinate sorted BAM output.")
        .default_value(false)
@ -78,83 +178,37 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
        .nargs(1)
        .metavar("<IndexFormat>");

-    program.add_argument("--enable-baq")
-        .help("Whether to do BAQ correction.")
-        .default_value(false)
-        .implicit_value(true)
-        .hidden();
-
-    // add help and version args
-    program.add_argument("-h", "--help")
-        .action([&](const auto & /*unused*/) {
-            std::cout << program.help().str();
-            std::exit(0);
-        })
-        .default_value(false)
-        .help("shows help message and exits")
-        .implicit_value(true)
-        .nargs(0);
-
-    program.add_argument("-v", "--version")
-        .action([&](const auto & /*unused*/) {
-            std::cout << FASTBQSR_VERSION << std::endl;
-            std::exit(0);
-        })
-        .default_value(false)
-        .help("prints version information and exits")
-        .implicit_value(true)
-        .nargs(0);
-
-    // std::cout << program << std::endl;
-
-    nsgv::gBqsrArg.START_TIME = get_current_time_str();
-    nsgv::gBqsrArg.CLI_STR = argv[0];
-    for (int i = 1; i < argc; ++i) {
-        nsgv::gBqsrArg.CLI_STR += " " + std::string(argv[i]);
-    }
-
+    addHelpVersion(program);
    try {
        program.parse_args(argc, argv);
-        nsgv::gBqsrArg.INPUT_FILE = program.get("--input");
-        nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
-        nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
-        if (nsgv::gBqsrArg.NUM_THREADS < 1) {
-            spdlog::error("num-threads must be positive.");
-            exit(1);
-        }
+        parseCommonArgs(program);
+        nsgv::gBqsrArg.BQSR_FILE = program.get("--bqsr");
        nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
-        nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
-        nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
-        nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
-        // spdlog::info("known sites vcf files:");
-        // for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
-        //     spdlog::info("  {}", ks);
-        // }
+        string idxFormat = program.get("--index-format");
+        std::transform(idxFormat.begin(), idxFormat.end(), idxFormat.begin(), ::toupper);
+        nsgv::gBqsrArg.INDEX_FORMAT = idxFormat == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI;

-        nsgv::gBqsrArg.INDEX_FORMAT =
-            program.get("--index-format") == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI;
-
-    } catch (const std::exception &err) {
+    } catch (const std::exception& err) {
        spdlog::error(err.what());
        return 1;
    }

-    spdlog::info("fast base recalibration phase-1 start");
-    BaseRecalibrator();
-    spdlog::info("fast base recalibration phase-1 end");
+    spdlog::info("fast base recalibration phase-2 start");
+    ApplyBQSR();
+    spdlog::info("fast base recalibration phase-2 end");

-    DisplayProfiling(nsgv::gBqsrArg.NUM_THREADS);
+    // DisplayProfilingApplyBQSR(nsgv::gBqsrArg.NUM_THREADS);
+    DisplayProfilingApplyBQSR(1);

    return 0;
 }

-int main_ApplyBQSR(int argc, char* argv[]) { return 0; }
-
 int main(int argc, char *argv[]) {
    // init log
    spdlog::set_default_logger(spdlog::stderr_color_st("fastbqsr"));
    spdlog::cfg::load_env_levels();

+    addTimeCLI(argc, argv);
    string bqsr_prog = argv[1];
    if (bqsr_prog == "BaseRecalibrator") {
        return main_BaseRecalibrator(argc - 1, argv + 1);
--- a/src/util/profiling.cpp
+++ b/src/util/profiling.cpp
@ -45,8 +45,7 @@ static int CalcThreadTime(uint64_t *a, int len, double *max, double *min, double
        fprintf(stderr, "time %-15s:  avg %0.2lfs min %0.2lfs max %0.2lfs\n", #tpname, avgTime, minTime, maxTime); \
    }

-int DisplayProfiling(int nthread) {
-
+int DisplayProfilingBQSR(int nthread) {
 #ifdef SHOW_PERF
    fprintf(stderr, "\n");
    PRINT_GP(GP_read);
@ -80,6 +79,27 @@ int DisplayProfiling(int nthread) {
    }
    PRINT_GP(GP_whole_process);

+    fprintf(stderr, "\n");
+#endif
+
+    return 0;
+}
+
+int DisplayProfilingApplyBQSR(int nthread) {
+#ifdef SHOW_PERF
+    fprintf(stderr, "\n");
+    PRINT_GP(GP_read);
+    if (nthread == 1) {
+        PRINT_GP(GP_covariate);
+    } else {
+        PRINT_TP(TP_covariate);
+        PRINT_TP(TP_readgroup);
+        PRINT_TP(TP_qualityscore);
+        PRINT_TP(TP_context);
+        PRINT_TP(TP_cycle);
+    }
+    PRINT_GP(GP_whole_process);
+
    fprintf(stderr, "\n");
 #endif

--- a/src/util/profiling.h
+++ b/src/util/profiling.h
@ -79,7 +79,8 @@ enum {

 uint64_t RealtimeMsec(void);

-int DisplayProfiling(int);
+int DisplayProfilingBQSR(int);
+int DisplayProfilingApplyBQSR(int);

 #ifdef __cplusplus
 }
--- a/src/util/report_table.h
+++ b/src/util/report_table.h
@ -22,24 +22,46 @@ using std::string;
 using std::stringstream;
 using std::vector;

-#define REPORT_HEADER_VERSION "#:GATKReport.v1.1:5"
+#define REPORT_HEADER_VERSION "#:GATKReport.v1.1"
+#define REPORT_HEADER_MINOR_VERSION "5"

 struct ReportUtil {
-    static string ToString(const bool val) { return val ? "true" : "false"; }
-    static string ToString(const char val) {
+    static inline string ToString(const bool val) { return val ? "true" : "false"; }
+    static inline string ToString(const char val) {
        string s = "";
        s += val;
        // spdlog::info("char: {}, str: {}", val, s);
        return s;
    }
-    static string ToString(const string& val) { return val == "" ? "null" : val; }
-    static string ToString(const double val, int precise) {
+    static inline string ToString(const string& val) { return val == "" ? "null" : val; }
+    static inline string ToString(const double val, int precise) {
        stringstream ss;
        ss << std::fixed << std::setprecision(precise) << val;
        return ss.str();
    }
    template<typename T>
-    static string ToString(const T val) { return std::to_string(val); }
+    static inline string ToString(const T val) { return std::to_string(val); }
+
+    // 转换成bool
+    static inline bool ToBool(const string &val) {
+        if (val == "false" || val == "False" || val == "FALSE" || val.empty())
+            return false;
+        return true;
+    }
+
+    static inline string ParseString(const string &val) {
+        if (val == "null")
+            return "";
+        return val;
+    }
+
+    static inline int ToInt(const string& val) { return std::stoi(val); }
+
+    static inline double ToDouble(const string& val) { return std::stod(val); }
+
+    static inline int64_t ToInt64(const string& val) { return std::stoll(val); }
+
+    static inline uint64_t ToUint64(const string& val) { return std::stoull(val); }
 };

 struct ReportTable {
--- a/src/util/sam_data.h
+++ b/src/util/sam_data.h
@ -54,7 +54,8 @@ struct SamData {
    StableArray<uint8_t> base_quals;  // 对应的质量分数
    StableArray<uint8_t> ins_quals;   // insert质量分数, BI (大部分应该都没有)
    StableArray<uint8_t> del_quals;   // delete质量分数, BD (大部分应该都没有)
-    StableArray<Cigar> cigars;
+    StableArray<Cigar> cigars; // clip 之后的cigar
+    StableArray<uint8_t> recaled_quals; // 保存校正之后的质量分数

    // 用作临时buffer
    StableArray<char> strandedClippedBases; // for context covariate
@ -87,6 +88,20 @@ struct SamData {
        end_pos = 0;
    }

+    // 解析apply bqsr里用到的信息
+    void parseForApplyBQSR(BamWrap *_bw) {
+        bw = _bw;
+        read_len = bw->b->core.l_qseq;
+        bases.resize(read_len);
+        base_quals.resize(read_len);
+        uint8_t* seq = bam_get_seq(bw->b);
+        uint8_t* quals = bam_get_qual(bw->b);
+        for (int i = 0; i < read_len; ++i) {
+            bases[i] = BaseUtils::cBaseToChar[bam_seqi(seq, i)];
+            base_quals[i] = quals[i];
+        }
+    }
+
    // 初步解析bam
    void parseBasic(BamWrap *_bw) {
        bw = _bw;