重构了一下，大数据问题还没解决

2025-12-29 19:36:38 +08:00 · 2025-12-29 19:36:38 +08:00 · 1e5a291eb7
parent 1864736509
commit 1e5a291eb7
15 changed files with 483 additions and 462 deletions
--- a/src/bqsr/baq.cpp
+++ b/src/bqsr/baq.cpp
@ -6,7 +6,7 @@ vector<double> BAQ::qual2prob(256); // 质量分数转化概率
 vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1)));  // [ref][read][qual]
 // 计算baq数组，返回成功与否
-bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
+bool BAQ::calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
    // 检测ref是否覆盖了read
    if (ref.size() < refOffset + ad.read_len) {
        spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
--- a/src/bqsr/baq.h
+++ b/src/bqsr/baq.h
@ -15,6 +15,7 @@
 #include <vector>
 #include "util/bam_wrap.h"
 #include "util/sam_data.h"
 using std::vector;
 using std::string;
@ -84,5 +85,5 @@ struct BAQ {
    double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
    // 计算baq数组，返回成功与否
-    bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray);
+    bool calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector<int>& baqArray);
 };
--- a/src/bqsr/bqsr_entry.cpp
+++ b/src/bqsr/bqsr_entry.cpp
@ -39,6 +39,10 @@ Date : 2023/10/23
 #include "util/math/math_utils.h"
 #include "quant_info.h"
 #include "util/debug.h"
 #include "util/stable_array.h"
 #include "util/sam_data.h"
 #include "util/read_transformer.h"
 #include "util/base_utils.h"
 using std::deque;
@ -47,8 +51,6 @@ using std::deque;
 const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
 const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
 // 解析knownSites
 struct VCFParser {
    deque<Interval> knownSites;  // 已知的变异位点
@ -157,222 +159,6 @@ bool bqsrReadFilterOut(const bam1_t *b) {
    return false;
 }
 // 该操作符是否消耗read的碱基
 bool consumeReadBases(char cigar) {
    return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S';
 }
 // 该操作符是否消耗参考基因组的碱基
 bool consumeRefBases(char cigar) {
    return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N';
 }
 // 给定一个ref位置，在read内部找到对应的位置和操作符
 struct PosAndOperator {
    int readPosAtRefCoord = -1;  // read中的位置
    char cigarOperator = '0';     // cigar操作符
    int cigarIndex = -1;        // cigar索引
    int cigarLen = 0;
    int preCigarLen = 0;  // 截止cigar之前的，消耗read base的长度
 };
 /**
 * Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
 * the element containing that base.  If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
 * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
 *
 * @param alignmentStart        The soft start of the read on the reference
 * @param cigar                 The read's cigar
 * @param refCoord              The target reference coordinate
 * @return                      If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
 *                              if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate;
 *                              if the reference coordinate falls within a deletion, the first read coordinate after the deletion.  Note: if the last
 * cigar element is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
 */
 PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStart, int refCoord) {
    PosAndOperator po;
    if (refCoord < alignmentStart) {
        return po;
    }
    int firstReadPosOfElement = 0;              // inclusive
    int firstRefPosOfElement = alignmentStart;  // inclusive
    int lastReadPosOfElement = 0;               // exclusive
    int lastRefPosOfElement = alignmentStart;   // exclusive
    // advance forward through all the cigar elements until we bracket the reference coordinate
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    const int idx = bc.n_cigar - 1;
    if (idx < 0)
        return po;
    for (int i = 0; i < bc.n_cigar; ++i) {
        const char c = bam_cigar_opchr(cigar[i]);
        const int len = bam_cigar_oplen(cigar[i]);
        firstReadPosOfElement = lastReadPosOfElement;
        firstRefPosOfElement = lastRefPosOfElement;
        lastReadPosOfElement += consumeReadBases(c) ? len : 0;
        lastRefPosOfElement += (consumeRefBases(c) || c == 'S') ? len : 0;
        if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) {  // refCoord falls within this cigar element
            int readPosAtRefCoord = firstReadPosOfElement + (consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
            return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
        }
    }
    return po;
 }
 // 根据adapter位置，对read进行hardclip，返回左侧或右侧减掉的base数量
 void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &sd) {
    int start, stop;
    // Determine the read coordinate to start and stop hard clipping
    if (refStart < 0) {
        if (refStop < 0) return;
        PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
        // if the refStop falls in a deletion, the above method returns the position after the deletion.  Since the stop we return here
        // is inclusive, we decrement the stop to avoid overclipping by one base.  As a result we do not clip the deletion, which is fine.
        stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
        sd.left_clip = stop + 1;
        sd.cigar_start = stopPosAndOperator.cigarIndex;
        sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen;
    } else {
        if (refStop >= 0) return;
        // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids
        // overclipping
        PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
        start = startPosAndOperator.readPosAtRefCoord;
        sd.right_clip = bw->b->core.l_qseq - start;
        sd.cigar_end = startPosAndOperator.cigarIndex + 1;
        sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
    }
 }
 // 计算切掉adapter之后，ref相对原始ref的偏移量
 void calculateRefOffset(BamWrap *bw, SamData &sd) {
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int i = 0;
    for (i = 0; i < sd.cigar_start; ++i) {
        const char c = bam_cigar_opchr(cigar[i]);
        int len = bam_cigar_oplen(cigar[i]);
        if (consumeRefBases(c)) {
            sd.ref_offset += len;
        }
    }
    const char c = bam_cigar_opchr(cigar[i]);
    if (consumeRefBases(c)) {
        sd.ref_offset += sd.first_cigar_clip;
    }
 }
 // 计算clip处理之后，剩余的碱基
 void calculateReadBases(BamWrap* bw, SamData& sd) {
    sd.bases.resize(sd.read_len);
    sd.base_quals.resize(sd.read_len);
    uint8_t* seq = bam_get_seq(bw->b);
    uint8_t* quals = bam_get_qual(bw->b);
    for (int i = 0; i < sd.read_len; ++i) {
        sd.bases[i] = cBaseToChar[bam_seqi(seq, i + sd.left_clip)];
        sd.base_quals[i] = quals[i + sd.left_clip];
    }
 }
 // 计算read两端clip之后的softstart和softend
 void calculateSoftStartEnd(BamWrap* bw, SamData& sd) {
    int64_t softStart = bw->b->core.pos + sd.ref_offset;
    int64_t softEnd = softStart - 1; // 闭区间
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int cigar_start = sd.cigar_start;
    int cigar_end = sd.cigar_end;
    bool rightTail = false;
    for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
        const char c = bam_cigar_opchr(cigar[i]);
        int len = bam_cigar_oplen(cigar[i]);
        if (i == sd.cigar_start) len -= sd.first_cigar_clip;
        if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
        // if (c == 'S' || c == 'I')
        if (c == 'S')
            softStart -= len;
        else if (c != 'H')
            rightTail = true;
        if (rightTail) {
            if (consumeRefBases(c) || c == 'S')
                softEnd += len;
        }
    }
    sd.softStart() = softStart;
    sd.softEnd() = softEnd;
 }
 // 计算clip之后的cigar
 void calculateCigar(BamWrap* bw, SamData& sd) {
    sd.cigars.clear();
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int cigar_start = sd.cigar_start;
    int cigar_end = sd.cigar_end;
    for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
        char c = bam_cigar_opchr(cigar[i]);
        int len = bam_cigar_oplen(cigar[i]);
        if (i == sd.cigar_start)
            len -= sd.first_cigar_clip;
        if (i == sd.cigar_end - 1)
            len -= sd.last_cigar_clip;
        //if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion
        if (i == sd.cigar_start && c == 'D') { // 跳过开头的deletion
            c = 'H';
            // sd.ref_offset += len;
            sd.start_pos += len;
        }
        sd.cigars.push_back({c, len});
    }
    //for(auto &cigar : sd.cigars) {
    //    spdlog::info("op: {}, len: {}", cigar.op, cigar.len);
    //}
 }
 // 计算read两端softclip的碱基数量，可能会修改ad里的clip值
 void calculateSoftClip(BamWrap *bw, SamData &sd) {
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int readIndex = sd.left_clip;
    int cutLeft = -1;           // first position to hard clip (inclusive)
    int cutRight = -1;          // first position to hard clip (inclusive)
    int cigar_start = sd.cigar_start;
    int cigar_end = sd.cigar_end;
    bool rightTail = false;  // trigger to stop clipping the left tail and start cutting the right tail
    for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
        const char c = bam_cigar_opchr(cigar[i]);
        int len = bam_cigar_oplen(cigar[i]);
        if (i == sd.cigar_start) len -= sd.first_cigar_clip;
        if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
        if (c == 'S') {
            if (rightTail) {
                cutRight = readIndex;
                cigar_end = i;
            } else {
                cutLeft = readIndex + len - 1;
                cigar_start = i + 1;
            }
        } else if (c != 'H') {
            rightTail = true;
        }
        if (consumeReadBases(c)) {
            readIndex += len;
        }
    }
    if (cutRight >= 0) {
        sd.right_clip = bw->b->core.l_qseq - cutRight;
        sd.cigar_end = cigar_end;
        sd.last_cigar_clip = 0;
    }
    if (cutLeft >= 0) {
        sd.left_clip = cutLeft + 1;
        sd.cigar_start = cigar_start;
        sd.first_cigar_clip = 0;
    }
 }
 // 读取给定区间的reference
 static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) {
    if (aux.ref_seq != NULL)
@ -386,22 +172,23 @@ static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interva
 }
 // 设置某个位置是indel
-inline void updateIndel(vector<int> &isIndel, int index) {
+inline void updateIndel(StableArray<int>& isIndel, int index) {
    if (index >=0 && index < isIndel.size()) {
        isIndel[index] = 1;
    }
 }
 // 计算该read的每个碱基位置是否是SNP或Indel
-int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
+int calculateIsSNPOrIndel(AuxVar& aux, SamData& sd, StableArray<int>& isSNP, StableArray<int>& isIns, StableArray<int>& isDel) {
    isSNP.resize(sd.read_len, 0);
    isIns.resize(sd.read_len, 0);
    isDel.resize(sd.read_len, 0);
    // 1. 读取参考基因组，先看看串行运行性能，稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
    //Interval interval{bw->start_pos() + sd.ref_offset, bw->end_pos()}; // 闭区间
    Interval interval{sd.start_pos, sd.end_pos};  // 闭区间
    PROF_START(ref);
    read_ref_base(aux, interval.left, interval);
    PROF_END(gprof[GP_read_ref], ref);
    string refBases(aux.ref_seq);
    // spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
    // 2. 遍历cigar，计算每个碱基是否是SNP或Indel
    int readPos = 0, refPos = 0, nEvents = 0;
@ -412,7 +199,6 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<i
            for (int j = 0; j < len; ++j) {
                // 按位置将read和ref碱基进行比较，不同则是snp，注意read起始位置要加上left_clip
                int snpInt = sd.bases[readPos] == refBases[refPos] ? 0 : 1;
                // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
                isSNP[readPos] = snpInt;
                nEvents += snpInt;
                readPos++;
@ -439,59 +225,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<i
            readPos += len;
        }
    }
    // const uint32_t* cigar = bam_get_cigar(bw->b);
    // const bam1_core_t& bc = bw->b->core;
    // uint8_t* seq = bam_get_seq(bw->b);
    // for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
    //     const char c = bam_cigar_opchr(cigar[i]);
    //     int len = bam_cigar_oplen(cigar[i]);
    //     if (i == sd.cigar_start) len -= sd.first_cigar_clip;
    //     if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
    //     if (c == 'M' || c == '=' || c == 'X') {
    //         for (int j = 0; j < len; ++j) {
    //             // 按位置将read和ref碱基进行比较，不同则是snp，注意read起始位置要加上left_clip
    //             int snpInt = cBaseToChar[bam_seqi(seq, readPos + sd.left_clip)] == refBases[refPos] ? 0 : 1;
    //             // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
    //             isSNP[readPos] = snpInt;
    //             nEvents += snpInt;
    //             readPos++;
    //             refPos++;
    //         }
    //     } else if (c == 'D') {
    //         // 应该是在上一个消耗碱基的cigar的最后一个位置，标记Del
    //         int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
    //         updateIndel(isDel, index);
    //         refPos += len;
    //     } else if (c == 'N') {
    //         refPos += len;
    //     } else if (c == 'I') {
    //         // 与Del不同，Ins应该是在下一个cigar开始的位置，标记Ins
    //         bool forwardStrandRead = !bw->GetReadNegativeStrandFlag();
    //         if (forwardStrandRead) {
    //             updateIndel(isIns, readPos - 1);
    //         }
    //         readPos += len;
    //         if (!forwardStrandRead) {
    //             updateIndel(isIns, readPos);
    //         }
    //     } else if (c == 'S') {
    //         readPos += len;
    //     }
    // }
    nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
    // spdlog::info("nEvents: {}", nEvents);
    //spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0),
    //             std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents);
    // exit(0);
    return nEvents;
 }
 // 简单计算baq数组，就是全部赋值为'@' (64)
-bool flatBAQArray(SamData& sd, vector<uint8_t>& baqArray) {
+bool flatBAQArray(SamData& sd, StableArray<uint8_t>& baqArray) {
    baqArray.resize(sd.read_len, (uint8_t)'@');
    return true;
 }
@ -514,12 +254,13 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
 }
 // 计算与read有交叉的已知位点信息， 应该要判断一下，是按照read的范围去读取vcf，还是按照一个batch read的范围去读取
-void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &knownSites) {
+void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, StableArray<uint8_t>& knownSites) {
    BamWrap* bw = sd.bw;
    int tid = bw->contig_id();
    uint64_t startPos = bw->start_pos();   // 闭区间
    uint64_t endPos = bw->end_pos(); // 闭区间
-    // spdlog::info("bam {}, {}", startPos, endPos);
+    knownSites.resize(sd.read_len, 0);
    // update vcfs
    for(auto &vcf : vcfs) {
        // 清理旧的interval
@ -533,15 +274,11 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
        }
        if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue;
        // spdlog::info("intv {}, {}, {}", vcf.knownSites.size(), vcf.knownSites.front().right, vcf.knownSites.front().right);
        // exit(0);
        //spdlog::info("before intervals : {}", vcf.knownSites.size());
        // 读取新的interval
        int64_t fpos, flen;
        endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
        Interval readIntv(startPos, endPos);
        vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
        //spdlog::info("file index: {}, {}", fpos, flen);
        if (flen > 0) {
            vcf.inStm.seekg(fpos, ios::beg);
            if (flen > vcf.bufLen) {
@ -565,20 +302,11 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
                Interval varIntv(varStart, varStart + ref.size() - 1);
                if (readIntv.overlaps(varIntv)) {
                    vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
                    //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
                }
                get_line_from_buf(buf, flen, &cur, &line);
            }
        }
        //spdlog::info("after intervals : {}", vcf.knownSites.size());
        //for(auto &val : vcf.knownSites) {
        //    spdlog::info("intv {}, {}", val.left, val.right);
        //}
    }
    //exit(0);
    knownSites.resize(sd.read_len);
    endPos = bw->end_pos();
    for(auto &vcf : vcfs) {
        for (auto &intv : vcf.knownSites) {
            // knownSite is outside clipping window for the read, ignore
@ -602,11 +330,10 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
            }
        }
    }
 }
 // 应该是计算一段数据的平均值
-static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<int>& errorArr, vector<double>& fracErrs) {
+static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, StableArray<int>& errorArr, StableArray<double>& fracErrs) {
    int totalErrors = 0;
    for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
        totalErrors += errorArr[j];
@ -617,9 +344,8 @@ static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<in
 }
 // 应该是用来处理BAQ的，把不等于特定BAQ分数的碱基作为一段数据统一处理
-void calculateFractionalErrorArray(vector<int>& errorArr, vector<uint8_t>& baqArr, vector<double>& fracErrs) {
+void calculateFractionalErrorArray(StableArray<int>& errorArr, StableArray<uint8_t>& baqArr, StableArray<double>& fracErrs) {
-    // for (auto val : errorArr) { if (val > 0) spdlog::info("snp err val: {}", val); }
+    fracErrs.resize(baqArr.size(), 0.0);
    fracErrs.resize(baqArr.size());
    // errorArray和baqArray必须长度相同
    const int BLOCK_START_UNSET = -1;
    bool inBlock = false;
@ -805,11 +531,8 @@ int SerialBQSR() {
        ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
    }
-    int test = 0;
+    while (true) {
    while (1) {
        ++ round;
        // 一. 读取bam数据
        size_t readNum = 0;
        if (inBamBuf.ReadStat() >= 0)
@ -818,52 +541,38 @@ int SerialBQSR() {
            break;
        }
        auto bams = inBamBuf.GetBamArr();
-        spdlog::info("{} reads processed in {} round, {}", readNum, round, test);
+        spdlog::info("{} reads processed in {} round", readNum, round);
        // 二. 遍历每个bam（read）记录，进行处理
        SamData sd;
        StableArray<int> isSNP, isIns, isDel;  // 该位置是否是SNP, indel位置，0不是，1是
        StableArray<uint8_t> baqArray;
        StableArray<double> snpErrors, insErrors, delErrors;
        StableArray<uint8_t> skips;  // 该位置是否是已知位点
        for (int i = 0; i < bams.size(); ++i) {
            // 1. 对每个read，需要检查cigar是否合法，即没有两个连续的相同的cigar，而且需要将首尾的deletion处理掉，目前看好像没啥影响，我们忽略这一步
            // 2. 对质量分数长度跟碱基长度不匹配的read，缺少的质量分数用默认值补齐，先忽略，后边有需要再处理
            // 3. 如果bam文件之前做过bqsr，tag中包含OQ（originnal quality，原始质量分数），检查用户参数里是否指定用原始质量分数进行bqsr，如果是则将质量分数替换为OQ，否则忽略OQ，先忽略
            // 4. 对read的两端进行检测，去除（hardclip）adapter
            // spdlog::info("bam idx: {}", i);
            BamWrap* bw = bams[i];
-            SamData sd;
+            sd.init();
            sd.parseBasic(bw);
            sd.rid = i + readNumSum;
            sd.bw = bw;
            sd.read_len = BamWrap::BamEffectiveLength(bw->b);
            sd.cigar_end = bw->b->core.n_cigar;
            if (sd.read_len <= 0) continue;
-            int adapter_boundary = bw->GetAdapterBoundary();
+            PROF_START(clip_read);
-            if (bw->IsAdapterInRead(adapter_boundary)) {
+            // 4. 对read的两端进行检测，去除（hardclip）adapter
-                // adapter在read范围内
+            ReadTransformer::hardClipAdaptorSequence(bw, sd); 
-                if (bw->GetReadNegativeStrandFlag()) { // 反链
+            if (sd.read_len <= 0) continue;
                    clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
                } else { // 正链
                    clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
                }
            }
            sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
            // 5. 然后再去除softclip部分
-            calculateSoftClip(bw, sd);
+            ReadTransformer::hardClipSoftClippedBases(bw, sd);
            sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;  // 更新read长度
            if (sd.read_len <= 0) continue;
-
+            // 应用所有的变换，计算samdata的相关信息
-            calculateRefOffset(bw, sd); // 计算ref_offset，就是相对比对的position，要将ref右移多少
+            sd.applyTransformations();
-            calculateReadBases(bw, sd);  // 计算clip处理之后，剩余的碱基
+            PROF_END(gprof[GP_clip_read], clip_read);
            // 计算clip之后，两端的softstart和softend
            calculateSoftStartEnd(bw, sd);
            calculateCigar(bw, sd);
            //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
            //             sd.read_len, sd.left_clip, sd.right_clip, sd.ref_offset, sd.cigar_start, sd.cigar_end, bw->cigar_str());
            // 6. 更新每个read的platform信息，好像没啥用，暂时忽略
            vector<int> isSNP(sd.read_len, 0);     // 该位置是否是SNP位置，0不是，1是
            vector<int> isIns(sd.read_len, 0); // 该位置是否是插入位置，0不是，1是
            vector<int> isDel(sd.read_len, 0); // 该位置是否是删除位置，0不是，1是
            const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], sd, isSNP, isIns, isDel);
            /*fprintf(gf[0], "%d\t", sd.read_len);
@ -877,16 +586,11 @@ int SerialBQSR() {
            fprintf(gf[2], "\n");
            */
            // spdlog::info("nErrors: {}", nErrors);
            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
            //exit(0);
            // 7. 计算baqArray
            // BAQ = base alignment quality
            // note for efficiency reasons we don't compute the BAQ array unless we actually have
            // some error to marginalize over.  For ILMN data ~85% of reads have no error
-            vector<uint8_t> baqArray;
+            // vector<uint8_t> baqArray;
            bool baqCalculated = false;
            if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
                baqCalculated = flatBAQArray(sd, baqArray);
@ -922,13 +626,7 @@ int SerialBQSR() {
            // }
            // fprintf(gf[3], "\n");
            //test = readCovariates[1][0][0] + readCovariates[2][1][3];
            //int end_pos = bw->contig_end_pos();
            //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
            //             bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); }
            // 9. 计算这条read需要跳过的位置
            vector<bool> skips(sd.read_len, 0);
            PROF_START(known_sites);
            calculateKnownSites(sd, nsgv::gAuxVars[0].vcfArr, skips);
            for (int ii = 0; ii < sd.read_len; ++ii) {
@ -936,27 +634,19 @@ int SerialBQSR() {
                            sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
            }
            PROF_END(gprof[GP_read_vcf], known_sites);
-            // fprintf(gf[0], "%d\t", sd.read_len);
+            // fprintf(gf[0], "%ld %d\t", sd.rid, sd.read_len);
            // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
            // fprintf(gf[0], "\n");
            // 10. 根据BAQ进一步处理snp，indel，得到处理后的数据
            vector<double> snpErrors, insErrors, delErrors;
            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-2: {}", val); }
            calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
            calculateFractionalErrorArray(isIns, baqArray, insErrors);
            calculateFractionalErrorArray(isDel, baqArray, delErrors);
            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
            //spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), sd.read_len);
            //for (auto val : snpErrors) { if (val > 0) spdlog::info("snp err val: {}", val); }
            // aggregate all of the info into our info object, and update the data
            // 11. 合并之前计算的数据，得到info，并更新bqsr table数据
            ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors);
-            int m = 0;
+
            // for (auto err : snpErrors) { if (isSNP[m] > 0 || err > 0) spdlog::info("snp err: {} : {}", isSNP[m++], err); }
            //exit(0);
            PROF_START(update_info);
            updateRecalTablesForRead(info);
            PROF_END(gprof[GP_update_info], update_info);
--- a/src/bqsr/covariate.cpp
+++ b/src/bqsr/covariate.cpp
@ -99,7 +99,7 @@ static char SimpleComplement(const char base) {
    }
 }
-static void ClipLowQualEndsWithN(string& bases, const FastArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
+static void ClipLowQualEndsWithN(string& bases, const StableArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
    // 处理左边
    int left = 0;
    int readLen = bases.size();
@ -245,7 +245,7 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
    const int originalReadLength = sd.read_len;
    // store the original bases and then write Ns over low quality ones
-    string strandedClippedBases(sd.bases);
+    string strandedClippedBases(sd.bases.arr.data(), sd.read_len);
    // GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值
    GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail);  // 命名我之前看到过这个30的？
    // spdlog::info("bases: {}", strandedClippedBases);
--- a/src/bqsr/covariate.h
+++ b/src/bqsr/covariate.h
@ -19,6 +19,7 @@
 #include "bqsr_args.h"
 #include "util/bam_wrap.h"
 #include "util/sam_data.h"
 using std::map;
 using std::string;
--- a/src/bqsr/read_recal_info.h
+++ b/src/bqsr/read_recal_info.h
@ -9,19 +9,23 @@
 #pragma once
-#include "util/bam_wrap.h"
+#include <vector>
 #include "covariate.h"
 #include "util/bam_wrap.h"
 #include "util/stable_array.h"
 using std::vector;
 struct ReadRecalInfo {
    SamData& read;
    int length;
    PerReadCovariateMatrix& covariates;
-    vector<bool>& skips;
+    StableArray<uint8_t>& skips;
-    FastArray<uint8_t>&base_quals, &ins_quals, &del_quals;
+    StableArray<uint8_t>&base_quals, &ins_quals, &del_quals;
-    vector<double>&snp_errs, &ins_errs, &del_errs;
+    StableArray<double>&snp_errs, &ins_errs, &del_errs;
-    ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, vector<bool>& _skips, vector<double>& _snp_errs, vector<double>& _ins_errs,
+    ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, StableArray<uint8_t>& _skips, StableArray<double>& _snp_errs,
-                  vector<double>& _del_errs)
+                  StableArray<double>& _ins_errs, StableArray<double>& _del_errs)
        : read(_read),
          covariates(_covariates),
          skips(_skips),
--- a/src/util/bam_wrap.h
+++ b/src/util/bam_wrap.h
@ -21,111 +21,6 @@
 using namespace std;
 struct Cigar {
    char op = '0';
    int len = 0;
    // 该操作符是否消耗read的碱基
    static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
    // 该操作符是否消耗参考基因组的碱基
    static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
 };
 struct ReadIdxCigar {
    int readIdx = -1; // 在read序列中的位置
    char cigarOp = '0'; // 当前位置对应的cigar
 };
 // 不用经常释放array的内存空间，减少频繁的内存开辟和释放操作
 template <class T>
 struct FastArray {
    vector<T> arr;
    size_t idx;
    void clear() { idx = 0; }
    size_t size() { return idx; }
    bool empty() { return idx == 0; }
    void reserve(size_t _size) { arr.reserve(_size); }
    void resize(size_t _size) {
        arr.resize(_size);
        idx = _size;
    }
    void push_back(const T& val) {
        if (idx < arr.size()) {
            arr[idx++] = val;
        } else {
            arr.push_back(val);
            idx++;
        }
    }
    inline T& operator[](size_t pos) { return arr[pos]; }
    inline const T& operator[](size_t pos) const { return arr[pos]; }
    struct iterator {
        typename std::vector<T>::iterator it;
        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
        iterator& operator++() { ++it; return *this;}
        iterator& operator--() { --it; return *this;}
        T& operator*() const { return *it; }
        bool operator!=(const iterator& other) const { return it != other.it; }
        bool operator==(const iterator& other) const { return it == other.it; }
    };
    iterator begin() { return arr.begin(); }
    iterator end() { return arr.begin() + idx; }
 };
 // 对原始bam数据的补充，比如对两端进行hardclip等
 class BamWrap;
 struct SamData {
    int64_t rid = 0; // for debug
    int read_len = 0;          // read长度，各种clip之后的长度
    int cigar_start = 0;       // cigar起始位置，闭区间
    int cigar_end = 0;         // cigar结束位置，开区间
    int first_cigar_clip = 0;  // 第一个cigar, clip的数量，切左侧
    int last_cigar_clip = 0;   // 最后一个cigar, clip的数量，切右侧
    int left_clip = 0;         // 左侧被切掉的碱基长度，BI和BD质量分数也会用到
    int right_clip = 0;        // 右侧被切掉的碱基长度
    int ref_offset = 0;        // 切除adapter和softclip之后(softclip应该不影响)，相对原始ref比对位置（contig_pos）的偏移量
    // 记录一下bqsr运算过程中用到的数据，回头提前计算一下，修正现在的复杂逻辑
    static constexpr int READ_INDEX_NOT_FOUND = -1;
    BamWrap* bw;
    int64_t start_pos;      // 因为soft clip都被切掉了，这里的softstart应该就是切掉之后的匹配位点，闭区间
    int64_t end_pos;        // 同上，闭区间
    string bases;           // 处理之后的read的碱基
    FastArray<uint8_t> base_quals;  // 对应的质量分数
    FastArray<uint8_t> ins_quals;  // insert质量分数, BI (大部分应该都没有)
    FastArray<uint8_t> del_quals;      // delete质量分数, BD (大部分应该都没有)
    FastArray<Cigar> cigars;
    int64_t& softStart() { return start_pos; }
    int64_t& softEnd() { return end_pos; }
    // functions
    ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
        ReadIdxCigar rc;
        if (refPos < start_pos)
            return rc;
        int firstReadPosOfElement = 0;         // inclusive
        int firstRefPosOfElement = start_pos;  // inclusive
        int lastReadPosOfElement = 0;          // exclusive
        int lastRefPosOfElement = start_pos;   // exclusive
        // advance forward through all the cigar elements until we bracket the reference coordinate
        for (auto& cigar : cigars) {
            firstReadPosOfElement = lastReadPosOfElement;
            firstRefPosOfElement = lastRefPosOfElement;
            lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
            lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
            if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) {  // refCoord falls within this cigar element
                int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
                rc.cigarOp = cigar.op;
                rc.readIdx = readPosAtRefCoord;
                return rc;
            }
        }
        return rc;
    }
 };
 /*
    这里的成员函数命名有点混乱，特此说明，小写加下划线的函数命名，无论是静态函数，还是普通成员函数，更侧重说明
    这是类似bam的一个属性，而大写加驼峰命名的函数，更侧重说明这是通过计算得出的。
--- a/src/util/base_utils.cpp
+++ b/src/util/base_utils.cpp
@ -0,0 +1,3 @@
 #include "base_utils.h"
 const char BaseUtils::cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
--- a/src/util/base_utils.h
+++ b/src/util/base_utils.h
@ -0,0 +1,20 @@
 /*
     Description: 碱基相关的工具函数
     Copyright : All right reserved by ICT
     Author : Zhang Zhonghai
     Date : 2025/12/29
 */
 #pragma once
 struct BaseUtils {
    // uint8_t转碱基字符
    static const char cBaseToChar[16];
    // 该操作符是否消耗read的碱基
    static bool consumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
    // 该操作符是否消耗参考基因组的碱基
    static bool consumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
 };
--- a/src/util/profiling.cpp
+++ b/src/util/profiling.cpp
@ -54,6 +54,7 @@ int DisplayProfiling(int nthread) {
    // PRINT_GP(sort_wait);
    // PRINT_GP(markdup_wait);
    // PRINT_GP(intersect_wait);
    PRINT_GP(clip_read);
    PRINT_GP(read_ref);
    PRINT_GP(read_vcf);
    PRINT_GP(covariate);
--- a/src/util/profiling.h
+++ b/src/util/profiling.h
@ -40,6 +40,7 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
 enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
 enum {
    GP_read_wait = 11,
    GP_clip_read,
    GP_covariate,
    GP_read_ref,
    GP_read_vcf,
--- a/src/util/read_transformer.h
+++ b/src/util/read_transformer.h
@ -0,0 +1,154 @@
 /*
     Description: read (sam record) 相关的工具函数，比如用于clipping低质量碱基等
     Copyright : All right reserved by ICT
     Author : Zhang Zhonghai
     Date : 2025/12/29
 */
 #pragma once
 #include "bam_wrap.h"
 #include "sam_data.h"
 #include "base_utils.h"
 // 用于对read进行各种转换操作，比如clipping等。注意这里都是逻辑操作，最后需要调用SamData.applyTransformations()来真正应用这些修改
 struct ReadTransformer {
    // 给定一个ref位置，在read内部找到对应的位置和操作符
    struct PosAndOperator {
        int readPosAtRefCoord = -1;  // read中的位置
        char cigarOperator = '0';    // cigar操作符
        int cigarIndex = -1;         // cigar索引
        int cigarLen = 0;
        int preCigarLen = 0;  // 截止cigar之前的，消耗read base的长度
    };
    /**
     * Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
     * the element containing that base.  If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
     * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
     *
     * @param alignmentStart        The soft start of the read on the reference
     * @param cigar                 The read's cigar
     * @param refCoord              The target reference coordinate
     * @return                      If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
     *                              if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read
     * coordinate; if the reference coordinate falls within a deletion, the first read coordinate after the deletion.  Note: if the last cigar element
     * is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
     */
    static PosAndOperator getReadIndexForReferenceCoordinate(BamWrap* bw, int alignmentStart, int refCoord) {
        PosAndOperator po;
        if (refCoord < alignmentStart) {
            return po;
        }
        int firstReadPosOfElement = 0;              // inclusive
        int firstRefPosOfElement = alignmentStart;  // inclusive
        int lastReadPosOfElement = 0;               // exclusive
        int lastRefPosOfElement = alignmentStart;   // exclusive
        // advance forward through all the cigar elements until we bracket the reference coordinate
        const uint32_t* cigar = bam_get_cigar(bw->b);
        const bam1_core_t& bc = bw->b->core;
        const int idx = bc.n_cigar - 1;
        if (idx < 0)
            return po;
        for (int i = 0; i < bc.n_cigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            firstReadPosOfElement = lastReadPosOfElement;
            firstRefPosOfElement = lastRefPosOfElement;
            lastReadPosOfElement += BaseUtils::consumeReadBases(c) ? len : 0;
            lastRefPosOfElement += (BaseUtils::consumeRefBases(c) || c == 'S') ? len : 0;
            if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) {  // refCoord falls within this cigar element
                int readPosAtRefCoord = firstReadPosOfElement + (BaseUtils::consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
                return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
            }
        }
        return po;
    }
    // 根据adapter位置，对read进行hardclip，返回左侧或右侧减掉的base数量
    static void clipByReferenceCoordinates(BamWrap* bw, int refStart, int refStop, SamData& sd) {
        int start, stop;
        // Determine the read coordinate to start and stop hard clipping
        if (refStart < 0) {
            if (refStop < 0)
                return;
            PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
            // if the refStop falls in a deletion, the above method returns the position after the deletion.  Since the stop we return here
            // is inclusive, we decrement the stop to avoid overclipping by one base.  As a result we do not clip the deletion, which is fine.
            stop = stopPosAndOperator.readPosAtRefCoord - (BaseUtils::consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
            sd.left_clip = stop + 1;
            sd.cigar_start = stopPosAndOperator.cigarIndex;
            sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen;
        } else {
            if (refStop >= 0)
                return;
            // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion
            // avoids overclipping
            PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
            start = startPosAndOperator.readPosAtRefCoord;
            sd.right_clip = bw->b->core.l_qseq - start;
            sd.cigar_end = startPosAndOperator.cigarIndex + 1;
            sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
        }
    }
    // 切掉adaper序列，注意这里的clipping只是逻辑上的，实际并没有修改bam record
    static void hardClipAdaptorSequence(BamWrap* bw, SamData& sd) {
        int adapter_boundary = bw->GetAdapterBoundary();
        if (bw->IsAdapterInRead(adapter_boundary)) {
            // adapter在read范围内
            if (bw->GetReadNegativeStrandFlag()) {  // 反链
                clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
            } else {  // 正链
                clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
            }
        }
        sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;  // 更新read长度
    }
    // 计算read两端softclip的碱基数量，切掉softclip序列
    static void hardClipSoftClippedBases(BamWrap* bw, SamData& sd) {
        const uint32_t* cigar = bam_get_cigar(bw->b);
        const bam1_core_t& bc = bw->b->core;
        int readIndex = sd.left_clip;
        int cutLeft = -1;   // first position to hard clip (inclusive)
        int cutRight = -1;  // first position to hard clip (inclusive)
        int cigar_start = sd.cigar_start;
        int cigar_end = sd.cigar_end;
        bool rightTail = false;  // trigger to stop clipping the left tail and start cutting the right tail
        for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            int len = bam_cigar_oplen(cigar[i]);
            if (i == sd.cigar_start) len -= sd.first_cigar_clip;
            if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
            if (c == 'S') {
                if (rightTail) {
                    cutRight = readIndex;
                    cigar_end = i;
                } else {
                    cutLeft = readIndex + len - 1;
                    cigar_start = i + 1;
                }
            } else if (c != 'H') {
                rightTail = true;
            }
            if (BaseUtils::consumeReadBases(c)) {
                readIndex += len;
            }
        }
        if (cutRight >= 0) {
            sd.right_clip = bw->b->core.l_qseq - cutRight;
            sd.cigar_end = cigar_end;
            sd.last_cigar_clip = 0;
        }
        if (cutLeft >= 0) {
            sd.left_clip = cutLeft + 1;
            sd.cigar_start = cigar_start;
            sd.first_cigar_clip = 0;
        }
        sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;  // 更新read长度
    }
 };
--- a/src/util/read_utils.h
+++ b/src/util/read_utils.h
--- a/src/util/sam_data.h
+++ b/src/util/sam_data.h
@ -0,0 +1,178 @@
 /*
     Description: 对原始bam进行部分数据解析，以更方便后续处理
     Copyright : All right reserved by ICT
     Author : Zhang Zhonghai
     Date : 2025/12/29
 */
 #pragma once
 #include <cstdint>
 #include "bam_wrap.h"
 #include "stable_array.h"
 #include "base_utils.h"
 // 对cigar进行简单包装
 struct Cigar {
    char op = '0';
    int len = 0;
    // 该操作符是否消耗read的碱基
    static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
    // 该操作符是否消耗参考基因组的碱基
    static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
 };
 // 在read中某个特定位置对应的cigar操作符
 struct ReadIdxCigar {
    int readIdx = -1;    // 在read序列中的位置
    char cigarOp = '0';  // 当前位置对应的cigar
 };
 // 对原始bam进行部分数据解析，以更方便后续处理
 struct SamData {
    int64_t rid = 0;           // for debug
    // 这些数据用于最开始clipping等操作
    int read_len = 0;          // read长度，各种clip之后的长度
    int cigar_start = 0;       // cigar起始位置，闭区间
    int cigar_end = 0;         // cigar结束位置，开区间
    int first_cigar_clip = 0;  // 第一个cigar, clip的数量，切左侧
    int last_cigar_clip = 0;   // 最后一个cigar, clip的数量，切右侧
    int left_clip = 0;         // 左侧被切掉的碱基长度，BI和BD质量分数也会用到
    int right_clip = 0;        // 右侧被切掉的碱基长度
    int ref_offset = 0;        // 切除adapter和softclip之后(softclip应该不影响)，相对原始ref比对位置（contig_pos）的偏移量
    // 记录一下bqsr运算过程中用到的数据，回头提前计算一下，修正现在的复杂逻辑
    static constexpr int READ_INDEX_NOT_FOUND = -1;
    BamWrap* bw;
    int64_t start_pos;              // 因为soft clip都被切掉了，这里的softstart应该就是切掉之后的匹配位点，闭区间
    int64_t end_pos;                // 同上，闭区间
    StableArray<char> bases; // 处理之后的read的碱基数组形式
    StableArray<uint8_t> base_quals;  // 对应的质量分数
    StableArray<uint8_t> ins_quals;   // insert质量分数, BI (大部分应该都没有)
    StableArray<uint8_t> del_quals;   // delete质量分数, BD (大部分应该都没有)
    StableArray<Cigar> cigars;
    int64_t& softStart() { return start_pos; }
    int64_t& softEnd() { return end_pos; }
    // functions
    // 初始化
    void init() {
        bases.clear();
        base_quals.clear();
        ins_quals.clear();
        del_quals.clear();
        cigars.clear();
        rid = 0;
        read_len = 0;
        cigar_start = 0;
        cigar_end = 0;
        first_cigar_clip = 0;
        last_cigar_clip = 0;
        left_clip = 0;
        right_clip = 0;
        ref_offset = 0;
        bw = nullptr;
        start_pos = 0;
        end_pos = 0;
    }
    // 初步解析bam
    void parseBasic(BamWrap *_bw) {
        bw = _bw;
        read_len = BamWrap::BamEffectiveLength(bw->b);
        cigar_end = bw->b->core.n_cigar;
    }
    // 应用各种转换操作，更新SamData中的数据
    void applyTransformations() {
        const uint32_t* cigar = bam_get_cigar(bw->b);
        const bam1_core_t& bc = bw->b->core;
        int i = 0;
        // 计算ref_offset，就是相对比对的position，要将ref右移多少
        for (i = 0; i < cigar_start; ++i) {
            if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
                ref_offset += bam_cigar_oplen(cigar[i]);
            }
        }
        if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
            ref_offset += first_cigar_clip;
        }
        // 计算clip处理之后，剩余的碱基
        bases.resize(read_len);
        base_quals.resize(read_len);
        uint8_t* seq = bam_get_seq(bw->b);
        uint8_t* quals = bam_get_qual(bw->b);
        for (i = 0; i < read_len; ++i) {
            bases[i] = BaseUtils::cBaseToChar[bam_seqi(seq, i + left_clip)];
            base_quals[i] = quals[i + left_clip];
        }
        // 计算read两端clip之后的softstart和softend，其实S之前都被切掉了
        int64_t softStart = bw->b->core.pos + ref_offset;
        int64_t softEnd = softStart - 1;  // 闭区间
        bool rightTail = false;
        for (i = cigar_start; i < cigar_end; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            int len = bam_cigar_oplen(cigar[i]);
            if (i == cigar_start) len -= first_cigar_clip;
            if (i == cigar_end - 1) len -= last_cigar_clip;
            // if (c == 'S' || c == 'I')
            if (c == 'S') softStart -= len;
            else if (c != 'H') rightTail = true;
            if (rightTail && (BaseUtils::consumeRefBases(c) || c == 'S')) softEnd += len;
        }
        this->softStart() = softStart; // 其实这里的softStart就是start
        this->softEnd() = softEnd; // 同上
        // 计算clip之后的cigar，其实可以考虑下边的代码和上边的换一下位置
        for (i = cigar_start; i < cigar_end; ++i) {
            char c = bam_cigar_opchr(cigar[i]);
            int len = bam_cigar_oplen(cigar[i]);
            if (i == cigar_start) len -= first_cigar_clip;
            if (i == cigar_end - 1) len -= last_cigar_clip;
            // if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion
            if (i == cigar_start && c == 'D') {  // 跳过开头的deletion
                c = 'H';
                ref_offset += len;
                start_pos += len; // 更新起始位置
            } else if (i == cigar_end - 1 && c == 'D') { // 跳过结尾的deletion
                c = 'H';
                softEnd -= len; // 更新结束位置
            }
            cigars.push_back({c, len});
        }
    }
    // 给定一个ref pos，返回对应的read index和cigar操作
    ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
        ReadIdxCigar rc;
        if (refPos < start_pos)
            return rc;
        int firstReadPosOfElement = 0;         // inclusive
        int firstRefPosOfElement = start_pos;  // inclusive
        int lastReadPosOfElement = 0;          // exclusive
        int lastRefPosOfElement = start_pos;   // exclusive
        // advance forward through all the cigar elements until we bracket the reference coordinate
        for (auto& cigar : cigars) {
            firstReadPosOfElement = lastReadPosOfElement;
            firstRefPosOfElement = lastRefPosOfElement;
            lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
            lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
            if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) {  // refCoord falls within this cigar element
                int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
                rc.cigarOp = cigar.op;
                rc.readIdx = readPosAtRefCoord;
                return rc;
            }
        }
        return rc;
    }
 };
--- a/src/util/stable_array.h
+++ b/src/util/stable_array.h
@ -0,0 +1,73 @@
 /*
     Description: 不用经常释放array的内存空间，减少频繁的内存开辟和释放操作
     Copyright : All right reserved by ICT
     Author : Zhang Zhonghai
     Date : 2025/12/29
 */
 #pragma once
 #include <cstdlib>
 #include <vector>
 using std::vector;
 // 不能用于bool类型, 因为在c++中vector<bool>是特化过的
 template <class T>
 struct StableArray {
    vector<T> arr;
    size_t idx = 0;
    void clear() { idx = 0; }
    size_t size() { return idx; }
    bool empty() { return idx == 0; }
    void reserve(size_t _size) {
        if (arr.size() < _size)
            arr.reserve(_size);
    }
    void resize(size_t _size) {
        if (arr.size() < _size) {
            arr.resize(_size);
        }
        idx = _size;
    }
    void resize(size_t _size, const T &val) {
        if (arr.size() < _size) {
            arr.resize(_size);
        }
        for (size_t i = 0; i < _size; ++i) {
            arr[i] = val;
        }
        idx = _size;
    }
    void push_back(const T& val) {
        if (idx < arr.size()) {
            arr[idx++] = val;
        } else {
            arr.push_back(val);
            idx++;
        }
    }
    inline T& operator[](size_t pos) { return arr[pos]; }
    //inline const T& operator[](size_t pos) { return arr[pos]; }
    inline const T& operator[](size_t pos) const { return arr[pos]; }
    struct iterator {
        typename std::vector<T>::iterator it;
        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
        iterator& operator++() {
            ++it;
            return *this;
        }
        iterator& operator--() {
            --it;
            return *this;
        }
        T& operator*() const { return *it; }
        bool operator!=(const iterator& other) const { return it != other.it; }
        bool operator==(const iterator& other) const { return it == other.it; }
    };
    iterator begin() { return arr.begin(); }
    iterator end() { return arr.begin() + idx; }
 };
		`@ -0,0 +1,3 @@`
							`#include "base_utils.h"`

							`const char BaseUtils::cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};`