重构了一下，大数据问题还没解决

2025-12-29 19:36:38 +08:00 · 2025-12-29 19:36:38 +08:00 · 1e5a291eb7
parent 1864736509
commit 1e5a291eb7
15 changed files with 483 additions and 462 deletions
--- a/src/bqsr/baq.cpp
+++ b/src/bqsr/baq.cpp
@ -6,7 +6,7 @@ vector<double> BAQ::qual2prob(256); // 质量分数转化概率
 vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1)));  // [ref][read][qual]

 // 计算baq数组，返回成功与否
-bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
+bool BAQ::calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
    // 检测ref是否覆盖了read
    if (ref.size() < refOffset + ad.read_len) {
        spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
--- a/src/bqsr/baq.h
+++ b/src/bqsr/baq.h
@ -15,6 +15,7 @@
 #include <vector>

 #include "util/bam_wrap.h"
+#include "util/sam_data.h"

 using std::vector;
 using std::string;
@ -84,5 +85,5 @@ struct BAQ {
    double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }

    // 计算baq数组，返回成功与否
-    bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray);
+    bool calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector<int>& baqArray);
 };
--- a/src/bqsr/bqsr_entry.cpp
+++ b/src/bqsr/bqsr_entry.cpp
@ -39,6 +39,10 @@ Date : 2023/10/23
 #include "util/math/math_utils.h"
 #include "quant_info.h"
 #include "util/debug.h"
+#include "util/stable_array.h"
+#include "util/sam_data.h"
+#include "util/read_transformer.h"
+#include "util/base_utils.h"

 using std::deque;

@ -47,8 +51,6 @@ using std::deque;

 const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';

-const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
-
 // 解析knownSites
 struct VCFParser {
    deque<Interval> knownSites;  // 已知的变异位点
@ -157,222 +159,6 @@ bool bqsrReadFilterOut(const bam1_t *b) {
    return false;
 }

-// 该操作符是否消耗read的碱基
-bool consumeReadBases(char cigar) {
-    return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S';
-}
-
-// 该操作符是否消耗参考基因组的碱基
-bool consumeRefBases(char cigar) {
-    return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N';
-}
-
-// 给定一个ref位置，在read内部找到对应的位置和操作符
-struct PosAndOperator {
-    int readPosAtRefCoord = -1;  // read中的位置
-    char cigarOperator = '0';     // cigar操作符
-    int cigarIndex = -1;        // cigar索引
-    int cigarLen = 0;
-    int preCigarLen = 0;  // 截止cigar之前的，消耗read base的长度
-};
-
-/**
- * Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
- * the element containing that base.  If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
- * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
- *
- * @param alignmentStart        The soft start of the read on the reference
- * @param cigar                 The read's cigar
- * @param refCoord              The target reference coordinate
- * @return                      If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
- *                              if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate;
- *                              if the reference coordinate falls within a deletion, the first read coordinate after the deletion.  Note: if the last
- * cigar element is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
- */
-PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStart, int refCoord) {
-    PosAndOperator po;
-    if (refCoord < alignmentStart) {
-        return po;
-    }
-    int firstReadPosOfElement = 0;              // inclusive
-    int firstRefPosOfElement = alignmentStart;  // inclusive
-    int lastReadPosOfElement = 0;               // exclusive
-    int lastRefPosOfElement = alignmentStart;   // exclusive
-
-    // advance forward through all the cigar elements until we bracket the reference coordinate
-    const uint32_t* cigar = bam_get_cigar(bw->b);
-    const bam1_core_t& bc = bw->b->core;
-    const int idx = bc.n_cigar - 1;
-    if (idx < 0)
-        return po;
-    for (int i = 0; i < bc.n_cigar; ++i) {
-        const char c = bam_cigar_opchr(cigar[i]);
-        const int len = bam_cigar_oplen(cigar[i]);
-        firstReadPosOfElement = lastReadPosOfElement;
-        firstRefPosOfElement = lastRefPosOfElement;
-        lastReadPosOfElement += consumeReadBases(c) ? len : 0;
-        lastRefPosOfElement += (consumeRefBases(c) || c == 'S') ? len : 0;
-        if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) {  // refCoord falls within this cigar element
-            int readPosAtRefCoord = firstReadPosOfElement + (consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
-            return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
-        }
-    }
-    return po;
-}
-
-// 根据adapter位置，对read进行hardclip，返回左侧或右侧减掉的base数量
-void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &sd) {
-    int start, stop;
-    // Determine the read coordinate to start and stop hard clipping
-    if (refStart < 0) {
-        if (refStop < 0) return;
-        PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
-        // if the refStop falls in a deletion, the above method returns the position after the deletion.  Since the stop we return here
-        // is inclusive, we decrement the stop to avoid overclipping by one base.  As a result we do not clip the deletion, which is fine.
-        stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
-        sd.left_clip = stop + 1;
-        sd.cigar_start = stopPosAndOperator.cigarIndex;
-        sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen;
-    } else {
-        if (refStop >= 0) return;
-        // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids
-        // overclipping
-        PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
-        start = startPosAndOperator.readPosAtRefCoord;
-        sd.right_clip = bw->b->core.l_qseq - start;
-        sd.cigar_end = startPosAndOperator.cigarIndex + 1;
-        sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
-    }
-}
-
-// 计算切掉adapter之后，ref相对原始ref的偏移量
-void calculateRefOffset(BamWrap *bw, SamData &sd) {
-    const uint32_t* cigar = bam_get_cigar(bw->b);
-    const bam1_core_t& bc = bw->b->core;
-    int i = 0;
-    for (i = 0; i < sd.cigar_start; ++i) {
-        const char c = bam_cigar_opchr(cigar[i]);
-        int len = bam_cigar_oplen(cigar[i]);
-        if (consumeRefBases(c)) {
-            sd.ref_offset += len;
-        }
-    }
-    const char c = bam_cigar_opchr(cigar[i]);
-    if (consumeRefBases(c)) {
-        sd.ref_offset += sd.first_cigar_clip;
-    }
-}
-
-// 计算clip处理之后，剩余的碱基
-void calculateReadBases(BamWrap* bw, SamData& sd) {
-    sd.bases.resize(sd.read_len);
-    sd.base_quals.resize(sd.read_len);
-    uint8_t* seq = bam_get_seq(bw->b);
-    uint8_t* quals = bam_get_qual(bw->b);
-    for (int i = 0; i < sd.read_len; ++i) {
-        sd.bases[i] = cBaseToChar[bam_seqi(seq, i + sd.left_clip)];
-        sd.base_quals[i] = quals[i + sd.left_clip];
-    }
-}
-
-// 计算read两端clip之后的softstart和softend
-void calculateSoftStartEnd(BamWrap* bw, SamData& sd) {
-    int64_t softStart = bw->b->core.pos + sd.ref_offset;
-    int64_t softEnd = softStart - 1; // 闭区间
-    const uint32_t* cigar = bam_get_cigar(bw->b);
-    const bam1_core_t& bc = bw->b->core;
-    int cigar_start = sd.cigar_start;
-    int cigar_end = sd.cigar_end;
-    bool rightTail = false;
-    for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
-        const char c = bam_cigar_opchr(cigar[i]);
-        int len = bam_cigar_oplen(cigar[i]);
-        if (i == sd.cigar_start) len -= sd.first_cigar_clip;
-        if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
-        // if (c == 'S' || c == 'I')
-        if (c == 'S')
-            softStart -= len;
-        else if (c != 'H')
-            rightTail = true;
-        if (rightTail) {
-            if (consumeRefBases(c) || c == 'S')
-                softEnd += len;
-        }
-    }
-    sd.softStart() = softStart;
-    sd.softEnd() = softEnd;
-}
-
-// 计算clip之后的cigar
-void calculateCigar(BamWrap* bw, SamData& sd) {
-    sd.cigars.clear();
-    const uint32_t* cigar = bam_get_cigar(bw->b);
-    const bam1_core_t& bc = bw->b->core;
-    int cigar_start = sd.cigar_start;
-    int cigar_end = sd.cigar_end;
-    for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
-        char c = bam_cigar_opchr(cigar[i]);
-        int len = bam_cigar_oplen(cigar[i]);
-        if (i == sd.cigar_start)
-            len -= sd.first_cigar_clip;
-        if (i == sd.cigar_end - 1)
-            len -= sd.last_cigar_clip;
-        //if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion
-        if (i == sd.cigar_start && c == 'D') { // 跳过开头的deletion
-            c = 'H';
-            // sd.ref_offset += len;
-            sd.start_pos += len;
-        }
-        sd.cigars.push_back({c, len});
-    }
-    //for(auto &cigar : sd.cigars) {
-    //    spdlog::info("op: {}, len: {}", cigar.op, cigar.len);
-    //}
-}
-
-// 计算read两端softclip的碱基数量，可能会修改ad里的clip值
-void calculateSoftClip(BamWrap *bw, SamData &sd) {
-    const uint32_t* cigar = bam_get_cigar(bw->b);
-    const bam1_core_t& bc = bw->b->core;
-    int readIndex = sd.left_clip;
-    int cutLeft = -1;           // first position to hard clip (inclusive)
-    int cutRight = -1;          // first position to hard clip (inclusive)
-    int cigar_start = sd.cigar_start;
-    int cigar_end = sd.cigar_end;
-    bool rightTail = false;  // trigger to stop clipping the left tail and start cutting the right tail
-
-    for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
-        const char c = bam_cigar_opchr(cigar[i]);
-        int len = bam_cigar_oplen(cigar[i]);
-        if (i == sd.cigar_start) len -= sd.first_cigar_clip;
-        if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
-        if (c == 'S') {
-            if (rightTail) {
-                cutRight = readIndex;
-                cigar_end = i;
-            } else {
-                cutLeft = readIndex + len - 1;
-                cigar_start = i + 1;
-            }
-        } else if (c != 'H') {
-            rightTail = true;
-        }
-        if (consumeReadBases(c)) {
-            readIndex += len;
-        }
-    }
-    if (cutRight >= 0) {
-        sd.right_clip = bw->b->core.l_qseq - cutRight;
-        sd.cigar_end = cigar_end;
-        sd.last_cigar_clip = 0;
-    }
-    if (cutLeft >= 0) {
-        sd.left_clip = cutLeft + 1;
-        sd.cigar_start = cigar_start;
-        sd.first_cigar_clip = 0;
-    }
-}
-
 // 读取给定区间的reference
 static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) {
    if (aux.ref_seq != NULL)
@ -386,22 +172,23 @@ static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interva
 }

 // 设置某个位置是indel
-inline void updateIndel(vector<int> &isIndel, int index) {
+inline void updateIndel(StableArray<int>& isIndel, int index) {
    if (index >=0 && index < isIndel.size()) {
        isIndel[index] = 1;
    }
 }

 // 计算该read的每个碱基位置是否是SNP或Indel
-int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
+int calculateIsSNPOrIndel(AuxVar& aux, SamData& sd, StableArray<int>& isSNP, StableArray<int>& isIns, StableArray<int>& isDel) {
+    isSNP.resize(sd.read_len, 0);
+    isIns.resize(sd.read_len, 0);
+    isDel.resize(sd.read_len, 0);
    // 1. 读取参考基因组，先看看串行运行性能，稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
-    //Interval interval{bw->start_pos() + sd.ref_offset, bw->end_pos()}; // 闭区间
    Interval interval{sd.start_pos, sd.end_pos};  // 闭区间
    PROF_START(ref);
    read_ref_base(aux, interval.left, interval);
    PROF_END(gprof[GP_read_ref], ref);
    string refBases(aux.ref_seq);
-    // spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());

    // 2. 遍历cigar，计算每个碱基是否是SNP或Indel
    int readPos = 0, refPos = 0, nEvents = 0;
@ -412,7 +199,6 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<i
            for (int j = 0; j < len; ++j) {
                // 按位置将read和ref碱基进行比较，不同则是snp，注意read起始位置要加上left_clip
                int snpInt = sd.bases[readPos] == refBases[refPos] ? 0 : 1;
-                // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
                isSNP[readPos] = snpInt;
                nEvents += snpInt;
                readPos++;
@ -439,59 +225,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<i
            readPos += len;
        }
    }
-
-    // const uint32_t* cigar = bam_get_cigar(bw->b);
-    // const bam1_core_t& bc = bw->b->core;
-    // uint8_t* seq = bam_get_seq(bw->b);
-    // for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
-    //     const char c = bam_cigar_opchr(cigar[i]);
-    //     int len = bam_cigar_oplen(cigar[i]);
-    //     if (i == sd.cigar_start) len -= sd.first_cigar_clip;
-    //     if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
-    //     if (c == 'M' || c == '=' || c == 'X') {
-    //         for (int j = 0; j < len; ++j) {
-    //             // 按位置将read和ref碱基进行比较，不同则是snp，注意read起始位置要加上left_clip
-    //             int snpInt = cBaseToChar[bam_seqi(seq, readPos + sd.left_clip)] == refBases[refPos] ? 0 : 1;
-    //             // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
-    //             isSNP[readPos] = snpInt;
-    //             nEvents += snpInt;
-    //             readPos++;
-    //             refPos++;
-    //         }
-    //     } else if (c == 'D') {
-    //         // 应该是在上一个消耗碱基的cigar的最后一个位置，标记Del
-    //         int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
-    //         updateIndel(isDel, index);
-    //         refPos += len;
-    //     } else if (c == 'N') {
-    //         refPos += len;
-    //     } else if (c == 'I') {
-    //         // 与Del不同，Ins应该是在下一个cigar开始的位置，标记Ins
-    //         bool forwardStrandRead = !bw->GetReadNegativeStrandFlag();
-    //         if (forwardStrandRead) {
-    //             updateIndel(isIns, readPos - 1);
-    //         }
-    //         readPos += len;
-    //         if (!forwardStrandRead) {
-    //             updateIndel(isIns, readPos);
-    //         }
-    //     } else if (c == 'S') {
-    //         readPos += len;
-    //     }
-    // }
-
    nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
-    // spdlog::info("nEvents: {}", nEvents);
-
-    //spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0),
-    //             std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents);
-    // exit(0);

    return nEvents;
 }

 // 简单计算baq数组，就是全部赋值为'@' (64)
-bool flatBAQArray(SamData& sd, vector<uint8_t>& baqArray) {
+bool flatBAQArray(SamData& sd, StableArray<uint8_t>& baqArray) {
    baqArray.resize(sd.read_len, (uint8_t)'@');
    return true;
 }
@ -514,12 +254,13 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
 }

 // 计算与read有交叉的已知位点信息， 应该要判断一下，是按照read的范围去读取vcf，还是按照一个batch read的范围去读取
-void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &knownSites) {
+void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, StableArray<uint8_t>& knownSites) {
    BamWrap* bw = sd.bw;
    int tid = bw->contig_id();
    uint64_t startPos = bw->start_pos();   // 闭区间
    uint64_t endPos = bw->end_pos(); // 闭区间
-    // spdlog::info("bam {}, {}", startPos, endPos);
+    knownSites.resize(sd.read_len, 0);
+
    // update vcfs
    for(auto &vcf : vcfs) {
        // 清理旧的interval
@ -533,15 +274,11 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
        }
        if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue;

-        // spdlog::info("intv {}, {}, {}", vcf.knownSites.size(), vcf.knownSites.front().right, vcf.knownSites.front().right);
-        // exit(0);
-        //spdlog::info("before intervals : {}", vcf.knownSites.size());
        // 读取新的interval
        int64_t fpos, flen;
        endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
        Interval readIntv(startPos, endPos);
        vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
-        //spdlog::info("file index: {}, {}", fpos, flen);
        if (flen > 0) {
            vcf.inStm.seekg(fpos, ios::beg);
            if (flen > vcf.bufLen) {
@ -565,20 +302,11 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
                Interval varIntv(varStart, varStart + ref.size() - 1);
                if (readIntv.overlaps(varIntv)) {
                    vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
-                    //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
                }
                get_line_from_buf(buf, flen, &cur, &line);
            }
        }
-        //spdlog::info("after intervals : {}", vcf.knownSites.size());
-        //for(auto &val : vcf.knownSites) {
-        //    spdlog::info("intv {}, {}", val.left, val.right);
-        //}
    }
-    //exit(0);
-    knownSites.resize(sd.read_len);
-    endPos = bw->end_pos();
-    
    for(auto &vcf : vcfs) {
        for (auto &intv : vcf.knownSites) {
            // knownSite is outside clipping window for the read, ignore
@ -602,11 +330,10 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
            }
        }
    }
-
 }

 // 应该是计算一段数据的平均值
-static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<int>& errorArr, vector<double>& fracErrs) {
+static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, StableArray<int>& errorArr, StableArray<double>& fracErrs) {
    int totalErrors = 0;
    for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
        totalErrors += errorArr[j];
@ -617,9 +344,8 @@ static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<in
 }

 // 应该是用来处理BAQ的，把不等于特定BAQ分数的碱基作为一段数据统一处理
-void calculateFractionalErrorArray(vector<int>& errorArr, vector<uint8_t>& baqArr, vector<double>& fracErrs) {
-    // for (auto val : errorArr) { if (val > 0) spdlog::info("snp err val: {}", val); }
-    fracErrs.resize(baqArr.size());
+void calculateFractionalErrorArray(StableArray<int>& errorArr, StableArray<uint8_t>& baqArr, StableArray<double>& fracErrs) {
+    fracErrs.resize(baqArr.size(), 0.0);
    // errorArray和baqArray必须长度相同
    const int BLOCK_START_UNSET = -1;
    bool inBlock = false;
@ -805,11 +531,8 @@ int SerialBQSR() {
        ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
    }

-    int test = 0;
-
-    while (1) {
+    while (true) {
        ++ round;
-
        // 一. 读取bam数据
        size_t readNum = 0;
        if (inBamBuf.ReadStat() >= 0)
@ -818,52 +541,38 @@ int SerialBQSR() {
            break;
        }
        auto bams = inBamBuf.GetBamArr();
-        spdlog::info("{} reads processed in {} round, {}", readNum, round, test);
+        spdlog::info("{} reads processed in {} round", readNum, round);

        // 二. 遍历每个bam（read）记录，进行处理
+        SamData sd;
+        StableArray<int> isSNP, isIns, isDel;  // 该位置是否是SNP, indel位置，0不是，1是
+        StableArray<uint8_t> baqArray;
+        StableArray<double> snpErrors, insErrors, delErrors;
+        StableArray<uint8_t> skips;  // 该位置是否是已知位点

        for (int i = 0; i < bams.size(); ++i) {
            // 1. 对每个read，需要检查cigar是否合法，即没有两个连续的相同的cigar，而且需要将首尾的deletion处理掉，目前看好像没啥影响，我们忽略这一步
            // 2. 对质量分数长度跟碱基长度不匹配的read，缺少的质量分数用默认值补齐，先忽略，后边有需要再处理
            // 3. 如果bam文件之前做过bqsr，tag中包含OQ（originnal quality，原始质量分数），检查用户参数里是否指定用原始质量分数进行bqsr，如果是则将质量分数替换为OQ，否则忽略OQ，先忽略
-            // 4. 对read的两端进行检测，去除（hardclip）adapter
            // spdlog::info("bam idx: {}", i);
            BamWrap* bw = bams[i];
-            SamData sd;
+            sd.init();
+            sd.parseBasic(bw);
            sd.rid = i + readNumSum;
-            sd.bw = bw;
-            sd.read_len = BamWrap::BamEffectiveLength(bw->b);
-            sd.cigar_end = bw->b->core.n_cigar;
            if (sd.read_len <= 0) continue;

-            int adapter_boundary = bw->GetAdapterBoundary();
-            if (bw->IsAdapterInRead(adapter_boundary)) {
-                // adapter在read范围内
-                if (bw->GetReadNegativeStrandFlag()) { // 反链
-                    clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
-                } else { // 正链
-                    clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
-                }
-            }
-            sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
+            PROF_START(clip_read);
+            // 4. 对read的两端进行检测，去除（hardclip）adapter
+            ReadTransformer::hardClipAdaptorSequence(bw, sd); 
+            if (sd.read_len <= 0) continue;
            // 5. 然后再去除softclip部分
-            calculateSoftClip(bw, sd);
-            sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;  // 更新read长度
+            ReadTransformer::hardClipSoftClippedBases(bw, sd);
            if (sd.read_len <= 0) continue;
-
-            calculateRefOffset(bw, sd); // 计算ref_offset，就是相对比对的position，要将ref右移多少
-            calculateReadBases(bw, sd);  // 计算clip处理之后，剩余的碱基
-            // 计算clip之后，两端的softstart和softend
-            calculateSoftStartEnd(bw, sd);
-            calculateCigar(bw, sd);
-
-            //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
-            //             sd.read_len, sd.left_clip, sd.right_clip, sd.ref_offset, sd.cigar_start, sd.cigar_end, bw->cigar_str());
+            // 应用所有的变换，计算samdata的相关信息
+            sd.applyTransformations();
+            PROF_END(gprof[GP_clip_read], clip_read);

            // 6. 更新每个read的platform信息，好像没啥用，暂时忽略
-            vector<int> isSNP(sd.read_len, 0);     // 该位置是否是SNP位置，0不是，1是
-            vector<int> isIns(sd.read_len, 0); // 该位置是否是插入位置，0不是，1是
-            vector<int> isDel(sd.read_len, 0); // 该位置是否是删除位置，0不是，1是
            const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], sd, isSNP, isIns, isDel);

            /*fprintf(gf[0], "%d\t", sd.read_len);
@ -877,16 +586,11 @@ int SerialBQSR() {
            fprintf(gf[2], "\n");
            */

-            // spdlog::info("nErrors: {}", nErrors);
-            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
-
-            //exit(0);
-
            // 7. 计算baqArray
            // BAQ = base alignment quality
            // note for efficiency reasons we don't compute the BAQ array unless we actually have
            // some error to marginalize over.  For ILMN data ~85% of reads have no error
-            vector<uint8_t> baqArray;
+            // vector<uint8_t> baqArray;
            bool baqCalculated = false;
            if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
                baqCalculated = flatBAQArray(sd, baqArray);
@ -922,13 +626,7 @@ int SerialBQSR() {
            // }
            // fprintf(gf[3], "\n");

-            //test = readCovariates[1][0][0] + readCovariates[2][1][3];
-            //int end_pos = bw->contig_end_pos();
-            //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
-            //             bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
-            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); }
            // 9. 计算这条read需要跳过的位置
-            vector<bool> skips(sd.read_len, 0);
            PROF_START(known_sites);
            calculateKnownSites(sd, nsgv::gAuxVars[0].vcfArr, skips);
            for (int ii = 0; ii < sd.read_len; ++ii) {
@ -936,27 +634,19 @@ int SerialBQSR() {
                            sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
            }
            PROF_END(gprof[GP_read_vcf], known_sites);
-            // fprintf(gf[0], "%d\t", sd.read_len);
+            // fprintf(gf[0], "%ld %d\t", sd.rid, sd.read_len);
            // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
            // fprintf(gf[0], "\n");

            // 10. 根据BAQ进一步处理snp，indel，得到处理后的数据
-            vector<double> snpErrors, insErrors, delErrors;
-            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-2: {}", val); }
            calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
            calculateFractionalErrorArray(isIns, baqArray, insErrors);
            calculateFractionalErrorArray(isDel, baqArray, delErrors);

-            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
-            //spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), sd.read_len);
-            //for (auto val : snpErrors) { if (val > 0) spdlog::info("snp err val: {}", val); }
-
            // aggregate all of the info into our info object, and update the data
            // 11. 合并之前计算的数据，得到info，并更新bqsr table数据
            ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors);
-            int m = 0;
-            // for (auto err : snpErrors) { if (isSNP[m] > 0 || err > 0) spdlog::info("snp err: {} : {}", isSNP[m++], err); }
-            //exit(0);
+
            PROF_START(update_info);
            updateRecalTablesForRead(info);
            PROF_END(gprof[GP_update_info], update_info);
--- a/src/bqsr/covariate.cpp
+++ b/src/bqsr/covariate.cpp
@ -99,7 +99,7 @@ static char SimpleComplement(const char base) {
    }
 }

-static void ClipLowQualEndsWithN(string& bases, const FastArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
+static void ClipLowQualEndsWithN(string& bases, const StableArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
    // 处理左边
    int left = 0;
    int readLen = bases.size();
@ -245,7 +245,7 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
    const int originalReadLength = sd.read_len;

    // store the original bases and then write Ns over low quality ones
-    string strandedClippedBases(sd.bases);
+    string strandedClippedBases(sd.bases.arr.data(), sd.read_len);
    // GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值
    GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail);  // 命名我之前看到过这个30的？
    // spdlog::info("bases: {}", strandedClippedBases);
--- a/src/bqsr/covariate.h
+++ b/src/bqsr/covariate.h
@ -19,6 +19,7 @@

 #include "bqsr_args.h"
 #include "util/bam_wrap.h"
+#include "util/sam_data.h"

 using std::map;
 using std::string;
--- a/src/bqsr/read_recal_info.h
+++ b/src/bqsr/read_recal_info.h
@ -9,19 +9,23 @@

 #pragma once

-#include "util/bam_wrap.h"
+#include <vector>
+
 #include "covariate.h"
+#include "util/bam_wrap.h"
+#include "util/stable_array.h"
+using std::vector;

 struct ReadRecalInfo {
    SamData& read;
    int length;
    PerReadCovariateMatrix& covariates;
-    vector<bool>& skips;
-    FastArray<uint8_t>&base_quals, &ins_quals, &del_quals;
-    vector<double>&snp_errs, &ins_errs, &del_errs;
+    StableArray<uint8_t>& skips;
+    StableArray<uint8_t>&base_quals, &ins_quals, &del_quals;
+    StableArray<double>&snp_errs, &ins_errs, &del_errs;

-    ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, vector<bool>& _skips, vector<double>& _snp_errs, vector<double>& _ins_errs,
-                  vector<double>& _del_errs)
+    ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, StableArray<uint8_t>& _skips, StableArray<double>& _snp_errs,
+                  StableArray<double>& _ins_errs, StableArray<double>& _del_errs)
        : read(_read),
          covariates(_covariates),
          skips(_skips),
--- a/src/util/bam_wrap.h
+++ b/src/util/bam_wrap.h
@ -21,111 +21,6 @@

 using namespace std;

-struct Cigar {
-    char op = '0';
-    int len = 0;
-    // 该操作符是否消耗read的碱基
-    static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
-    // 该操作符是否消耗参考基因组的碱基
-    static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
-};
-
-struct ReadIdxCigar {
-    int readIdx = -1; // 在read序列中的位置
-    char cigarOp = '0'; // 当前位置对应的cigar
-};
-
-// 不用经常释放array的内存空间，减少频繁的内存开辟和释放操作
-template <class T>
-struct FastArray {
-    vector<T> arr;
-    size_t idx;
-    void clear() { idx = 0; }
-    size_t size() { return idx; }
-    bool empty() { return idx == 0; }
-    void reserve(size_t _size) { arr.reserve(_size); }
-    void resize(size_t _size) {
-        arr.resize(_size);
-        idx = _size;
-    }
-    void push_back(const T& val) {
-        if (idx < arr.size()) {
-            arr[idx++] = val;
-        } else {
-            arr.push_back(val);
-            idx++;
-        }
-    }
-    inline T& operator[](size_t pos) { return arr[pos]; }
-    inline const T& operator[](size_t pos) const { return arr[pos]; }
-    struct iterator {
-        typename std::vector<T>::iterator it;
-        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
-        iterator& operator++() { ++it; return *this;}
-        iterator& operator--() { --it; return *this;}
-        T& operator*() const { return *it; }
-        bool operator!=(const iterator& other) const { return it != other.it; }
-        bool operator==(const iterator& other) const { return it == other.it; }
-    };
-
-    iterator begin() { return arr.begin(); }
-    iterator end() { return arr.begin() + idx; }
-};
-
-// 对原始bam数据的补充，比如对两端进行hardclip等
-class BamWrap;
-struct SamData {
-    int64_t rid = 0; // for debug
-    int read_len = 0;          // read长度，各种clip之后的长度
-    int cigar_start = 0;       // cigar起始位置，闭区间
-    int cigar_end = 0;         // cigar结束位置，开区间
-    int first_cigar_clip = 0;  // 第一个cigar, clip的数量，切左侧
-    int last_cigar_clip = 0;   // 最后一个cigar, clip的数量，切右侧
-    int left_clip = 0;         // 左侧被切掉的碱基长度，BI和BD质量分数也会用到
-    int right_clip = 0;        // 右侧被切掉的碱基长度
-    int ref_offset = 0;        // 切除adapter和softclip之后(softclip应该不影响)，相对原始ref比对位置（contig_pos）的偏移量
-
-    // 记录一下bqsr运算过程中用到的数据，回头提前计算一下，修正现在的复杂逻辑
-    static constexpr int READ_INDEX_NOT_FOUND = -1;
-
-    BamWrap* bw;
-    int64_t start_pos;      // 因为soft clip都被切掉了，这里的softstart应该就是切掉之后的匹配位点，闭区间
-    int64_t end_pos;        // 同上，闭区间
-    string bases;           // 处理之后的read的碱基
-    FastArray<uint8_t> base_quals;  // 对应的质量分数
-    FastArray<uint8_t> ins_quals;  // insert质量分数, BI (大部分应该都没有)
-    FastArray<uint8_t> del_quals;      // delete质量分数, BD (大部分应该都没有)
-
-    FastArray<Cigar> cigars;
-    int64_t& softStart() { return start_pos; }
-    int64_t& softEnd() { return end_pos; }
-
-    // functions
-    ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
-        ReadIdxCigar rc;
-        if (refPos < start_pos)
-            return rc;
-        int firstReadPosOfElement = 0;         // inclusive
-        int firstRefPosOfElement = start_pos;  // inclusive
-        int lastReadPosOfElement = 0;          // exclusive
-        int lastRefPosOfElement = start_pos;   // exclusive
-        // advance forward through all the cigar elements until we bracket the reference coordinate
-        for (auto& cigar : cigars) {
-            firstReadPosOfElement = lastReadPosOfElement;
-            firstRefPosOfElement = lastRefPosOfElement;
-            lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
-            lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
-            if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) {  // refCoord falls within this cigar element
-                int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
-                rc.cigarOp = cigar.op;
-                rc.readIdx = readPosAtRefCoord;
-                return rc;
-            }
-        }
-        return rc;
-    }
-};
-
 /*
    这里的成员函数命名有点混乱，特此说明，小写加下划线的函数命名，无论是静态函数，还是普通成员函数，更侧重说明
    这是类似bam的一个属性，而大写加驼峰命名的函数，更侧重说明这是通过计算得出的。
--- a/src/util/base_utils.cpp
+++ b/src/util/base_utils.cpp
@ -0,0 +1,3 @@
+#include "base_utils.h"
+
+const char BaseUtils::cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
--- a/src/util/base_utils.h
+++ b/src/util/base_utils.h
@ -0,0 +1,20 @@
+/*
+     Description: 碱基相关的工具函数
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2025/12/29
+*/
+#pragma once
+
+struct BaseUtils {
+    // uint8_t转碱基字符
+    static const char cBaseToChar[16];
+
+    // 该操作符是否消耗read的碱基
+    static bool consumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
+
+    // 该操作符是否消耗参考基因组的碱基
+    static bool consumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
+};
--- a/src/util/profiling.cpp
+++ b/src/util/profiling.cpp
@ -54,6 +54,7 @@ int DisplayProfiling(int nthread) {
    // PRINT_GP(sort_wait);
    // PRINT_GP(markdup_wait);
    // PRINT_GP(intersect_wait);
+    PRINT_GP(clip_read);
    PRINT_GP(read_ref);
    PRINT_GP(read_vcf);
    PRINT_GP(covariate);
--- a/src/util/profiling.h
+++ b/src/util/profiling.h
@ -40,6 +40,7 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
 enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
 enum {
    GP_read_wait = 11,
+    GP_clip_read,
    GP_covariate,
    GP_read_ref,
    GP_read_vcf,
--- a/src/util/read_transformer.h
+++ b/src/util/read_transformer.h
@ -0,0 +1,154 @@
+/*
+     Description: read (sam record) 相关的工具函数，比如用于clipping低质量碱基等
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2025/12/29
+*/
+#pragma once
+
+#include "bam_wrap.h"
+#include "sam_data.h"
+#include "base_utils.h"
+
+// 用于对read进行各种转换操作，比如clipping等。注意这里都是逻辑操作，最后需要调用SamData.applyTransformations()来真正应用这些修改
+struct ReadTransformer {
+    // 给定一个ref位置，在read内部找到对应的位置和操作符
+    struct PosAndOperator {
+        int readPosAtRefCoord = -1;  // read中的位置
+        char cigarOperator = '0';    // cigar操作符
+        int cigarIndex = -1;         // cigar索引
+        int cigarLen = 0;
+        int preCigarLen = 0;  // 截止cigar之前的，消耗read base的长度
+    };
+
+    /**
+     * Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
+     * the element containing that base.  If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
+     * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
+     *
+     * @param alignmentStart        The soft start of the read on the reference
+     * @param cigar                 The read's cigar
+     * @param refCoord              The target reference coordinate
+     * @return                      If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
+     *                              if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read
+     * coordinate; if the reference coordinate falls within a deletion, the first read coordinate after the deletion.  Note: if the last cigar element
+     * is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
+     */
+    static PosAndOperator getReadIndexForReferenceCoordinate(BamWrap* bw, int alignmentStart, int refCoord) {
+        PosAndOperator po;
+        if (refCoord < alignmentStart) {
+            return po;
+        }
+        int firstReadPosOfElement = 0;              // inclusive
+        int firstRefPosOfElement = alignmentStart;  // inclusive
+        int lastReadPosOfElement = 0;               // exclusive
+        int lastRefPosOfElement = alignmentStart;   // exclusive
+
+        // advance forward through all the cigar elements until we bracket the reference coordinate
+        const uint32_t* cigar = bam_get_cigar(bw->b);
+        const bam1_core_t& bc = bw->b->core;
+        const int idx = bc.n_cigar - 1;
+        if (idx < 0)
+            return po;
+        for (int i = 0; i < bc.n_cigar; ++i) {
+            const char c = bam_cigar_opchr(cigar[i]);
+            const int len = bam_cigar_oplen(cigar[i]);
+            firstReadPosOfElement = lastReadPosOfElement;
+            firstRefPosOfElement = lastRefPosOfElement;
+            lastReadPosOfElement += BaseUtils::consumeReadBases(c) ? len : 0;
+            lastRefPosOfElement += (BaseUtils::consumeRefBases(c) || c == 'S') ? len : 0;
+            if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) {  // refCoord falls within this cigar element
+                int readPosAtRefCoord = firstReadPosOfElement + (BaseUtils::consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
+                return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
+            }
+        }
+        return po;
+    }
+
+    // 根据adapter位置，对read进行hardclip，返回左侧或右侧减掉的base数量
+    static void clipByReferenceCoordinates(BamWrap* bw, int refStart, int refStop, SamData& sd) {
+        int start, stop;
+        // Determine the read coordinate to start and stop hard clipping
+        if (refStart < 0) {
+            if (refStop < 0)
+                return;
+            PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
+            // if the refStop falls in a deletion, the above method returns the position after the deletion.  Since the stop we return here
+            // is inclusive, we decrement the stop to avoid overclipping by one base.  As a result we do not clip the deletion, which is fine.
+            stop = stopPosAndOperator.readPosAtRefCoord - (BaseUtils::consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
+            sd.left_clip = stop + 1;
+            sd.cigar_start = stopPosAndOperator.cigarIndex;
+            sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen;
+        } else {
+            if (refStop >= 0)
+                return;
+            // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion
+            // avoids overclipping
+            PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
+            start = startPosAndOperator.readPosAtRefCoord;
+            sd.right_clip = bw->b->core.l_qseq - start;
+            sd.cigar_end = startPosAndOperator.cigarIndex + 1;
+            sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
+        }
+    }
+
+    // 切掉adaper序列，注意这里的clipping只是逻辑上的，实际并没有修改bam record
+    static void hardClipAdaptorSequence(BamWrap* bw, SamData& sd) {
+        int adapter_boundary = bw->GetAdapterBoundary();
+        if (bw->IsAdapterInRead(adapter_boundary)) {
+            // adapter在read范围内
+            if (bw->GetReadNegativeStrandFlag()) {  // 反链
+                clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
+            } else {  // 正链
+                clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
+            }
+        }
+        sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;  // 更新read长度
+    }
+
+    // 计算read两端softclip的碱基数量，切掉softclip序列
+    static void hardClipSoftClippedBases(BamWrap* bw, SamData& sd) {
+        const uint32_t* cigar = bam_get_cigar(bw->b);
+        const bam1_core_t& bc = bw->b->core;
+        int readIndex = sd.left_clip;
+        int cutLeft = -1;   // first position to hard clip (inclusive)
+        int cutRight = -1;  // first position to hard clip (inclusive)
+        int cigar_start = sd.cigar_start;
+        int cigar_end = sd.cigar_end;
+        bool rightTail = false;  // trigger to stop clipping the left tail and start cutting the right tail
+
+        for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
+            const char c = bam_cigar_opchr(cigar[i]);
+            int len = bam_cigar_oplen(cigar[i]);
+            if (i == sd.cigar_start) len -= sd.first_cigar_clip;
+            if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
+            if (c == 'S') {
+                if (rightTail) {
+                    cutRight = readIndex;
+                    cigar_end = i;
+                } else {
+                    cutLeft = readIndex + len - 1;
+                    cigar_start = i + 1;
+                }
+            } else if (c != 'H') {
+                rightTail = true;
+            }
+            if (BaseUtils::consumeReadBases(c)) {
+                readIndex += len;
+            }
+        }
+        if (cutRight >= 0) {
+            sd.right_clip = bw->b->core.l_qseq - cutRight;
+            sd.cigar_end = cigar_end;
+            sd.last_cigar_clip = 0;
+        }
+        if (cutLeft >= 0) {
+            sd.left_clip = cutLeft + 1;
+            sd.cigar_start = cigar_start;
+            sd.first_cigar_clip = 0;
+        }
+        sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;  // 更新read长度
+    }
+};
--- a/src/util/read_utils.h
+++ b/src/util/read_utils.h
--- a/src/util/sam_data.h
+++ b/src/util/sam_data.h
@ -0,0 +1,178 @@
+/*
+     Description: 对原始bam进行部分数据解析，以更方便后续处理
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2025/12/29
+*/
+#pragma once
+
+#include <cstdint>
+
+#include "bam_wrap.h"
+#include "stable_array.h"
+#include "base_utils.h"
+
+// 对cigar进行简单包装
+struct Cigar {
+    char op = '0';
+    int len = 0;
+    // 该操作符是否消耗read的碱基
+    static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
+    // 该操作符是否消耗参考基因组的碱基
+    static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
+};
+
+// 在read中某个特定位置对应的cigar操作符
+struct ReadIdxCigar {
+    int readIdx = -1;    // 在read序列中的位置
+    char cigarOp = '0';  // 当前位置对应的cigar
+};
+
+// 对原始bam进行部分数据解析，以更方便后续处理
+struct SamData {
+    int64_t rid = 0;           // for debug
+
+    // 这些数据用于最开始clipping等操作
+    int read_len = 0;          // read长度，各种clip之后的长度
+    int cigar_start = 0;       // cigar起始位置，闭区间
+    int cigar_end = 0;         // cigar结束位置，开区间
+    int first_cigar_clip = 0;  // 第一个cigar, clip的数量，切左侧
+    int last_cigar_clip = 0;   // 最后一个cigar, clip的数量，切右侧
+    int left_clip = 0;         // 左侧被切掉的碱基长度，BI和BD质量分数也会用到
+    int right_clip = 0;        // 右侧被切掉的碱基长度
+    int ref_offset = 0;        // 切除adapter和softclip之后(softclip应该不影响)，相对原始ref比对位置（contig_pos）的偏移量
+
+    // 记录一下bqsr运算过程中用到的数据，回头提前计算一下，修正现在的复杂逻辑
+    static constexpr int READ_INDEX_NOT_FOUND = -1;
+
+    BamWrap* bw;
+    int64_t start_pos;              // 因为soft clip都被切掉了，这里的softstart应该就是切掉之后的匹配位点，闭区间
+    int64_t end_pos;                // 同上，闭区间
+    StableArray<char> bases; // 处理之后的read的碱基数组形式
+    StableArray<uint8_t> base_quals;  // 对应的质量分数
+    StableArray<uint8_t> ins_quals;   // insert质量分数, BI (大部分应该都没有)
+    StableArray<uint8_t> del_quals;   // delete质量分数, BD (大部分应该都没有)
+    StableArray<Cigar> cigars;
+
+    int64_t& softStart() { return start_pos; }
+    int64_t& softEnd() { return end_pos; }
+
+    // functions
+
+    // 初始化
+    void init() {
+        bases.clear();
+        base_quals.clear();
+        ins_quals.clear();
+        del_quals.clear();
+        cigars.clear();
+        rid = 0;
+        read_len = 0;
+        cigar_start = 0;
+        cigar_end = 0;
+        first_cigar_clip = 0;
+        last_cigar_clip = 0;
+        left_clip = 0;
+        right_clip = 0;
+        ref_offset = 0;
+        bw = nullptr;
+        start_pos = 0;
+        end_pos = 0;
+    }
+
+    // 初步解析bam
+    void parseBasic(BamWrap *_bw) {
+        bw = _bw;
+        read_len = BamWrap::BamEffectiveLength(bw->b);
+        cigar_end = bw->b->core.n_cigar;
+    }
+
+    // 应用各种转换操作，更新SamData中的数据
+    void applyTransformations() {
+        const uint32_t* cigar = bam_get_cigar(bw->b);
+        const bam1_core_t& bc = bw->b->core;
+        int i = 0;
+
+        // 计算ref_offset，就是相对比对的position，要将ref右移多少
+        for (i = 0; i < cigar_start; ++i) {
+            if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
+                ref_offset += bam_cigar_oplen(cigar[i]);
+            }
+        }
+        if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
+            ref_offset += first_cigar_clip;
+        }
+
+        // 计算clip处理之后，剩余的碱基
+        bases.resize(read_len);
+        base_quals.resize(read_len);
+        uint8_t* seq = bam_get_seq(bw->b);
+        uint8_t* quals = bam_get_qual(bw->b);
+        for (i = 0; i < read_len; ++i) {
+            bases[i] = BaseUtils::cBaseToChar[bam_seqi(seq, i + left_clip)];
+            base_quals[i] = quals[i + left_clip];
+        }
+
+        // 计算read两端clip之后的softstart和softend，其实S之前都被切掉了
+        int64_t softStart = bw->b->core.pos + ref_offset;
+        int64_t softEnd = softStart - 1;  // 闭区间
+        bool rightTail = false;
+        for (i = cigar_start; i < cigar_end; ++i) {
+            const char c = bam_cigar_opchr(cigar[i]);
+            int len = bam_cigar_oplen(cigar[i]);
+            if (i == cigar_start) len -= first_cigar_clip;
+            if (i == cigar_end - 1) len -= last_cigar_clip;
+            // if (c == 'S' || c == 'I')
+            if (c == 'S') softStart -= len;
+            else if (c != 'H') rightTail = true;
+            if (rightTail && (BaseUtils::consumeRefBases(c) || c == 'S')) softEnd += len;
+        }
+        this->softStart() = softStart; // 其实这里的softStart就是start
+        this->softEnd() = softEnd; // 同上
+
+        // 计算clip之后的cigar，其实可以考虑下边的代码和上边的换一下位置
+        for (i = cigar_start; i < cigar_end; ++i) {
+            char c = bam_cigar_opchr(cigar[i]);
+            int len = bam_cigar_oplen(cigar[i]);
+            if (i == cigar_start) len -= first_cigar_clip;
+            if (i == cigar_end - 1) len -= last_cigar_clip;
+            // if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion
+            if (i == cigar_start && c == 'D') {  // 跳过开头的deletion
+                c = 'H';
+                ref_offset += len;
+                start_pos += len; // 更新起始位置
+            } else if (i == cigar_end - 1 && c == 'D') { // 跳过结尾的deletion
+                c = 'H';
+                softEnd -= len; // 更新结束位置
+            }
+            cigars.push_back({c, len});
+        }
+    }
+
+    // 给定一个ref pos，返回对应的read index和cigar操作
+    ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
+        ReadIdxCigar rc;
+        if (refPos < start_pos)
+            return rc;
+        int firstReadPosOfElement = 0;         // inclusive
+        int firstRefPosOfElement = start_pos;  // inclusive
+        int lastReadPosOfElement = 0;          // exclusive
+        int lastRefPosOfElement = start_pos;   // exclusive
+        // advance forward through all the cigar elements until we bracket the reference coordinate
+        for (auto& cigar : cigars) {
+            firstReadPosOfElement = lastReadPosOfElement;
+            firstRefPosOfElement = lastRefPosOfElement;
+            lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
+            lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
+            if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) {  // refCoord falls within this cigar element
+                int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
+                rc.cigarOp = cigar.op;
+                rc.readIdx = readPosAtRefCoord;
+                return rc;
+            }
+        }
+        return rc;
+    }
+};
--- a/src/util/stable_array.h
+++ b/src/util/stable_array.h
@ -0,0 +1,73 @@
+/*
+     Description: 不用经常释放array的内存空间，减少频繁的内存开辟和释放操作
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2025/12/29
+*/
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+
+using std::vector;
+
+// 不能用于bool类型, 因为在c++中vector<bool>是特化过的
+template <class T>
+struct StableArray {
+    vector<T> arr;
+    size_t idx = 0;
+    void clear() { idx = 0; }
+    size_t size() { return idx; }
+    bool empty() { return idx == 0; }
+    void reserve(size_t _size) {
+        if (arr.size() < _size)
+            arr.reserve(_size);
+    }
+    void resize(size_t _size) {
+        if (arr.size() < _size) {
+            arr.resize(_size);
+        }
+        idx = _size;
+    }
+
+    void resize(size_t _size, const T &val) {
+        if (arr.size() < _size) {
+            arr.resize(_size);
+        }
+        for (size_t i = 0; i < _size; ++i) {
+            arr[i] = val;
+        }
+        idx = _size;
+    }
+    void push_back(const T& val) {
+        if (idx < arr.size()) {
+            arr[idx++] = val;
+        } else {
+            arr.push_back(val);
+            idx++;
+        }
+    }
+    inline T& operator[](size_t pos) { return arr[pos]; }
+    //inline const T& operator[](size_t pos) { return arr[pos]; }
+    inline const T& operator[](size_t pos) const { return arr[pos]; }
+    struct iterator {
+        typename std::vector<T>::iterator it;
+        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
+        iterator& operator++() {
+            ++it;
+            return *this;
+        }
+        iterator& operator--() {
+            --it;
+            return *this;
+        }
+        T& operator*() const { return *it; }
+        bool operator!=(const iterator& other) const { return it != other.it; }
+        bool operator==(const iterator& other) const { return it == other.it; }
+    };
+
+    iterator begin() { return arr.begin(); }
+    iterator end() { return arr.begin() + idx; }
+};