串行版本还差最后一步，将信息合并到数据汇总表中

2025-12-24 12:47:26 +08:00 · 2025-12-24 12:47:26 +08:00 · 25f079b936
parent 1e9b58fac1
commit 25f079b936
9 changed files with 274 additions and 42 deletions
--- a/src/bqsr/baq.cpp
+++ b/src/bqsr/baq.cpp
@ -6,7 +6,7 @@ vector<double> BAQ::qual2prob(256); // 质量分数转化概率
 vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1)));  // [ref][read][qual]

 // 计算baq数组，返回成功与否
-bool BAQ::calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray) {
+bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
    // 检测ref是否覆盖了read
    if (ref.size() < refOffset + ad.read_len) {
        spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
--- a/src/bqsr/baq.h
+++ b/src/bqsr/baq.h
@ -84,5 +84,5 @@ struct BAQ {
    double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }

    // 计算baq数组，返回成功与否
-    bool calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray);
+    bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray);
 };
--- a/src/bqsr/bqsr_entry.cpp
+++ b/src/bqsr/bqsr_entry.cpp
@ -36,6 +36,9 @@ Date : 2023/10/23
 using std::deque;

 #define BAM_BLOCK_SIZE 16L * 1024 * 1024
+#define MAX_SITES_INTERVAL 100000
+
+const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';

 const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};

@ -210,7 +213,7 @@ PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStar
 }

 // 根据adapter位置，对read进行hardclip，返回左侧或右侧减掉的base数量
-void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAdditionData &ad) {
+void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &ad) {
    int start, stop;
    // Determine the read coordinate to start and stop hard clipping
    if (refStart < 0) {
@ -235,7 +238,7 @@ void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAddi
 }

 // 计算切掉adapter之后，ref相对原始ref的偏移量
-void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
+void calculateRefOffset(BamWrap *bw, SamData &ad) {
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int i = 0;
@ -253,16 +256,68 @@ void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
 }

 // 计算clip处理之后，剩余的碱基
-void calculateReadBases(BamWrap* bw, ReadAdditionData& ad) {
+void calculateReadBases(BamWrap* bw, SamData& ad) {
    ad.bases.resize(ad.read_len);
+    ad.quals.resize(ad.read_len);
    uint8_t* seq = bam_get_seq(bw->b);
+    uint8_t* quals = bam_get_qual(bw->b);
    for (int i = 0; i < ad.read_len; ++i) {
        ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
+        ad.quals[i] = quals[i];
    }
 }

+// 计算read两端clip之后的softstart和softend
+void calculateSoftStartEnd(BamWrap* bw, SamData& ad) {
+    int64_t softStart = bw->b->core.pos + ad.ref_offset;
+    int64_t softEnd = softStart - 1; // 闭区间
+    const uint32_t* cigar = bam_get_cigar(bw->b);
+    const bam1_core_t& bc = bw->b->core;
+    int cigar_start = ad.cigar_start;
+    int cigar_end = ad.cigar_end;
+    bool rightTail = false;
+    for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
+        const char c = bam_cigar_opchr(cigar[i]);
+        int len = bam_cigar_oplen(cigar[i]);
+        if (i == ad.cigar_start) len -= ad.first_cigar_clip;
+        if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
+        // if (c == 'S' || c == 'I')
+        if (c == 'S')
+            softStart -= len;
+        else if (c != 'H')
+            rightTail = true;
+        if (rightTail) {
+            if (consumeRefBases(c) || c == 'S')
+                softEnd += len;
+        }
+    }
+    ad.softStart() = softStart;
+    ad.softEnd() = softEnd;
+}
+
+// 计算clip之后的cigar
+void calculateCigar(BamWrap* bw, SamData& ad) {
+    ad.cigars.clear();
+    const uint32_t* cigar = bam_get_cigar(bw->b);
+    const bam1_core_t& bc = bw->b->core;
+    int cigar_start = ad.cigar_start;
+    int cigar_end = ad.cigar_end;
+    for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
+        const char c = bam_cigar_opchr(cigar[i]);
+        int len = bam_cigar_oplen(cigar[i]);
+        if (i == ad.cigar_start)
+            len -= ad.first_cigar_clip;
+        if (i == ad.cigar_end - 1)
+            len -= ad.last_cigar_clip;
+        ad.cigars.push_back({c, len});
+    }
+    //for(auto &cigar : ad.cigars) {
+    //    spdlog::info("op: {}, len: {}", cigar.op, cigar.len);
+    //}
+}
+
 // 计算read两端softclip的碱基数量，可能会修改ad里的clip值
-void calculateSoftClip(BamWrap *bw, ReadAdditionData &ad) {
+void calculateSoftClip(BamWrap *bw, SamData &ad) {
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int readIndex = ad.left_clip;
@ -324,10 +379,12 @@ inline void updateIndel(vector<int> &isIndel, int index) {
 }

 // 计算该read的每个碱基位置是否是SNP或Indel
-int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
+int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
    // 1. 读取参考基因组，先看看串行运行性能，稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
    Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间
+    PROF_START(ref);
    read_ref_base(aux, interval.left, interval);
+    PROF_END(gprof[GP_read_ref], ref);
    string refBases(aux.ref_seq);
    // spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());

@ -381,13 +438,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector
 }

 // 简单计算baq数组，就是全部赋值为'@' (64)
-bool flatBAQArray(BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) { 
+bool flatBAQArray(BamWrap* bw, SamData& ad, vector<int>& baqArray) { 
    baqArray.resize(ad.read_len, (int)'@'); 
    return true;
 }

 // 计算真实的baq数组，耗时更多，好像enable-baq参数默认是关闭的，那就先不实现这个了
-bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) {
+bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, SamData& ad, vector<int>& baqArray) {
    baqArray.resize(ad.read_len, 0);
    return true;
 }
@ -404,7 +461,7 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
 }

 // 计算与read有交叉的已知位点信息， 应该要判断一下，是按照read的范围去读取vcf，还是按照一个batch read的范围去读取
-void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &vcfs) {
+void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> knownSites) {
    int tid = bw->contig_id();
    uint64_t startPos = bw->start_pos();   // 闭区间
    uint64_t endPos = bw->end_pos(); // 闭区间
@ -427,6 +484,8 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
        //spdlog::info("before intervals : {}", vcf.knownSites.size());
        // 读取新的interval
        int64_t fpos, flen;
+        endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
+        Interval readIntv(startPos, endPos);
        vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
        //spdlog::info("file index: {}, {}", fpos, flen);
        if (flen > 0) {
@ -448,7 +507,9 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
                string id, ref;
                ss_line >> stid >> pos >> id >> ref;
                tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str());
-                if (tid >= 0 && pos > 0) {
+                int64_t varStart = BamWrap::bam_global_pos(tid, pos);
+                Interval varIntv(varStart, varStart + ref.size() - 1);
+                if (readIntv.overlaps(varIntv)) {
                    vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
                    //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
                }
@ -461,6 +522,74 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
        //}
    }
    //exit(0);
+    knownSites.resize(ad.read_len);
+    endPos = bw->end_pos();
+    
+    for(auto &vcf : vcfs) {
+        for (auto &intv : vcf.knownSites) {
+            // knownSite is outside clipping window for the read, ignore
+            if (intv.right < ad.softStart())
+                continue;
+            if (intv.left > ad.softEnd())
+                break;
+            // 计算对应的位点
+            ReadIdxCigar rcStart = ad.getReadIndexForReferenceCoordinate(intv.left);
+            int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx;
+            if (rcStart.cigarOp == 'D') {
+                --featureStartOnRead;
+            }
+            ReadIdxCigar rcEnd = ad.getReadIndexForReferenceCoordinate(intv.right);
+            int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? ad.read_len : rcEnd.readIdx;
+            if (featureStartOnRead > ad.read_len) {
+                featureStartOnRead = featureEndOnRead = ad.read_len;
+            }
+            for (int i = max(0, featureStartOnRead); i < min(ad.read_len, featureEndOnRead + 1); ++i) {
+                knownSites[i] = true;
+            }
+        }
+    }
+
+}
+
+// 应该是计算一段数据的平均值
+static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<int>& errorArr, vector<double>& fracErrs) {
+    int totalErrors = 0;
+    for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
+        totalErrors += errorArr[j];
+    }
+    for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
+        fracErrs[j] = ((double)totalErrors) / ((double)(i - max(0, blockStartIndex - 1) + 1));
+    }
+}
+
+// 应该是用来处理BAQ的，把不等于特定BAQ分数的碱基作为一段数据统一处理
+void calculateFractionalErrorArray(vector<int>& errorArr, vector<int>& baqArr, vector<double>& fracErrs) {
+    fracErrs.resize(baqArr.size());
+    // errorArray和baqArray必须长度相同
+    const int BLOCK_START_UNSET = -1;
+    bool inBlock = false;
+    int blockStartIndex = BLOCK_START_UNSET;
+    int i;
+    for (i = 0; i < fracErrs.size(); ++i) {
+        if (baqArr[i] == NO_BAQ_UNCERTAINTY) {  // 不等于NO_BAQ_UNCERTAINTY的碱基当成一段数据来处理
+            if (!inBlock) {
+                fracErrs[i] = errorArr[i];
+            } else {
+                calculateAndStoreErrorsInBlock(i, blockStartIndex, errorArr, fracErrs);
+                inBlock = false;  // reset state variables
+                blockStartIndex = BLOCK_START_UNSET;  // reset state variables
+            }
+        } else {
+            inBlock = true;
+            if (blockStartIndex == BLOCK_START_UNSET) {
+                blockStartIndex = i;
+            }
+        }
+    }
+    // 处理最后一个block
+    if (inBlock) {
+        calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs);
+    }
 }

 // 串行bqsr
@ -513,7 +642,7 @@ int SerialBQSR() {
            // 3. 如果bam文件之前做过bqsr，tag中包含OQ（originnal quality，原始质量分数），检查用户参数里是否指定用原始质量分数进行bqsr，如果是则将质量分数替换为OQ，否则忽略OQ，先忽略
            // 4. 对read的两端进行检测，去除（hardclip）adapter
            BamWrap *bw = bams[i];
-            ReadAdditionData ad;
+            SamData ad;
            ad.read_len = BamWrap::BamEffectiveLength(bw->b);
            ad.cigar_end = bw->b->core.n_cigar;
            if (ad.read_len <= 0) continue;
@ -535,6 +664,9 @@ int SerialBQSR() {

            calculateRefOffset(bw, ad); // 计算ref_offset，就是相对比对的position，要将ref右移多少
            calculateReadBases(bw, ad);  // 计算clip处理之后，剩余的碱基
+            // 计算clip之后，两端的softstart和softend
+            calculateSoftStartEnd(bw, ad);
+            calculateCigar(bw, ad);

            //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
            //             ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str());
@ -560,7 +692,9 @@ int SerialBQSR() {
            // 到这里，基本的数据都准备好了，后续就是进行bqsr的统计了

            // 8. 计算这条read对应的协变量
+            PROF_START(covariate);
            CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
+            PROF_END(gprof[GP_covariate], covariate);
            test = readCovariates[1][0][0] + readCovariates[2][1][3];
            int end_pos = bw->contig_end_pos();
            //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
@ -568,7 +702,23 @@ int SerialBQSR() {
        
            // 9. 计算这条read需要跳过的位置
            vector<bool> skip(ad.read_len, 0);
-            calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr);
+            PROF_START(known_sites);
+            calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr, skip);
+            for (int ii = 0; ii < ad.read_len; ++ii) {
+                skip[ii] =
+                    skip[ii] || (ContextCovariate::baseIndexMap[ad.bases[ii]] == -1) || ad.quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
+            }
+            PROF_END(gprof[GP_read_vcf], known_sites);
+
+            // 10. 根据BAQ进一步处理snp，indel，得到处理后的数据
+            vector<double> snpErrors, insErrors, delErrors;
+            calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
+            calculateFractionalErrorArray(isIns, baqArray, insErrors);
+            calculateFractionalErrorArray(isDel, baqArray, delErrors);
+
+            // aggregate all of the info into our info object, and update the data
+            // 11. 合并之前计算的数据，得到info，并更新bqsr table数据
+
        }

 #if 0
--- a/src/bqsr/bqsr_pipeline.h
+++ b/src/bqsr/bqsr_pipeline.h
@ -9,7 +9,7 @@
 struct ReadData {
    vector<BamWrap *> bams;   // read step output
    int64_t bamStartIdx = 0;  // bam，bam
-    int64_t taskSeq = 0;      // 
+    int64_t taskSeq = 0;      //
 };

 struct GenREData {
--- a/src/bqsr/covariate.cpp
+++ b/src/bqsr/covariate.cpp
@ -22,16 +22,16 @@ int CycleCovariate::MAXIMUM_CYCLE_VALUE;

 // for CovariateUtils
 // 对一条read计算协变量（该协变量被上一个read用过）
-void CovariateUtils::ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
+void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                            bool recordIndelValues) {
-    //ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
-    //BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
-    //ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
-    //CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
 }

 // ReadGroupCovariate 协变量的方法
-void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
+void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    uint8_t *rgStr = bam_aux_get(bw->b, "RG");
    char* rgVal = nullptr;
    if (rgStr) rgVal = bam_aux2Z(rgStr);
@ -47,7 +47,7 @@ void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr
 }

 // BaseQualityCovariate 协变量的方法
-void BaseQualityCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
+void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                        bool recordIndelValues) {
    // 在前面的处理过后，quals应该和base长度一致了
 #define __bq_set_cov(ins, del)                                                                                           \
@ -98,7 +98,7 @@ static char SimpleComplement(const char base) {
 }

 // 获取去除低质量分数碱基之后的read碱基序列（将低质量分数的碱基变成N）
-void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail) {
+void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
    uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;

    if (bw->GetReadNegativeStrandFlag()) {  // 反向互补
@ -218,7 +218,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const i
    }
 }

-void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
+void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int originalReadLength = ad.read_len;

    // store the original bases and then write Ns over low quality ones
@ -270,7 +270,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t
 * @param maxCycle max value of the base to compute the key for
 *                 (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
 */
-int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle) {
+int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
    const bool isNegStrand = bw->GetReadNegativeStrandFlag();
    const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
    const int readLength = ad.read_len;
@ -299,7 +299,7 @@ int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNu
 }

 // Used to pick out the covariate's value from attributes of the read
-void CycleCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
+void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int readLength = ad.read_len;
    // Note: duplicate the loop to void checking recordIndelValues on every iteration
    if (recordIndelValues) {
--- a/src/bqsr/covariate.h
+++ b/src/bqsr/covariate.h
@ -69,7 +69,7 @@ struct CovariateUtils {
    }

    // 对一条read计算协变量（该协变量被上一个read用过）
-    static void ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };

 // Read group协变量
@ -78,13 +78,13 @@ struct ReadGroupCovariate {
    static map<string, int> RgToId;  // read group name到id的映射
    static map<int, string> IdToRg;  // id到read group name的映射

-    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };

 // Base quality协变量
 struct BaseQualityCovariate {
    static constexpr int index = 1;  // 在协变量数组中的索引位置
-    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };

 // Context协变量
@ -162,14 +162,14 @@ struct ContextCovariate {
    }

    // 获取去除低质量分数碱基之后的read碱基序列（将低质量分数的碱基变成N）
-    static void GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail);
+    static void GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail);
    // Creates a int representation of a given dna string.
    static int KeyFromContext(const string& dna, const int start, const int end);
    // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
    static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);

    // 设置协变量的值
-    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };

 // Cycle协变量
@ -203,7 +203,7 @@ struct CycleCovariate {
 }

    // Computes the encoded value of CycleCovariate's key for the given position at the read.
-    static int CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle);
+    static int CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle);

-    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };
--- a/src/util/bam_wrap.h
+++ b/src/util/bam_wrap.h
@ -11,16 +11,61 @@
 #include <htslib/sam.h>
 #include <limits.h>
 #include <math.h>
+#include <stdlib.h>

 #include <map>
+#include <queue>
 #include <sstream>
 #include <string>
 #include <vector>

 using namespace std;

+struct Cigar {
+    char op = '0';
+    int len = 0;
+    // 该操作符是否消耗read的碱基
+    static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
+    // 该操作符是否消耗参考基因组的碱基
+    static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
+};
+
+struct ReadIdxCigar {
+    int readIdx = 0; // 在read序列中的位置
+    char cigarOp = '0'; // 当前位置对应的cigar
+};
+
+// 不用经常释放array的内存空间，减少频繁的内存开辟和释放操作
+template <class T>
+struct FastArray {
+    vector<T> arr;
+    size_t idx;
+    void clear() { idx = 0; }
+    size_t size() { return idx; }
+    void push_back(const T& val) {
+        if (idx < arr.size()) {
+            arr[idx++] = val;
+        } else {
+            arr.push_back(val);
+            idx++;
+        }
+    }
+    struct iterator {
+        typename std::vector<T>::iterator it;
+        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
+        iterator& operator++() { ++it; return *this;}
+        iterator& operator--() { --it; return *this;}
+        T& operator*() const { return *it; }
+        bool operator!=(const iterator& other) const { return it != other.it; }
+        bool operator==(const iterator& other) const { return it == other.it; }
+    };
+
+    iterator begin() { return arr.begin(); }
+    iterator end() { return arr.begin() + idx; }
+};
+
 // 对原始bam数据的补充，比如对两端进行hardclip等
-struct ReadAdditionData {
+struct SamData {
    int read_len = 0;          // read长度，各种clip之后的长度
    int cigar_start = 0;       // cigar起始位置，闭区间
    int cigar_end = 0;         // cigar结束位置，开区间
@ -29,7 +74,41 @@ struct ReadAdditionData {
    int left_clip = 0;         // 左侧被切掉的碱基长度
    int right_clip = 0;        // 右侧被切掉的碱基长度
    int ref_offset = 0;        // 切除adapter和softclip之后(softclip应该不影响)，相对原始ref比对位置（contig_pos）的偏移量
-    string bases;              // 处理之后的read的碱基
+
+    // 记录一下bqsr运算过程中用到的数据，回头提前计算一下，修正现在的复杂逻辑
+    static constexpr int READ_INDEX_NOT_FOUND = -1;
+        string bases;              // 处理之后的read的碱基
+        vector<uint8_t> quals;     // 对应的质量分数
+        int64_t start_pos;  // 因为soft clip都被切掉了，这里的softstart应该就是切掉之后的匹配位点，闭区间
+        int64_t end_pos;    // 同上，闭区间
+        FastArray<Cigar> cigars;
+        int64_t& softStart() { return start_pos; }
+        int64_t& softEnd() { return end_pos; }
+
+        // functions
+        ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
+            ReadIdxCigar rc;
+            if (refPos < start_pos)
+                return rc;
+            int firstReadPosOfElement = 0;         // inclusive
+            int firstRefPosOfElement = start_pos;  // inclusive
+            int lastReadPosOfElement = 0;          // exclusive
+            int lastRefPosOfElement = start_pos;   // exclusive
+            // advance forward through all the cigar elements until we bracket the reference coordinate
+            for (auto& cigar : cigars) {
+                firstReadPosOfElement = lastReadPosOfElement;
+                firstRefPosOfElement = lastRefPosOfElement;
+                lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
+                lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
+                if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) {  // refCoord falls within this cigar element
+                    int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
+                    rc.cigarOp = cigar.op;
+                    rc.readIdx = readPosAtRefCoord;
+                    return rc;
+                }
+            }
+            return rc;
+        }
 };

 /*
--- a/src/util/profiling.cpp
+++ b/src/util/profiling.cpp
@ -54,11 +54,11 @@ int DisplayProfiling(int nthread) {
    // PRINT_GP(sort_wait);
    // PRINT_GP(markdup_wait);
    // PRINT_GP(intersect_wait);
-    PRINT_GP(read);
-    PRINT_GP(gen);
-    PRINT_GP(sort);
-    PRINT_GP(markdup);
-    PRINT_GP(intersect);
+    PRINT_GP(read_ref);
+    PRINT_GP(read_vcf);
+    PRINT_GP(covariate);
+    //PRINT_GP(markdup);
+    //PRINT_GP(intersect);
    // PRINT_GP(merge_result);
    // PRINT_GP(sort_pair);
    // PRINT_GP(sort_frag);
@ -68,10 +68,10 @@ int DisplayProfiling(int nthread) {
    // PRINT_GP(merge_markdup);
    // PRINT_GP(merge_update);
    // PRINT_GP(merge_add);
-    PRINT_GP(markdup_all);
+    //PRINT_GP(markdup_all);
    // PRINT_GP(final_read);
-    PRINT_GP(write);
-    PRINT_GP(whole_process);
+    //PRINT_GP(write);
+    // PRINT_GP(whole_process);
    
    // PRINT_TP(gen, nthread);
    // PRINT_TP(sort_frag, nthread);
--- a/src/util/profiling.h
+++ b/src/util/profiling.h
@ -40,6 +40,9 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
 enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
 enum {
    GP_read_wait = 11,
+    GP_covariate,
+    GP_read_ref,
+    GP_read_vcf,
    GP_gen_wait,
    GP_sort_wait,
    GP_markdup_wait,