串行版本还差最后一步，将信息合并到数据汇总表中

2025-12-24 12:47:26 +08:00 · 2025-12-24 12:47:26 +08:00 · 25f079b936
parent 1e9b58fac1
commit 25f079b936
9 changed files with 274 additions and 42 deletions
--- a/src/bqsr/baq.cpp
+++ b/src/bqsr/baq.cpp
@ -6,7 +6,7 @@ vector<double> BAQ::qual2prob(256); // 质量分数转化概率
 vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1)));  // [ref][read][qual]
 // 计算baq数组，返回成功与否
-bool BAQ::calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray) {
+bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
    // 检测ref是否覆盖了read
    if (ref.size() < refOffset + ad.read_len) {
        spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
--- a/src/bqsr/baq.h
+++ b/src/bqsr/baq.h
@ -84,5 +84,5 @@ struct BAQ {
    double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
    // 计算baq数组，返回成功与否
-    bool calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray);
+    bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray);
 };
--- a/src/bqsr/bqsr_entry.cpp
+++ b/src/bqsr/bqsr_entry.cpp
@ -36,6 +36,9 @@ Date : 2023/10/23
 using std::deque;
 #define BAM_BLOCK_SIZE 16L * 1024 * 1024
 #define MAX_SITES_INTERVAL 100000
 const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
 const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
@ -210,7 +213,7 @@ PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStar
 }
 // 根据adapter位置，对read进行hardclip，返回左侧或右侧减掉的base数量
-void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAdditionData &ad) {
+void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &ad) {
    int start, stop;
    // Determine the read coordinate to start and stop hard clipping
    if (refStart < 0) {
@ -235,7 +238,7 @@ void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAddi
 }
 // 计算切掉adapter之后，ref相对原始ref的偏移量
-void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
+void calculateRefOffset(BamWrap *bw, SamData &ad) {
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int i = 0;
@ -253,16 +256,68 @@ void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
 }
 // 计算clip处理之后，剩余的碱基
-void calculateReadBases(BamWrap* bw, ReadAdditionData& ad) {
+void calculateReadBases(BamWrap* bw, SamData& ad) {
    ad.bases.resize(ad.read_len);
    ad.quals.resize(ad.read_len);
    uint8_t* seq = bam_get_seq(bw->b);
    uint8_t* quals = bam_get_qual(bw->b);
    for (int i = 0; i < ad.read_len; ++i) {
        ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
        ad.quals[i] = quals[i];
    }
 }
 // 计算read两端clip之后的softstart和softend
 void calculateSoftStartEnd(BamWrap* bw, SamData& ad) {
    int64_t softStart = bw->b->core.pos + ad.ref_offset;
    int64_t softEnd = softStart - 1; // 闭区间
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int cigar_start = ad.cigar_start;
    int cigar_end = ad.cigar_end;
    bool rightTail = false;
    for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
        const char c = bam_cigar_opchr(cigar[i]);
        int len = bam_cigar_oplen(cigar[i]);
        if (i == ad.cigar_start) len -= ad.first_cigar_clip;
        if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
        // if (c == 'S' || c == 'I')
        if (c == 'S')
            softStart -= len;
        else if (c != 'H')
            rightTail = true;
        if (rightTail) {
            if (consumeRefBases(c) || c == 'S')
                softEnd += len;
        }
    }
    ad.softStart() = softStart;
    ad.softEnd() = softEnd;
 }
 // 计算clip之后的cigar
 void calculateCigar(BamWrap* bw, SamData& ad) {
    ad.cigars.clear();
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int cigar_start = ad.cigar_start;
    int cigar_end = ad.cigar_end;
    for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
        const char c = bam_cigar_opchr(cigar[i]);
        int len = bam_cigar_oplen(cigar[i]);
        if (i == ad.cigar_start)
            len -= ad.first_cigar_clip;
        if (i == ad.cigar_end - 1)
            len -= ad.last_cigar_clip;
        ad.cigars.push_back({c, len});
    }
    //for(auto &cigar : ad.cigars) {
    //    spdlog::info("op: {}, len: {}", cigar.op, cigar.len);
    //}
 }
 // 计算read两端softclip的碱基数量，可能会修改ad里的clip值
-void calculateSoftClip(BamWrap *bw, ReadAdditionData &ad) {
+void calculateSoftClip(BamWrap *bw, SamData &ad) {
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const bam1_core_t& bc = bw->b->core;
    int readIndex = ad.left_clip;
@ -324,10 +379,12 @@ inline void updateIndel(vector<int> &isIndel, int index) {
 }
 // 计算该read的每个碱基位置是否是SNP或Indel
-int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
+int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
    // 1. 读取参考基因组，先看看串行运行性能，稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
    Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间
    PROF_START(ref);
    read_ref_base(aux, interval.left, interval);
    PROF_END(gprof[GP_read_ref], ref);
    string refBases(aux.ref_seq);
    // spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
@ -381,13 +438,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector
 }
 // 简单计算baq数组，就是全部赋值为'@' (64)
-bool flatBAQArray(BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) { 
+bool flatBAQArray(BamWrap* bw, SamData& ad, vector<int>& baqArray) { 
    baqArray.resize(ad.read_len, (int)'@'); 
    return true;
 }
 // 计算真实的baq数组，耗时更多，好像enable-baq参数默认是关闭的，那就先不实现这个了
-bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) {
+bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, SamData& ad, vector<int>& baqArray) {
    baqArray.resize(ad.read_len, 0);
    return true;
 }
@ -404,7 +461,7 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
 }
 // 计算与read有交叉的已知位点信息， 应该要判断一下，是按照read的范围去读取vcf，还是按照一个batch read的范围去读取
-void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &vcfs) {
+void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> knownSites) {
    int tid = bw->contig_id();
    uint64_t startPos = bw->start_pos();   // 闭区间
    uint64_t endPos = bw->end_pos(); // 闭区间
@ -427,6 +484,8 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
        //spdlog::info("before intervals : {}", vcf.knownSites.size());
        // 读取新的interval
        int64_t fpos, flen;
        endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
        Interval readIntv(startPos, endPos);
        vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
        //spdlog::info("file index: {}, {}", fpos, flen);
        if (flen > 0) {
@ -448,7 +507,9 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
                string id, ref;
                ss_line >> stid >> pos >> id >> ref;
                tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str());
-                if (tid >= 0 && pos > 0) {
+                int64_t varStart = BamWrap::bam_global_pos(tid, pos);
                Interval varIntv(varStart, varStart + ref.size() - 1);
                if (readIntv.overlaps(varIntv)) {
                    vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
                    //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
                }
@ -461,6 +522,74 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
        //}
    }
    //exit(0);
    knownSites.resize(ad.read_len);
    endPos = bw->end_pos();
    for(auto &vcf : vcfs) {
        for (auto &intv : vcf.knownSites) {
            // knownSite is outside clipping window for the read, ignore
            if (intv.right < ad.softStart())
                continue;
            if (intv.left > ad.softEnd())
                break;
            // 计算对应的位点
            ReadIdxCigar rcStart = ad.getReadIndexForReferenceCoordinate(intv.left);
            int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx;
            if (rcStart.cigarOp == 'D') {
                --featureStartOnRead;
            }
            ReadIdxCigar rcEnd = ad.getReadIndexForReferenceCoordinate(intv.right);
            int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? ad.read_len : rcEnd.readIdx;
            if (featureStartOnRead > ad.read_len) {
                featureStartOnRead = featureEndOnRead = ad.read_len;
            }
            for (int i = max(0, featureStartOnRead); i < min(ad.read_len, featureEndOnRead + 1); ++i) {
                knownSites[i] = true;
            }
        }
    }
 }
 // 应该是计算一段数据的平均值
 static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<int>& errorArr, vector<double>& fracErrs) {
    int totalErrors = 0;
    for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
        totalErrors += errorArr[j];
    }
    for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
        fracErrs[j] = ((double)totalErrors) / ((double)(i - max(0, blockStartIndex - 1) + 1));
    }
 }
 // 应该是用来处理BAQ的，把不等于特定BAQ分数的碱基作为一段数据统一处理
 void calculateFractionalErrorArray(vector<int>& errorArr, vector<int>& baqArr, vector<double>& fracErrs) {
    fracErrs.resize(baqArr.size());
    // errorArray和baqArray必须长度相同
    const int BLOCK_START_UNSET = -1;
    bool inBlock = false;
    int blockStartIndex = BLOCK_START_UNSET;
    int i;
    for (i = 0; i < fracErrs.size(); ++i) {
        if (baqArr[i] == NO_BAQ_UNCERTAINTY) {  // 不等于NO_BAQ_UNCERTAINTY的碱基当成一段数据来处理
            if (!inBlock) {
                fracErrs[i] = errorArr[i];
            } else {
                calculateAndStoreErrorsInBlock(i, blockStartIndex, errorArr, fracErrs);
                inBlock = false;  // reset state variables
                blockStartIndex = BLOCK_START_UNSET;  // reset state variables
            }
        } else {
            inBlock = true;
            if (blockStartIndex == BLOCK_START_UNSET) {
                blockStartIndex = i;
            }
        }
    }
    // 处理最后一个block
    if (inBlock) {
        calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs);
    }
 }
 // 串行bqsr
@ -513,7 +642,7 @@ int SerialBQSR() {
            // 3. 如果bam文件之前做过bqsr，tag中包含OQ（originnal quality，原始质量分数），检查用户参数里是否指定用原始质量分数进行bqsr，如果是则将质量分数替换为OQ，否则忽略OQ，先忽略
            // 4. 对read的两端进行检测，去除（hardclip）adapter
            BamWrap *bw = bams[i];
-            ReadAdditionData ad;
+            SamData ad;
            ad.read_len = BamWrap::BamEffectiveLength(bw->b);
            ad.cigar_end = bw->b->core.n_cigar;
            if (ad.read_len <= 0) continue;
@ -535,6 +664,9 @@ int SerialBQSR() {
            calculateRefOffset(bw, ad); // 计算ref_offset，就是相对比对的position，要将ref右移多少
            calculateReadBases(bw, ad);  // 计算clip处理之后，剩余的碱基
            // 计算clip之后，两端的softstart和softend
            calculateSoftStartEnd(bw, ad);
            calculateCigar(bw, ad);
            //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
            //             ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str());
@ -560,7 +692,9 @@ int SerialBQSR() {
            // 到这里，基本的数据都准备好了，后续就是进行bqsr的统计了
            // 8. 计算这条read对应的协变量
            PROF_START(covariate);
            CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
            PROF_END(gprof[GP_covariate], covariate);
            test = readCovariates[1][0][0] + readCovariates[2][1][3];
            int end_pos = bw->contig_end_pos();
            //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
@ -568,7 +702,23 @@ int SerialBQSR() {
            // 9. 计算这条read需要跳过的位置
            vector<bool> skip(ad.read_len, 0);
-            calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr);
+            PROF_START(known_sites);
            calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr, skip);
            for (int ii = 0; ii < ad.read_len; ++ii) {
                skip[ii] =
                    skip[ii] || (ContextCovariate::baseIndexMap[ad.bases[ii]] == -1) || ad.quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
            }
            PROF_END(gprof[GP_read_vcf], known_sites);
            // 10. 根据BAQ进一步处理snp，indel，得到处理后的数据
            vector<double> snpErrors, insErrors, delErrors;
            calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
            calculateFractionalErrorArray(isIns, baqArray, insErrors);
            calculateFractionalErrorArray(isDel, baqArray, delErrors);
            // aggregate all of the info into our info object, and update the data
            // 11. 合并之前计算的数据，得到info，并更新bqsr table数据
        }
 #if 0
--- a/src/bqsr/covariate.cpp
+++ b/src/bqsr/covariate.cpp
@ -22,16 +22,16 @@ int CycleCovariate::MAXIMUM_CYCLE_VALUE;
 // for CovariateUtils
 // 对一条read计算协变量（该协变量被上一个read用过）
-void CovariateUtils::ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
+void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                            bool recordIndelValues) {
-    //ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
-    //BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
-    //ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
-    //CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
 }
 // ReadGroupCovariate 协变量的方法
-void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
+void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    uint8_t *rgStr = bam_aux_get(bw->b, "RG");
    char* rgVal = nullptr;
    if (rgStr) rgVal = bam_aux2Z(rgStr);
@ -47,7 +47,7 @@ void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr
 }
 // BaseQualityCovariate 协变量的方法
-void BaseQualityCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
+void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                        bool recordIndelValues) {
    // 在前面的处理过后，quals应该和base长度一致了
 #define __bq_set_cov(ins, del)                                                                                           \
@ -98,7 +98,7 @@ static char SimpleComplement(const char base) {
 }
 // 获取去除低质量分数碱基之后的read碱基序列（将低质量分数的碱基变成N）
-void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail) {
+void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
    uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
    if (bw->GetReadNegativeStrandFlag()) {  // 反向互补
@ -218,7 +218,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const i
    }
 }
-void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
+void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int originalReadLength = ad.read_len;
    // store the original bases and then write Ns over low quality ones
@ -270,7 +270,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t
 * @param maxCycle max value of the base to compute the key for
 *                 (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
 */
-int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle) {
+int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
    const bool isNegStrand = bw->GetReadNegativeStrandFlag();
    const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
    const int readLength = ad.read_len;
@ -299,7 +299,7 @@ int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNu
 }
 // Used to pick out the covariate's value from attributes of the read
-void CycleCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
+void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int readLength = ad.read_len;
    // Note: duplicate the loop to void checking recordIndelValues on every iteration
    if (recordIndelValues) {
--- a/src/bqsr/covariate.h
+++ b/src/bqsr/covariate.h
@ -69,7 +69,7 @@ struct CovariateUtils {
    }
    // 对一条read计算协变量（该协变量被上一个read用过）
-    static void ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };
 // Read group协变量
@ -78,13 +78,13 @@ struct ReadGroupCovariate {
    static map<string, int> RgToId;  // read group name到id的映射
    static map<int, string> IdToRg;  // id到read group name的映射
-    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };
 // Base quality协变量
 struct BaseQualityCovariate {
    static constexpr int index = 1;  // 在协变量数组中的索引位置
-    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };
 // Context协变量
@ -162,14 +162,14 @@ struct ContextCovariate {
    }
    // 获取去除低质量分数碱基之后的read碱基序列（将低质量分数的碱基变成N）
-    static void GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail);
+    static void GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail);
    // Creates a int representation of a given dna string.
    static int KeyFromContext(const string& dna, const int start, const int end);
    // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
    static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
    // 设置协变量的值
-    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };
 // Cycle协变量
@ -203,7 +203,7 @@ struct CycleCovariate {
 }
    // Computes the encoded value of CycleCovariate's key for the given position at the read.
-    static int CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle);
+    static int CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle);
-    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+    static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
 };
--- a/src/util/bam_wrap.h
+++ b/src/util/bam_wrap.h
@ -11,16 +11,61 @@
 #include <htslib/sam.h>
 #include <limits.h>
 #include <math.h>
 #include <stdlib.h>
 #include <map>
 #include <queue>
 #include <sstream>
 #include <string>
 #include <vector>
 using namespace std;
 struct Cigar {
    char op = '0';
    int len = 0;
    // 该操作符是否消耗read的碱基
    static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
    // 该操作符是否消耗参考基因组的碱基
    static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
 };
 struct ReadIdxCigar {
    int readIdx = 0; // 在read序列中的位置
    char cigarOp = '0'; // 当前位置对应的cigar
 };
 // 不用经常释放array的内存空间，减少频繁的内存开辟和释放操作
 template <class T>
 struct FastArray {
    vector<T> arr;
    size_t idx;
    void clear() { idx = 0; }
    size_t size() { return idx; }
    void push_back(const T& val) {
        if (idx < arr.size()) {
            arr[idx++] = val;
        } else {
            arr.push_back(val);
            idx++;
        }
    }
    struct iterator {
        typename std::vector<T>::iterator it;
        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
        iterator& operator++() { ++it; return *this;}
        iterator& operator--() { --it; return *this;}
        T& operator*() const { return *it; }
        bool operator!=(const iterator& other) const { return it != other.it; }
        bool operator==(const iterator& other) const { return it == other.it; }
    };
    iterator begin() { return arr.begin(); }
    iterator end() { return arr.begin() + idx; }
 };
 // 对原始bam数据的补充，比如对两端进行hardclip等
-struct ReadAdditionData {
+struct SamData {
    int read_len = 0;          // read长度，各种clip之后的长度
    int cigar_start = 0;       // cigar起始位置，闭区间
    int cigar_end = 0;         // cigar结束位置，开区间
@ -29,7 +74,41 @@ struct ReadAdditionData {
    int left_clip = 0;         // 左侧被切掉的碱基长度
    int right_clip = 0;        // 右侧被切掉的碱基长度
    int ref_offset = 0;        // 切除adapter和softclip之后(softclip应该不影响)，相对原始ref比对位置（contig_pos）的偏移量
-    string bases;              // 处理之后的read的碱基
+
    // 记录一下bqsr运算过程中用到的数据，回头提前计算一下，修正现在的复杂逻辑
    static constexpr int READ_INDEX_NOT_FOUND = -1;
        string bases;              // 处理之后的read的碱基
        vector<uint8_t> quals;     // 对应的质量分数
        int64_t start_pos;  // 因为soft clip都被切掉了，这里的softstart应该就是切掉之后的匹配位点，闭区间
        int64_t end_pos;    // 同上，闭区间
        FastArray<Cigar> cigars;
        int64_t& softStart() { return start_pos; }
        int64_t& softEnd() { return end_pos; }
        // functions
        ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
            ReadIdxCigar rc;
            if (refPos < start_pos)
                return rc;
            int firstReadPosOfElement = 0;         // inclusive
            int firstRefPosOfElement = start_pos;  // inclusive
            int lastReadPosOfElement = 0;          // exclusive
            int lastRefPosOfElement = start_pos;   // exclusive
            // advance forward through all the cigar elements until we bracket the reference coordinate
            for (auto& cigar : cigars) {
                firstReadPosOfElement = lastReadPosOfElement;
                firstRefPosOfElement = lastRefPosOfElement;
                lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
                lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
                if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) {  // refCoord falls within this cigar element
                    int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
                    rc.cigarOp = cigar.op;
                    rc.readIdx = readPosAtRefCoord;
                    return rc;
                }
            }
            return rc;
        }
 };
 /*
--- a/src/util/profiling.cpp
+++ b/src/util/profiling.cpp
@ -54,11 +54,11 @@ int DisplayProfiling(int nthread) {
    // PRINT_GP(sort_wait);
    // PRINT_GP(markdup_wait);
    // PRINT_GP(intersect_wait);
-    PRINT_GP(read);
+    PRINT_GP(read_ref);
-    PRINT_GP(gen);
+    PRINT_GP(read_vcf);
-    PRINT_GP(sort);
+    PRINT_GP(covariate);
-    PRINT_GP(markdup);
+    //PRINT_GP(markdup);
-    PRINT_GP(intersect);
+    //PRINT_GP(intersect);
    // PRINT_GP(merge_result);
    // PRINT_GP(sort_pair);
    // PRINT_GP(sort_frag);
@ -68,10 +68,10 @@ int DisplayProfiling(int nthread) {
    // PRINT_GP(merge_markdup);
    // PRINT_GP(merge_update);
    // PRINT_GP(merge_add);
-    PRINT_GP(markdup_all);
+    //PRINT_GP(markdup_all);
    // PRINT_GP(final_read);
-    PRINT_GP(write);
+    //PRINT_GP(write);
-    PRINT_GP(whole_process);
+    // PRINT_GP(whole_process);
    // PRINT_TP(gen, nthread);
    // PRINT_TP(sort_frag, nthread);
--- a/src/util/profiling.h
+++ b/src/util/profiling.h
@ -40,6 +40,9 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
 enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
 enum {
    GP_read_wait = 11,
    GP_covariate,
    GP_read_ref,
    GP_read_vcf,
    GP_gen_wait,
    GP_sort_wait,
    GP_markdup_wait,