调整了计算read group的方式，当read group只有一个时，不需要计算

2026-01-01 09:40:46 +08:00 · 2026-01-01 09:40:46 +08:00 · 985875ebac
parent 745963831d
commit 985875ebac
4 changed files with 20 additions and 20 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -5,5 +5,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
 # set(CMAKE_BUILD_TYPE Debug)
 # set(CMAKE_BUILD_TYPE Release)
-# add_definitions(-DSHOW_PERF=1)
+add_definitions(-DSHOW_PERF)
+#add_definitions(-DSHOW_PERF=1)
 add_subdirectory(src)
--- a/src/bqsr/bqsr_args.h
+++ b/src/bqsr/bqsr_args.h
@ -28,7 +28,8 @@ struct BQSRArg {

    int NUM_THREADS = 1;

-    size_t MAX_MEM = ((size_t)1) << 30;  // 1G
+    size_t MAX_MEM = ((size_t)1 << 30);
+   // + ((size_t)1 << 29);  // 1G 应该按照每个线程多少内存来设置

    bool DUPLEX_IO = true;  //

--- a/src/bqsr/covariate.cpp
+++ b/src/bqsr/covariate.cpp
@ -121,18 +121,21 @@ void CovariateUtils::ComputeCovariates(SamData& sd, sam_hdr_t* header, PerReadCo

 // ReadGroupCovariate 协变量的方法
 void ReadGroupCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
-    uint8_t *rgStr = bam_aux_get(sd.bw->b, "RG");
-    char* rgVal = nullptr;
-    if (rgStr) rgVal = bam_aux2Z(rgStr);
    int key = 0;
-    if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) {
-        spdlog::error("The RG tag value for read can not be found in header!");
-    } else {
-        key = RgToId[rgVal];
-    }
-    for (int i = 0; i < sd.read_len; ++i) {
-        CovariateUtils::SetReadGroup(key, key, key, i, values);
-    }
+    if (RgToId.size() > 1) { // 只有当bam文件有多个read group时，再读取每个read的RG tag
+        uint8_t* rgStr = bam_aux_get(sd.bw->b, "RG");
+        char* rgVal = nullptr;
+        if (rgStr)
+            rgVal = bam_aux2Z(rgStr);
+        if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) {
+            spdlog::error("The RG tag value for read can not be found in header!");
+        } else {
+            key = RgToId[rgVal];
+            for (int i = 0; i < sd.read_len; ++i) {
+                CovariateUtils::SetReadGroup(key, key, key, i, values);
+            }
+        }
+    } // 因为read group默认就是0,所以只有一个read group时，不需要设置
 }

 // BaseQualityCovariate 协变量的方法
@ -332,11 +335,9 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
    const int originalReadLength = sd.read_len;

    // store the original bases and then write Ns over low quality ones
-    // string strandedClippedBases(sd.bases.arr.data(), sd.read_len);
    auto &strandedClippedBases = sd.strandedClippedBases;
    strandedClippedBases.copy(sd.bases);
-    // GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值
-    GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail);  // 命名我之前看到过这个30的？
+    GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail);  // 明明我之前看到过这个30的？
    // spdlog::info("bases: {}", strandedClippedBases);
    auto& nBasePairContextAtEachCycle = sd.nBasePairContextAtEachCycle;
    GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle);
@ -346,6 +347,7 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
    // this is necessary to ensure that we don't keep historical data in the ReadCovariates values
    // since the context covariate may not span the entire set of values in read covariates
    // due to the clipping of the low quality bases
+    // 这段代码应该不会执行，因为clip with N不会改变read长度
    if (readLengthAfterClipping != originalReadLength) {
        // don't bother zeroing out if we are going to overwrite the whole array
        for (int i = 0; i < originalReadLength; i++) {
--- a/src/bqsr/recal_funcs.h
+++ b/src/bqsr/recal_funcs.h
@ -116,10 +116,6 @@ struct RecalFuncs {
        // int idx = 0;
        PROF_START(TP_read_vcf);
        for (auto& vcf : vcfs) {
-            // 为啥多线程环境会出现，deque的front和[0]不一样？好像是调试的时候的问题，实际运行时没再出现
-            // if (vcf.knownSites.front().left != vcf.knownSites[0].left || vcf.knownSites.front().right != vcf.knownSites[0].right)
-            //    spdlog::error("front is different with [0]: {} - {}", vcf.knownSites.front().left, vcf.knownSites[0].left);
-
            if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam，不需要读取
                // 清理旧的interval，只有此时才有清理的必要
                while (!vcf.knownSites.empty()) {