上一个commit为啥没有包含所有更改文件

在小数据集上结果与gatk一致（版本gatk-package-4.6.2.0-17）
2025-12-28 23:06:03 +08:00 · 2025-12-28 23:04:45 +08:00
8 changed files with 152 additions and 1124 deletions
--- a/src/bqsr/bqsr_entry.cpp
+++ b/src/bqsr/bqsr_entry.cpp
@ -38,6 +38,7 @@ Date : 2023/10/23
 #include "util/utils.h"
 #include "util/math/math_utils.h"
 #include "quant_info.h"
+#include "util/debug.h"

 using std::deque;

@ -270,7 +271,7 @@ void calculateReadBases(BamWrap* bw, SamData& ad) {
    uint8_t* quals = bam_get_qual(bw->b);
    for (int i = 0; i < ad.read_len; ++i) {
        ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
-        ad.base_quals[i] = quals[i];
+        ad.base_quals[i] = quals[i + ad.left_clip];
    }
 }

@ -470,7 +471,7 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
 }

 // 计算与read有交叉的已知位点信息， 应该要判断一下，是按照read的范围去读取vcf，还是按照一个batch read的范围去读取
-void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> knownSites) {
+void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> &knownSites) {
    int tid = bw->contig_id();
    uint64_t startPos = bw->start_pos();   // 闭区间
    uint64_t endPos = bw->end_pos(); // 闭区间
@ -519,7 +520,7 @@ void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vect
                int64_t varStart = BamWrap::bam_global_pos(tid, pos);
                Interval varIntv(varStart, varStart + ref.size() - 1);
                if (readIntv.overlaps(varIntv)) {
-                    vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
+                    vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
                    //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
                }
                get_line_from_buf(buf, flen, &cur, &line);
@ -619,7 +620,8 @@ void updateRecalTablesForRead(ReadRecalInfo &info) {

    int readLength = read.read_len;
    for (int offset = 0; offset < readLength; ++offset) {
-        if (!info.skips[offset]) { // 不跳过当前位置
+        if (!info.skips[offset]) {
+        //if (true){ // 不跳过当前位置
            for (int idx = 0; idx < nsgv::gEventTypes.size(); ++idx) {
                // 获取四个值，readgroup / qualityscore / context / cycle
                EventTypeValue& event = nsgv::gEventTypes[idx];
@ -693,6 +695,7 @@ void roundTableValues(RecalTables& rt) {

 // 串行bqsr
 int SerialBQSR() {
+    open_debug_files();
    int round = 0;
    BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
    // inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
@ -754,6 +757,7 @@ int SerialBQSR() {
            // spdlog::info("bam idx: {}", i);
            BamWrap* bw = bams[i];
            SamData ad;
+            ad.rid = i;
            ad.bw = bw;
            ad.read_len = BamWrap::BamEffectiveLength(bw->b);
            ad.cigar_end = bw->b->core.n_cigar;
@ -788,6 +792,18 @@ int SerialBQSR() {
            vector<int> isIns(ad.read_len, 0); // 该位置是否是插入位置，0不是，1是
            vector<int> isDel(ad.read_len, 0); // 该位置是否是删除位置，0不是，1是
            const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel);
+
+            /*fprintf(gf[0], "%d\t", ad.read_len);
+            for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]);
+            fprintf(gf[0], "\n");
+            fprintf(gf[1], "%d\t", ad.read_len);
+            for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]);
+            fprintf(gf[1], "\n");
+            fprintf(gf[2], "%d\t", ad.read_len);
+            for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]);
+            fprintf(gf[2], "\n");
+            */
+
            // spdlog::info("nErrors: {}", nErrors);
            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }

@ -811,8 +827,19 @@ int SerialBQSR() {
            PROF_START(covariate);
            CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
            PROF_END(gprof[GP_covariate], covariate);
-            test = readCovariates[1][0][0] + readCovariates[2][1][3];
-            int end_pos = bw->contig_end_pos();
+
+            // fprintf(gf[3], "%ld %ld %d %ld\n", ad.rid, readCovariates.size(), ad.read_len, readCovariates[0][0].size());
+            // for (auto &arr1 : readCovariates) {
+            //     for (size_t si = 0; si < ad.read_len; ++si) {
+            //         for (auto &val : arr1[si]) {
+            //             fprintf(gf[3], "%d ", val);
+            //         }
+            //     }
+            // }
+            // fprintf(gf[3], "\n");
+
+            //test = readCovariates[1][0][0] + readCovariates[2][1][3];
+            //int end_pos = bw->contig_end_pos();
            //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
            //             bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
            // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); }
@ -825,6 +852,9 @@ int SerialBQSR() {
                            ad.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
            }
            PROF_END(gprof[GP_read_vcf], known_sites);
+            // fprintf(gf[0], "%d\t", ad.read_len);
+            // for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
+            // fprintf(gf[0], "\n");

            // 10. 根据BAQ进一步处理snp，indel，得到处理后的数据
            vector<double> snpErrors, insErrors, delErrors;
@ -911,6 +941,8 @@ int SerialBQSR() {
    }
    spdlog::info("read count: {}", readNumSum);

+    close_debug_files();
+
    return 0;
 }

--- a/src/bqsr/covariate.cpp
+++ b/src/bqsr/covariate.cpp
@ -98,6 +98,51 @@ static char SimpleComplement(const char base) {
    }
 }

+static void ClipLowQualEndsWithN(string& bases, const FastArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
+    // 处理左边
+    int left = 0;
+    int readLen = bases.size();
+    if (isNegativeStrand) {
+        for (; left < readLen; ++left) {
+            if (quals[readLen - 1 - left] <= lowQTail)
+                bases[left] = 'N';
+            else
+                break;
+        }
+    } else {
+        for (; left < readLen; ++left) {
+            if (quals[left] <= lowQTail)
+                bases[left] = 'N';
+            else
+                break;
+        }
+    }
+    if (left == readLen) {
+        bases.clear();
+        return;
+    }
+    // 处理右边
+    int right = (int)bases.size() - 1;
+    if (isNegativeStrand) {
+        for (; right >= 0; --right) {
+            if (quals[readLen - 1 - right] <= lowQTail)
+                bases[right] = 'N';
+            else
+                break;
+        }
+    } else {
+        for (; right >= 0; --right) {
+            if (quals[right] <= lowQTail)
+                bases[right] = 'N';
+            else
+                break;
+        }
+    }
+    if (right < left)
+        bases.clear();
+
+}
+
 // 获取去除低质量分数碱基之后的read碱基序列（将低质量分数的碱基变成N）
 void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
    uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
@ -106,28 +151,7 @@ void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string&
        for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]);
    }

-    // 处理左边
-    int left = 0;
-    for (; left < ad.read_len; ++left) {
-        if (quals[left] <= lowQTail)
-            clippedBases[left] = 'N';
-        else
-            break;
-    }
-    if (left == ad.read_len) {
-        clippedBases.clear();
-        return;
-    }
-    // 处理右边
-    int right = ad.read_len - 1;
-    for (; right >= 0; --right) {
-        if (quals[right] <= lowQTail)
-            clippedBases[right] = 'N';
-        else
-            break;
-    }
-    if (right < left)
-        clippedBases.clear();
+    ClipLowQualEndsWithN(clippedBases, ad.base_quals, lowQTail, bw->GetReadNegativeStrandFlag());
 }

 /**
@ -224,7 +248,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header,

    // store the original bases and then write Ns over low quality ones
    string strandedClippedBases(ad.bases);
-    GetStrandedClippedBytes(bw, ad, strandedClippedBases, lowQualTail);
+    GetStrandedClippedBytes(bw, ad, strandedClippedBases, 30); // 注意这里的lowQualTail数值
    // spdlog::info("bases: {}", strandedClippedBases);
    vector<int> nBasePairContextAtEachCycle;
    GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle);
--- a/src/bqsr/qual_utils.h
+++ b/src/bqsr/qual_utils.h
@ -225,6 +225,8 @@ struct QualityUtils {
    static uint8_t errorProbToQual(const double errorRate, const uint8_t maxQual) {
        // Utils.validateArg(MathUtils.isValidProbability(errorRate), ()->"errorRate must be good probability but got " + errorRate);
        const double d = std::round(-10.0 * std::log10(errorRate));
+        if (d > maxQual) // 注意当d超过int表示的最大值时(如inf)，转变成int会变成负数
+            return maxQual;
        return boundQual((int)d, maxQual);
    }

--- a/src/bqsr/recal_utils.h
+++ b/src/bqsr/recal_utils.h
@ -113,13 +113,13 @@ struct RecalUtils {
        table.addColumn({"Observations", "%d"});
        table.addColumn({"Errors", "%.2f"});

-        spdlog::info("rg0: {}, {}, {}, {}", r.readGroupTable[0][0].numObservations, r.readGroupTable[0][0].numMismatches,
-                     r.readGroupTable[0][0].reportedQuality, r.readGroupTable[0][0].empiricalQuality);
+        //spdlog::info("rg0: {}, {}, {}, {}", r.readGroupTable[0][0].numObservations, r.readGroupTable[0][0].numMismatches,
+        //             r.readGroupTable[0][0].reportedQuality, r.readGroupTable[0][0].empiricalQuality);
                     
        _Foreach2DK(r.readGroupTable, datum, {
            RecalDatum &dat = const_cast<RecalDatum&>(datum);
-            spdlog::info("errors: {}, {}", datum.numMismatches, dat.getNumMismatches());
-            spdlog::info("obs: {}, {}", datum.numObservations, dat.getNumObservations());
+            // spdlog::info("errors: {}, {}", datum.numMismatches, dat.getNumMismatches());
+            // spdlog::info("obs: {}, {}", datum.numObservations, dat.getNumObservations());
            if (dat.getNumObservations() > 0) {
                table.addRowData({
                    ReadGroupCovariate::IdToRg[k1],
--- a/src/util/bam_wrap.h
+++ b/src/util/bam_wrap.h
@ -31,7 +31,7 @@ struct Cigar {
 };

 struct ReadIdxCigar {
-    int readIdx = 0; // 在read序列中的位置
+    int readIdx = -1; // 在read序列中的位置
    char cigarOp = '0'; // 当前位置对应的cigar
 };

@ -57,6 +57,7 @@ struct FastArray {
        }
    }
    inline T& operator[](size_t pos) { return arr[pos]; }
+    inline const T& operator[](size_t pos) const { return arr[pos]; }
    struct iterator {
        typename std::vector<T>::iterator it;
        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
@ -74,6 +75,7 @@ struct FastArray {
 // 对原始bam数据的补充，比如对两端进行hardclip等
 class BamWrap;
 struct SamData {
+    int64_t rid = 0; // for debug
    int read_len = 0;          // read长度，各种clip之后的长度
    int cigar_start = 0;       // cigar起始位置，闭区间
    int cigar_end = 0;         // cigar结束位置，开区间
--- a/src/util/debug.cpp
+++ b/src/util/debug.cpp
@ -0,0 +1,31 @@
+/*********************************************************************************************
+    Description: data and files for debugging
+    Copyright : All right reserved by NCIC.ICT
+
+    Author : Zhang Zhonghai
+    Date : 2024/04/09
+***********************************************************************************************/
+#include "debug.h"
+
+#include <stdio.h>
+
+FILE* gf[4] = {0};
+
+int open_debug_files() {
+    char fn[1024] = {0};
+    int i = 0;
+    for (; i < 4; ++i) {
+        sprintf(fn, "./test/fp%d.txt", i);
+        gf[i] = fopen(fn, "w");
+    }
+    return 0;
+}
+
+int close_debug_files() {
+    int i = 0;
+    for (; i < 4; ++i) {
+        if (gf[i] != 0)
+            fclose(gf[i]);
+    }
+    return 0;
+}
--- a/src/util/debug.h
+++ b/src/util/debug.h
@ -0,0 +1,27 @@
+/*********************************************************************************************
+    Description: data and files for debugging
+    Copyright : All right reserved by NCIC.ICT
+
+    Author : Zhang Zhonghai
+    Date : 2024/04/09
+***********************************************************************************************/
+#include <stdio.h>
+#include <stdint.h>
+
+////////////////// for debug and test //////////////////////////
+
+#define DEBUG_FILE_OUTPUT // 打开gfp1-4文件，并记录debug信息
+// #define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度，以及最终扩展的长度
+// #define GET_FULL_MATCH_READ // 获取完全匹配的reads
+// #define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量
+// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的query，target，和其他用于计算的数据
+// #define GET_KSW_ALIGN_SEQ
+// #define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里
+
+////////////////////////////////////////////////////////////////
+
+extern FILE *gf[4];
+
+int open_debug_files();
+
+int close_debug_files();
--- a/test/stats.table
+++ b/test/stats.table
Author	SHA1	Message	Date
zzh	6662435948	上一个commit为啥没有包含所有更改文件	2025-12-28 23:06:03 +08:00
zzh	8e3388a494	在小数据集上结果与gatk一致（版本gatk-package-4.6.2.0-17）	2025-12-28 23:04:45 +08:00