FastBQSR/src/bqsr/covariate.h

/*
     Description: 在bqsr过程中，计算协变量相关的类和方法

     Copyright : All right reserved by ICT

     Author : Zhang Zhonghai
     Date : 2025/12/08
*/

#pragma once

#include <spdlog/spdlog.h>

#include <cstdint>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>

#include "bqsr_args.h"
#include "util/bam_wrap.h"
#include "util/sam_data.h"

using std::map;
using std::string;
using std::vector;

// 协变量的值, 4个协变量
struct CovariateValues {
    int readGroup = 0;
    int baseQuality = 0;
    int context = -1;
    int cycle = -1;
    void clear() {
        readGroup = 0;
        baseQuality = 0;
        context = -1;
        cycle = -1;
    }
};

/**
 * This is where we store the per-read covariates, also indexed by (event type) and (read position).
 * Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }.
 * For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle).
 */
// 三维数组，第一维是event type，第二维是read position，第三维是协变量数组(结构体)(base quality, context, cycle)
typedef vector<vector<CovariateValues>> PerReadCovariateMatrix;

// 变异类型（snp, insert, deletion）
struct EventTypeValue {
    int index; // 在协变量数组中对应的索引
    char representation;
    string longRepresentation;
    bool operator==(const EventTypeValue& a) const { return a.index == index; }
};

struct EventType {
    static constexpr int EVENT_SIZE = 3;
    static EventTypeValue BASE_SUBSTITUTION;
    static EventTypeValue BASE_INSERTION;
    static EventTypeValue BASE_DELETION;
    static vector<EventTypeValue> EVENTS;
};

// Read group协变量
struct ReadGroupCovariate {
    static constexpr int index = 0;      // 在协变量数组中的索引位置
    static map<string, int> RgToId;  // read group name到id的映射
    static map<int, string> IdToRg;  // id到read group name的映射

    static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};

// Base quality协变量
struct BaseQualityCovariate {
    static constexpr int index = 1;  // 在协变量数组中的索引位置
    static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};

// Context协变量
struct ContextCovariate {
    static constexpr int index = 2;  // 在协变量数组中的索引位置
    static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1;
    static constexpr int LENGTH_BITS = 4;
    static constexpr int LENGTH_MASK = 15;

    // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are
    // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base.
    static constexpr int MAX_DNA_CONTEXT = 13;

    static int mismatchesContextSize;
    static int indelsContextSize;
    static int mismatchesKeyMask;
    static int indelsKeyMask;
    static uint8_t lowQualTail;

    static int baseIndexMap[256];

    static void InitContextCovariate(BQSRArg& p) {
        mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE;
        indelsContextSize = p.INDELS_CONTEXT_SIZE;
        if (mismatchesContextSize > MAX_DNA_CONTEXT) {
            spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize);
            exit(1);
        }
        if (indelsContextSize > MAX_DNA_CONTEXT) {
            spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize);
            exit(1);
        }
        lowQualTail = p.LOW_QUAL_TAIL;
        if (mismatchesContextSize <= 0 || indelsContextSize <= 0) {
            spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize);
            exit(1);
        }
        mismatchesKeyMask = CreateMask(mismatchesContextSize);
        indelsKeyMask = CreateMask(indelsContextSize);

        // init baseIndexMap
        for (int i = 0; i < 256; ++i) {
            baseIndexMap[i] = -1;
        }
        baseIndexMap['A'] = 0;
        baseIndexMap['a'] = 0;
        baseIndexMap['*'] = 0;
        baseIndexMap['C'] = 1;
        baseIndexMap['c'] = 1;
        baseIndexMap['G'] = 2;
        baseIndexMap['g'] = 2;
        baseIndexMap['T'] = 3;
        baseIndexMap['t'] = 3;
    }

    static int MaximumKeyValue() {
        int length = max(mismatchesContextSize, indelsContextSize);
        int key = length;
        int bitOffset = LENGTH_BITS;
        for (int i = 0; i < length; ++i) {
            key |= (3 << bitOffset);
            bitOffset += 2;
        }
        return key;
    }

    static int CreateMask(int contextSize) {
        int mask = 0;
        // create 2*contextSize worth of bits
        for (int i = 0; i < contextSize; i++) {
            mask = (mask << 2) | 3;
        }
        // shift 4 bits to mask out the bits used to encode the length
        return mask << LENGTH_BITS;
    }

    /**
     * Helper method: computes the correct offset to use in computations of covariate values.
     * @param isNegativeStrand is the read on the negative strand
     * @param offset 0-based index of the base in the read
     * @param readLength length of the read
     * @return
     */
    static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) {
        return isNegativeStrand ? (readLength - offset - 1) : offset;
    }

    static char baseIndexToSimpleBase(const int baseIndex) {
        switch (baseIndex) {
        case 0:
            return 'A';
        case 1:
            return 'C';
        case 2:
            return 'G';
        case 3:
            return 'T';
        default:
            return '.';
        }
    }
    /**
     * Converts a key into the dna string representation.
     *
     * @param key    the key representing the dna sequence
     * @return the dna sequence represented by the key
     */
    static string ContextFromKey(const int key) {
        int length = key & LENGTH_MASK;  // the first bits represent the length (in bp) of the context
        int mask = 48;                         // use the mask to pull out bases
        int offset = LENGTH_BITS;

        string dna;
        for (int i = 0; i < length; i++) {
            int baseIndex = (key & mask) >> offset;
            dna.push_back(baseIndexToSimpleBase(baseIndex));
            mask <<= 2;  // move the mask over to the next 2 bits
            offset += 2;
        }
        return dna;
    }

    // 获取去除低质量分数碱基之后的read碱基序列（将低质量分数的碱基变成N）
    static void GetStrandedClippedBytes(SamData& ad, StableArray<char>& clippedBases, uint8_t lowQTail);
    // Creates a int representation of a given dna string.
    static int KeyFromContext(const StableArray<char>& dna, const int start, const int end);
    // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
    static void GetReadContextAtEachPosition(const StableArray<char>& bases, const int contextSize, const int mask, StableArray<int>& keys);

    // 设置协变量的值
    static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};

// Cycle协变量
struct CycleCovariate {
    static constexpr int index = 3;  // 在协变量数组中的索引位置
    static int MAXIMUM_CYCLE_VALUE;
    static constexpr int CUSHION_FOR_INDELS = 4;

    static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; }

    static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }

    /**
     * Encodes the cycle number as a key.
     */
    static int KeyFromCycle(const int cycle, const int maxCycle) {
        // no negative values because values must fit into the first few bits of the long
        int result = std::abs(cycle);
        if (result > maxCycle) {
            spdlog::error(
                "The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected.  Please use the --maximum-cycle-value argument "
                "(when creating the recalibration table in "
                "BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)",
                maxCycle, result);
            exit(1);
        }

        result <<= 1;  // shift so we can add the "sign" bit
        if (cycle < 0) {
            result++;  // negative cycles get the lower-most bit set
        }
        return result;
    }

    /**
     * Decodes the cycle number from the key.
     */
    static int CycleFromKey(const int key) {
        int cycle = key >> 1;  // shift so we can remove the "sign" bit
        if ((key & 1) != 0) {  // is the last bit set?
            cycle *= -1;       // then the cycle is negative
        }
        return cycle;
    }

    // Computes the encoded value of CycleCovariate's key for the given position at the read.
    static int CycleKey(SamData& ad, const int baseNumber, const bool indel, const int maxCycle);

    static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};

// 协变量相关的工具类
struct CovariateUtils {
    static constexpr int MAX_READ_LENGTH = 300;  // 最大read长度
    static constexpr int NUM_COVARIATES = 4;

    // 初始化PerReadCovariateMatrix
    static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) {
        matrix.resize(EventType::EVENT_SIZE);
        for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) {
            matrix[event_type].resize(MAX_READ_LENGTH);
        }
    }

    // 设置协变量
    static inline void SetReadGroup(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
        matrix[EventType::BASE_SUBSTITUTION.index][readOffset].readGroup = mismatch;
        matrix[EventType::BASE_INSERTION.index][readOffset].readGroup = insertion;
        matrix[EventType::BASE_DELETION.index][readOffset].readGroup = deletion;
    }

    static inline void SetBaseQual(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
        matrix[EventType::BASE_SUBSTITUTION.index][readOffset].baseQuality = mismatch;
        matrix[EventType::BASE_INSERTION.index][readOffset].baseQuality = insertion;
        matrix[EventType::BASE_DELETION.index][readOffset].baseQuality = deletion;
    }

    static inline void SetContext(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
        matrix[EventType::BASE_SUBSTITUTION.index][readOffset].context = mismatch;
        matrix[EventType::BASE_INSERTION.index][readOffset].context = insertion;
        matrix[EventType::BASE_DELETION.index][readOffset].context = deletion;
    }

    static inline void SetCycle(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
        matrix[EventType::BASE_SUBSTITUTION.index][readOffset].cycle = mismatch;
        matrix[EventType::BASE_INSERTION.index][readOffset].cycle = insertion;
        matrix[EventType::BASE_DELETION.index][readOffset].cycle = deletion;
    }

    static inline void SetAllCovs(int m1, int i1, int d1, int m2, int i2, int d2, int m3, int i3, int d3, int m4, int i4, int d4, int readOffset,
                                  PerReadCovariateMatrix& matrix) {
        CovariateValues *cvp = &matrix[EventType::BASE_SUBSTITUTION.index][readOffset];
        cvp->readGroup = m1;
        cvp->baseQuality = m2;
        cvp->context = m3;
        cvp->cycle = m4;
        cvp = &matrix[EventType::BASE_INSERTION.index][readOffset];
        cvp->readGroup = i1;
        cvp->baseQuality = i2;
        cvp->context = i3;
        cvp->cycle = i4;
        cvp = &matrix[EventType::BASE_DELETION.index][readOffset];
        cvp->readGroup = d1;
        cvp->baseQuality = d2;
        cvp->context = d3;
        cvp->cycle = d4;
    }

    // 对一条read计算协变量（该协变量被上一个read用过）
    static void ComputeCovariates(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues, int thid);
};