/* Description: 在bqsr过程中,计算协变量相关的类和方法 Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2025/12/08 */ #pragma once #include #include #include #include #include #include #include "bqsr_args.h" #include "util/bam_wrap.h" #include "util/sam_data.h" using std::map; using std::string; using std::vector; // 协变量的值, 4个协变量 struct CovariateValues { int readGroup = 0; int baseQuality = 0; int context = -1; int cycle = -1; void clear() { readGroup = 0; baseQuality = 0; context = -1; cycle = -1; } }; /** * This is where we store the per-read covariates, also indexed by (event type) and (read position). * Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }. * For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle). */ // 三维数组,第一维是event type,第二维是read position,第三维是协变量数组(结构体)(base quality, context, cycle) typedef vector> PerReadCovariateMatrix; // 变异类型(snp, insert, deletion) struct EventTypeValue { int index; // 在协变量数组中对应的索引 char representation; string longRepresentation; bool operator==(const EventTypeValue& a) const { return a.index == index; } }; struct EventType { static constexpr int EVENT_SIZE = 3; static EventTypeValue BASE_SUBSTITUTION; static EventTypeValue BASE_INSERTION; static EventTypeValue BASE_DELETION; static vector EVENTS; }; // Read group协变量 struct ReadGroupCovariate { static constexpr int index = 0; // 在协变量数组中的索引位置 static map RgToId; // read group name到id的映射 static map IdToRg; // id到read group name的映射 static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Base quality协变量 struct BaseQualityCovariate { static constexpr int index = 1; // 在协变量数组中的索引位置 static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Context协变量 struct ContextCovariate { static constexpr int index = 2; // 在协变量数组中的索引位置 static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1; static constexpr int LENGTH_BITS = 4; static constexpr int LENGTH_MASK = 15; // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. static constexpr int MAX_DNA_CONTEXT = 13; static int mismatchesContextSize; static int indelsContextSize; static int mismatchesKeyMask; static int indelsKeyMask; static uint8_t lowQualTail; static int baseIndexMap[256]; static void InitContextCovariate(BQSRArg& p) { mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE; indelsContextSize = p.INDELS_CONTEXT_SIZE; if (mismatchesContextSize > MAX_DNA_CONTEXT) { spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize); exit(1); } if (indelsContextSize > MAX_DNA_CONTEXT) { spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize); exit(1); } lowQualTail = p.LOW_QUAL_TAIL; if (mismatchesContextSize <= 0 || indelsContextSize <= 0) { spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize); exit(1); } mismatchesKeyMask = CreateMask(mismatchesContextSize); indelsKeyMask = CreateMask(indelsContextSize); // init baseIndexMap for (int i = 0; i < 256; ++i) { baseIndexMap[i] = -1; } baseIndexMap['A'] = 0; baseIndexMap['a'] = 0; baseIndexMap['*'] = 0; baseIndexMap['C'] = 1; baseIndexMap['c'] = 1; baseIndexMap['G'] = 2; baseIndexMap['g'] = 2; baseIndexMap['T'] = 3; baseIndexMap['t'] = 3; } static int MaximumKeyValue() { int length = max(mismatchesContextSize, indelsContextSize); int key = length; int bitOffset = LENGTH_BITS; for (int i = 0; i < length; ++i) { key |= (3 << bitOffset); bitOffset += 2; } return key; } static int CreateMask(int contextSize) { int mask = 0; // create 2*contextSize worth of bits for (int i = 0; i < contextSize; i++) { mask = (mask << 2) | 3; } // shift 4 bits to mask out the bits used to encode the length return mask << LENGTH_BITS; } /** * Helper method: computes the correct offset to use in computations of covariate values. * @param isNegativeStrand is the read on the negative strand * @param offset 0-based index of the base in the read * @param readLength length of the read * @return */ static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) { return isNegativeStrand ? (readLength - offset - 1) : offset; } static char baseIndexToSimpleBase(const int baseIndex) { switch (baseIndex) { case 0: return 'A'; case 1: return 'C'; case 2: return 'G'; case 3: return 'T'; default: return '.'; } } /** * Converts a key into the dna string representation. * * @param key the key representing the dna sequence * @return the dna sequence represented by the key */ static string ContextFromKey(const int key) { int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context int mask = 48; // use the mask to pull out bases int offset = LENGTH_BITS; string dna; for (int i = 0; i < length; i++) { int baseIndex = (key & mask) >> offset; dna.push_back(baseIndexToSimpleBase(baseIndex)); mask <<= 2; // move the mask over to the next 2 bits offset += 2; } return dna; } // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) static void GetStrandedClippedBytes(SamData& ad, StableArray& clippedBases, uint8_t lowQTail); // Creates a int representation of a given dna string. static int KeyFromContext(const StableArray& dna, const int start, const int end); // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context). static void GetReadContextAtEachPosition(const StableArray& bases, const int contextSize, const int mask, StableArray& keys); // 设置协变量的值 static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Cycle协变量 struct CycleCovariate { static constexpr int index = 3; // 在协变量数组中的索引位置 static int MAXIMUM_CYCLE_VALUE; static constexpr int CUSHION_FOR_INDELS = 4; static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; } static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; } /** * Encodes the cycle number as a key. */ static int KeyFromCycle(const int cycle, const int maxCycle) { // no negative values because values must fit into the first few bits of the long int result = std::abs(cycle); if (result > maxCycle) { spdlog::error( "The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected. Please use the --maximum-cycle-value argument " "(when creating the recalibration table in " "BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)", maxCycle, result); exit(1); } result <<= 1; // shift so we can add the "sign" bit if (cycle < 0) { result++; // negative cycles get the lower-most bit set } return result; } /** * Decodes the cycle number from the key. */ static int CycleFromKey(const int key) { int cycle = key >> 1; // shift so we can remove the "sign" bit if ((key & 1) != 0) { // is the last bit set? cycle *= -1; // then the cycle is negative } return cycle; } // Computes the encoded value of CycleCovariate's key for the given position at the read. static int CycleKey(SamData& ad, const int baseNumber, const bool indel, const int maxCycle); static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // 协变量相关的工具类 struct CovariateUtils { static constexpr int MAX_READ_LENGTH = 300; // 最大read长度 static constexpr int NUM_COVARIATES = 4; // 初始化PerReadCovariateMatrix static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) { matrix.resize(EventType::EVENT_SIZE); for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) { matrix[event_type].resize(MAX_READ_LENGTH); } } // 设置协变量 static inline void SetReadGroup(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) { matrix[EventType::BASE_SUBSTITUTION.index][readOffset].readGroup = mismatch; matrix[EventType::BASE_INSERTION.index][readOffset].readGroup = insertion; matrix[EventType::BASE_DELETION.index][readOffset].readGroup = deletion; } static inline void SetBaseQual(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) { matrix[EventType::BASE_SUBSTITUTION.index][readOffset].baseQuality = mismatch; matrix[EventType::BASE_INSERTION.index][readOffset].baseQuality = insertion; matrix[EventType::BASE_DELETION.index][readOffset].baseQuality = deletion; } static inline void SetContext(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) { matrix[EventType::BASE_SUBSTITUTION.index][readOffset].context = mismatch; matrix[EventType::BASE_INSERTION.index][readOffset].context = insertion; matrix[EventType::BASE_DELETION.index][readOffset].context = deletion; } static inline void SetCycle(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) { matrix[EventType::BASE_SUBSTITUTION.index][readOffset].cycle = mismatch; matrix[EventType::BASE_INSERTION.index][readOffset].cycle = insertion; matrix[EventType::BASE_DELETION.index][readOffset].cycle = deletion; } static inline void SetAllCovs(int m1, int i1, int d1, int m2, int i2, int d2, int m3, int i3, int d3, int m4, int i4, int d4, int readOffset, PerReadCovariateMatrix& matrix) { CovariateValues *cvp = &matrix[EventType::BASE_SUBSTITUTION.index][readOffset]; cvp->readGroup = m1; cvp->baseQuality = m2; cvp->context = m3; cvp->cycle = m4; cvp = &matrix[EventType::BASE_INSERTION.index][readOffset]; cvp->readGroup = i1; cvp->baseQuality = i2; cvp->context = i3; cvp->cycle = i4; cvp = &matrix[EventType::BASE_DELETION.index][readOffset]; cvp->readGroup = d1; cvp->baseQuality = d2; cvp->context = d3; cvp->cycle = d4; } // 对一条read计算协变量(该协变量被上一个read用过) static void ComputeCovariates(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues, int thid); };