/* Description: 在bqsr过程中,计算协变量相关的类和方法 Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2025/12/08 */ #pragma once #include #include #include #include #include #include #include "bqsr_args.h" #include "util/bam_wrap.h" using std::map; using std::string; using std::vector; /** * This is where we store the per-read covariates, also indexed by (event type) and (read position). * Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }. * For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle). */ typedef vector>> PerReadCovariateMatrix; // 变异类型(snp, insert, deletion) struct EventTypeValue { int index; // 在协变量数组中对应的索引 char representation; string longRepresentation; bool operator==(const EventTypeValue& a) const { return a.index == index; } }; struct EventType { static constexpr int EVENT_SIZE = 3; static EventTypeValue BASE_SUBSTITUTION; static EventTypeValue BASE_INSERTION; static EventTypeValue BASE_DELETION; static vector EVENTS; }; // 协变量相关的工具类 struct CovariateUtils { static constexpr int MAX_READ_LENGTH = 300; // 最大read长度 static constexpr int NUM_COVARIATES = 4; // 初始化PerReadCovariateMatrix static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) { matrix.resize(EventType::EVENT_SIZE); for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) { matrix[event_type].resize(MAX_READ_LENGTH); for (int pos = 0; pos < MAX_READ_LENGTH; ++pos) { matrix[event_type][pos].resize(NUM_COVARIATES, 0); } } } // 设置协变量 static void SetCovariate(int mismatch, int insertion, int deletion, int readOffset, int covIndex, PerReadCovariateMatrix& matrix) { matrix[EventType::BASE_SUBSTITUTION.index][readOffset][covIndex] = mismatch; matrix[EventType::BASE_INSERTION.index][readOffset][covIndex] = insertion; matrix[EventType::BASE_DELETION.index][readOffset][covIndex] = deletion; } // 对一条read计算协变量(该协变量被上一个read用过) static void ComputeCovariates(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Read group协变量 struct ReadGroupCovariate { static constexpr int index = 0; // 在协变量数组中的索引位置 static map RgToId; // read group name到id的映射 static map IdToRg; // id到read group name的映射 static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Base quality协变量 struct BaseQualityCovariate { static constexpr int index = 1; // 在协变量数组中的索引位置 static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Context协变量 struct ContextCovariate { static constexpr int index = 2; // 在协变量数组中的索引位置 static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1; static constexpr int LENGTH_BITS = 4; static constexpr int LENGTH_MASK = 15; // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. static constexpr int MAX_DNA_CONTEXT = 13; static int mismatchesContextSize; static int indelsContextSize; static int mismatchesKeyMask; static int indelsKeyMask; static uint8_t lowQualTail; static int baseIndexMap[256]; static void InitContextCovariate(BQSRArg& p) { mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE; indelsContextSize = p.INDELS_CONTEXT_SIZE; if (mismatchesContextSize > MAX_DNA_CONTEXT) { spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize); exit(1); } if (indelsContextSize > MAX_DNA_CONTEXT) { spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize); exit(1); } lowQualTail = p.LOW_QUAL_TAIL; if (mismatchesContextSize <= 0 || indelsContextSize <= 0) { spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize); exit(1); } mismatchesKeyMask = CreateMask(mismatchesContextSize); indelsKeyMask = CreateMask(indelsContextSize); // init baseIndexMap for (int i = 0; i < 256; ++i) { baseIndexMap[i] = -1; } baseIndexMap['A'] = 0; baseIndexMap['a'] = 0; baseIndexMap['*'] = 0; baseIndexMap['C'] = 1; baseIndexMap['c'] = 1; baseIndexMap['G'] = 2; baseIndexMap['g'] = 2; baseIndexMap['T'] = 3; baseIndexMap['t'] = 3; } static int MaximumKeyValue() { int length = max(mismatchesContextSize, indelsContextSize); int key = length; int bitOffset = LENGTH_BITS; for (int i = 0; i < length; ++i) { key |= (3 << bitOffset); bitOffset += 2; } return key; } static int CreateMask(int contextSize) { int mask = 0; // create 2*contextSize worth of bits for (int i = 0; i < contextSize; i++) { mask = (mask << 2) | 3; } // shift 4 bits to mask out the bits used to encode the length return mask << LENGTH_BITS; } /** * Helper method: computes the correct offset to use in computations of covariate values. * @param isNegativeStrand is the read on the negative strand * @param offset 0-based index of the base in the read * @param readLength length of the read * @return */ static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) { return isNegativeStrand ? (readLength - offset - 1) : offset; } static char baseIndexToSimpleBase(const int baseIndex) { switch (baseIndex) { case 0: return 'A'; case 1: return 'C'; case 2: return 'G'; case 3: return 'T'; default: return '.'; } } /** * Converts a key into the dna string representation. * * @param key the key representing the dna sequence * @return the dna sequence represented by the key */ static string ContextFromKey(const int key) { int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context int mask = 48; // use the mask to pull out bases int offset = LENGTH_BITS; string dna; for (int i = 0; i < length; i++) { int baseIndex = (key & mask) >> offset; dna.push_back(baseIndexToSimpleBase(baseIndex)); mask <<= 2; // move the mask over to the next 2 bits offset += 2; } return dna; } // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) static void GetStrandedClippedBytes(SamData& ad, string& clippedBases, uint8_t lowQTail); // Creates a int representation of a given dna string. static int KeyFromContext(const string& dna, const int start, const int end); // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context). static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector& keys); // 设置协变量的值 static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Cycle协变量 struct CycleCovariate { static constexpr int index = 3; // 在协变量数组中的索引位置 static int MAXIMUM_CYCLE_VALUE; static constexpr int CUSHION_FOR_INDELS = 4; static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; } static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; } /** * Encodes the cycle number as a key. */ static int KeyFromCycle(const int cycle, const int maxCycle) { // no negative values because values must fit into the first few bits of the long int result = std::abs(cycle); if (result > maxCycle) { spdlog::error( "The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected. Please use the --maximum-cycle-value argument " "(when creating the recalibration table in " "BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)", maxCycle, result); exit(1); } result <<= 1; // shift so we can add the "sign" bit if (cycle < 0) { result++; // negative cycles get the lower-most bit set } return result; } /** * Decodes the cycle number from the key. */ static int CycleFromKey(const int key) { int cycle = key >> 1; // shift so we can remove the "sign" bit if ((key & 1) != 0) { // is the last bit set? cycle *= -1; // then the cycle is negative } return cycle; } // Computes the encoded value of CycleCovariate's key for the given position at the read. static int CycleKey(SamData& ad, const int baseNumber, const bool indel, const int maxCycle); static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // 好像不需要 struct StandardCovariateList { ReadGroupCovariate readGroupCovariate; BaseQualityCovariate qualityScoreCovariate; };