FastBQSR/src/bqsr/covariate.h

276 lines
10 KiB
C
Raw Normal View History

/*
Description: bqsr
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/08
*/
#pragma once
#include <spdlog/spdlog.h>
#include <cstdint>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>
#include "bqsr_args.h"
#include "util/bam_wrap.h"
using std::map;
using std::string;
using std::vector;
/**
* This is where we store the per-read covariates, also indexed by (event type) and (read position).
* Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }.
* For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle).
*/
typedef vector<vector<vector<int>>> PerReadCovariateMatrix;
// 变异类型snp, insert, deletion
struct EventTypeValue {
int index; // 在协变量数组中对应的索引
char representation;
string longRepresentation;
bool operator==(const EventTypeValue& a) const { return a.index == index; }
};
struct EventType {
static constexpr int EVENT_SIZE = 3;
static EventTypeValue BASE_SUBSTITUTION;
static EventTypeValue BASE_INSERTION;
static EventTypeValue BASE_DELETION;
static vector<EventTypeValue> EVENTS;
};
// 协变量相关的工具类
struct CovariateUtils {
static constexpr int MAX_READ_LENGTH = 300; // 最大read长度
static constexpr int NUM_COVARIATES = 4;
// 初始化PerReadCovariateMatrix
static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) {
matrix.resize(EventType::EVENT_SIZE);
for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) {
matrix[event_type].resize(MAX_READ_LENGTH);
for (int pos = 0; pos < MAX_READ_LENGTH; ++pos) {
matrix[event_type][pos].resize(NUM_COVARIATES, 0);
}
}
}
// 设置协变量
static void SetCovariate(int mismatch, int insertion, int deletion, int readOffset, int covIndex, PerReadCovariateMatrix& matrix) {
matrix[EventType::BASE_SUBSTITUTION.index][readOffset][covIndex] = mismatch;
matrix[EventType::BASE_INSERTION.index][readOffset][covIndex] = insertion;
matrix[EventType::BASE_DELETION.index][readOffset][covIndex] = deletion;
}
// 对一条read计算协变量该协变量被上一个read用过
static void ComputeCovariates(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Read group协变量
struct ReadGroupCovariate {
static constexpr int index = 0; // 在协变量数组中的索引位置
static map<string, int> RgToId; // read group name到id的映射
static map<int, string> IdToRg; // id到read group name的映射
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Base quality协变量
struct BaseQualityCovariate {
static constexpr int index = 1; // 在协变量数组中的索引位置
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Context协变量
struct ContextCovariate {
static constexpr int index = 2; // 在协变量数组中的索引位置
static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1;
static constexpr int LENGTH_BITS = 4;
static constexpr int LENGTH_MASK = 15;
// the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are
// not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base.
static constexpr int MAX_DNA_CONTEXT = 13;
static int mismatchesContextSize;
static int indelsContextSize;
static int mismatchesKeyMask;
static int indelsKeyMask;
static uint8_t lowQualTail;
static int baseIndexMap[256];
static void InitContextCovariate(BQSRArg& p) {
mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE;
indelsContextSize = p.INDELS_CONTEXT_SIZE;
if (mismatchesContextSize > MAX_DNA_CONTEXT) {
spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize);
exit(1);
}
if (indelsContextSize > MAX_DNA_CONTEXT) {
spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize);
exit(1);
}
lowQualTail = p.LOW_QUAL_TAIL;
if (mismatchesContextSize <= 0 || indelsContextSize <= 0) {
spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize);
exit(1);
}
mismatchesKeyMask = CreateMask(mismatchesContextSize);
indelsKeyMask = CreateMask(indelsContextSize);
// init baseIndexMap
for (int i = 0; i < 256; ++i) {
baseIndexMap[i] = -1;
}
baseIndexMap['A'] = 0;
baseIndexMap['a'] = 0;
baseIndexMap['*'] = 0;
baseIndexMap['C'] = 1;
baseIndexMap['c'] = 1;
baseIndexMap['G'] = 2;
baseIndexMap['g'] = 2;
baseIndexMap['T'] = 3;
baseIndexMap['t'] = 3;
}
static int MaximumKeyValue() {
int length = max(mismatchesContextSize, indelsContextSize);
int key = length;
int bitOffset = LENGTH_BITS;
for (int i = 0; i < length; ++i) {
key |= (3 << bitOffset);
bitOffset += 2;
}
return key;
}
static int CreateMask(int contextSize) {
int mask = 0;
// create 2*contextSize worth of bits
for (int i = 0; i < contextSize; i++) {
mask = (mask << 2) | 3;
}
// shift 4 bits to mask out the bits used to encode the length
return mask << LENGTH_BITS;
}
/**
* Helper method: computes the correct offset to use in computations of covariate values.
* @param isNegativeStrand is the read on the negative strand
* @param offset 0-based index of the base in the read
* @param readLength length of the read
* @return
*/
static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) {
return isNegativeStrand ? (readLength - offset - 1) : offset;
}
static char baseIndexToSimpleBase(const int baseIndex) {
switch (baseIndex) {
case 0:
return 'A';
case 1:
return 'C';
case 2:
return 'G';
case 3:
return 'T';
default:
return '.';
}
}
/**
* Converts a key into the dna string representation.
*
* @param key the key representing the dna sequence
* @return the dna sequence represented by the key
*/
static string ContextFromKey(const int key) {
int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context
int mask = 48; // use the mask to pull out bases
int offset = LENGTH_BITS;
string dna;
for (int i = 0; i < length; i++) {
int baseIndex = (key & mask) >> offset;
dna.push_back(baseIndexToSimpleBase(baseIndex));
mask <<= 2; // move the mask over to the next 2 bits
offset += 2;
}
return dna;
}
// 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N
static void GetStrandedClippedBytes(SamData& ad, string& clippedBases, uint8_t lowQTail);
// Creates a int representation of a given dna string.
static int KeyFromContext(const string& dna, const int start, const int end);
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
// 设置协变量的值
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Cycle协变量
struct CycleCovariate {
static constexpr int index = 3; // 在协变量数组中的索引位置
static int MAXIMUM_CYCLE_VALUE;
static constexpr int CUSHION_FOR_INDELS = 4;
static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; }
static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }
/**
* Encodes the cycle number as a key.
*/
static int KeyFromCycle(const int cycle, const int maxCycle) {
// no negative values because values must fit into the first few bits of the long
int result = std::abs(cycle);
if (result > maxCycle) {
spdlog::error(
"The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected. Please use the --maximum-cycle-value argument "
"(when creating the recalibration table in "
"BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)",
maxCycle, result);
exit(1);
}
result <<= 1; // shift so we can add the "sign" bit
if (cycle < 0) {
result++; // negative cycles get the lower-most bit set
}
return result;
}
/**
* Decodes the cycle number from the key.
*/
static int CycleFromKey(const int key) {
int cycle = key >> 1; // shift so we can remove the "sign" bit
if ((key & 1) != 0) { // is the last bit set?
cycle *= -1; // then the cycle is negative
}
return cycle;
}
// Computes the encoded value of CycleCovariate's key for the given position at the read.
static int CycleKey(SamData& ad, const int baseNumber, const bool indel, const int maxCycle);
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// 好像不需要
struct StandardCovariateList {
ReadGroupCovariate readGroupCovariate;
BaseQualityCovariate qualityScoreCovariate;
};