320 lines
12 KiB
C++
320 lines
12 KiB
C++
/*
|
||
Description: 在bqsr过程中,计算协变量相关的类和方法
|
||
|
||
Copyright : All right reserved by ICT
|
||
|
||
Author : Zhang Zhonghai
|
||
Date : 2025/12/08
|
||
*/
|
||
|
||
#pragma once
|
||
|
||
#include <spdlog/spdlog.h>
|
||
|
||
#include <cstdint>
|
||
#include <cstdlib>
|
||
#include <map>
|
||
#include <string>
|
||
#include <vector>
|
||
|
||
#include "bqsr_args.h"
|
||
#include "util/bam_wrap.h"
|
||
#include "util/sam_data.h"
|
||
|
||
using std::map;
|
||
using std::string;
|
||
using std::vector;
|
||
|
||
// 协变量的值, 4个协变量
|
||
struct CovariateValues {
|
||
int readGroup = 0;
|
||
int baseQuality = 0;
|
||
int context = -1;
|
||
int cycle = -1;
|
||
void clear() {
|
||
readGroup = 0;
|
||
baseQuality = 0;
|
||
context = -1;
|
||
cycle = -1;
|
||
}
|
||
};
|
||
|
||
/**
|
||
* This is where we store the per-read covariates, also indexed by (event type) and (read position).
|
||
* Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }.
|
||
* For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle).
|
||
*/
|
||
// 三维数组,第一维是event type,第二维是read position,第三维是协变量数组(结构体)(base quality, context, cycle)
|
||
typedef vector<vector<CovariateValues>> PerReadCovariateMatrix;
|
||
|
||
// 变异类型(snp, insert, deletion)
|
||
struct EventTypeValue {
|
||
int index; // 在协变量数组中对应的索引
|
||
char representation;
|
||
string longRepresentation;
|
||
bool operator==(const EventTypeValue& a) const { return a.index == index; }
|
||
};
|
||
|
||
struct EventType {
|
||
static constexpr int EVENT_SIZE = 3;
|
||
static EventTypeValue BASE_SUBSTITUTION;
|
||
static EventTypeValue BASE_INSERTION;
|
||
static EventTypeValue BASE_DELETION;
|
||
static vector<EventTypeValue> EVENTS;
|
||
};
|
||
|
||
// Read group协变量
|
||
struct ReadGroupCovariate {
|
||
static constexpr int index = 0; // 在协变量数组中的索引位置
|
||
static map<string, int> RgToId; // read group name到id的映射
|
||
static map<int, string> IdToRg; // id到read group name的映射
|
||
|
||
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||
};
|
||
|
||
// Base quality协变量
|
||
struct BaseQualityCovariate {
|
||
static constexpr int index = 1; // 在协变量数组中的索引位置
|
||
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||
};
|
||
|
||
// Context协变量
|
||
struct ContextCovariate {
|
||
static constexpr int index = 2; // 在协变量数组中的索引位置
|
||
static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1;
|
||
static constexpr int LENGTH_BITS = 4;
|
||
static constexpr int LENGTH_MASK = 15;
|
||
|
||
// the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are
|
||
// not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base.
|
||
static constexpr int MAX_DNA_CONTEXT = 13;
|
||
|
||
static int mismatchesContextSize;
|
||
static int indelsContextSize;
|
||
static int mismatchesKeyMask;
|
||
static int indelsKeyMask;
|
||
static uint8_t lowQualTail;
|
||
|
||
static int baseIndexMap[256];
|
||
|
||
static void InitContextCovariate(BQSRArg& p) {
|
||
mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE;
|
||
indelsContextSize = p.INDELS_CONTEXT_SIZE;
|
||
if (mismatchesContextSize > MAX_DNA_CONTEXT) {
|
||
spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize);
|
||
exit(1);
|
||
}
|
||
if (indelsContextSize > MAX_DNA_CONTEXT) {
|
||
spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize);
|
||
exit(1);
|
||
}
|
||
lowQualTail = p.LOW_QUAL_TAIL;
|
||
if (mismatchesContextSize <= 0 || indelsContextSize <= 0) {
|
||
spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize);
|
||
exit(1);
|
||
}
|
||
mismatchesKeyMask = CreateMask(mismatchesContextSize);
|
||
indelsKeyMask = CreateMask(indelsContextSize);
|
||
|
||
// init baseIndexMap
|
||
for (int i = 0; i < 256; ++i) {
|
||
baseIndexMap[i] = -1;
|
||
}
|
||
baseIndexMap['A'] = 0;
|
||
baseIndexMap['a'] = 0;
|
||
baseIndexMap['*'] = 0;
|
||
baseIndexMap['C'] = 1;
|
||
baseIndexMap['c'] = 1;
|
||
baseIndexMap['G'] = 2;
|
||
baseIndexMap['g'] = 2;
|
||
baseIndexMap['T'] = 3;
|
||
baseIndexMap['t'] = 3;
|
||
}
|
||
|
||
static int MaximumKeyValue() {
|
||
int length = max(mismatchesContextSize, indelsContextSize);
|
||
int key = length;
|
||
int bitOffset = LENGTH_BITS;
|
||
for (int i = 0; i < length; ++i) {
|
||
key |= (3 << bitOffset);
|
||
bitOffset += 2;
|
||
}
|
||
return key;
|
||
}
|
||
|
||
static int CreateMask(int contextSize) {
|
||
int mask = 0;
|
||
// create 2*contextSize worth of bits
|
||
for (int i = 0; i < contextSize; i++) {
|
||
mask = (mask << 2) | 3;
|
||
}
|
||
// shift 4 bits to mask out the bits used to encode the length
|
||
return mask << LENGTH_BITS;
|
||
}
|
||
|
||
/**
|
||
* Helper method: computes the correct offset to use in computations of covariate values.
|
||
* @param isNegativeStrand is the read on the negative strand
|
||
* @param offset 0-based index of the base in the read
|
||
* @param readLength length of the read
|
||
* @return
|
||
*/
|
||
static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) {
|
||
return isNegativeStrand ? (readLength - offset - 1) : offset;
|
||
}
|
||
|
||
static char baseIndexToSimpleBase(const int baseIndex) {
|
||
switch (baseIndex) {
|
||
case 0:
|
||
return 'A';
|
||
case 1:
|
||
return 'C';
|
||
case 2:
|
||
return 'G';
|
||
case 3:
|
||
return 'T';
|
||
default:
|
||
return '.';
|
||
}
|
||
}
|
||
/**
|
||
* Converts a key into the dna string representation.
|
||
*
|
||
* @param key the key representing the dna sequence
|
||
* @return the dna sequence represented by the key
|
||
*/
|
||
static string ContextFromKey(const int key) {
|
||
int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context
|
||
int mask = 48; // use the mask to pull out bases
|
||
int offset = LENGTH_BITS;
|
||
|
||
string dna;
|
||
for (int i = 0; i < length; i++) {
|
||
int baseIndex = (key & mask) >> offset;
|
||
dna.push_back(baseIndexToSimpleBase(baseIndex));
|
||
mask <<= 2; // move the mask over to the next 2 bits
|
||
offset += 2;
|
||
}
|
||
return dna;
|
||
}
|
||
|
||
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||
static void GetStrandedClippedBytes(SamData& ad, StableArray<char>& clippedBases, uint8_t lowQTail);
|
||
// Creates a int representation of a given dna string.
|
||
static int KeyFromContext(const StableArray<char>& dna, const int start, const int end);
|
||
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||
static void GetReadContextAtEachPosition(const StableArray<char>& bases, const int contextSize, const int mask, StableArray<int>& keys);
|
||
|
||
// 设置协变量的值
|
||
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||
};
|
||
|
||
// Cycle协变量
|
||
struct CycleCovariate {
|
||
static constexpr int index = 3; // 在协变量数组中的索引位置
|
||
static int MAXIMUM_CYCLE_VALUE;
|
||
static constexpr int CUSHION_FOR_INDELS = 4;
|
||
|
||
static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; }
|
||
|
||
static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }
|
||
|
||
/**
|
||
* Encodes the cycle number as a key.
|
||
*/
|
||
static int KeyFromCycle(const int cycle, const int maxCycle) {
|
||
// no negative values because values must fit into the first few bits of the long
|
||
int result = std::abs(cycle);
|
||
if (result > maxCycle) {
|
||
spdlog::error(
|
||
"The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected. Please use the --maximum-cycle-value argument "
|
||
"(when creating the recalibration table in "
|
||
"BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)",
|
||
maxCycle, result);
|
||
exit(1);
|
||
}
|
||
|
||
result <<= 1; // shift so we can add the "sign" bit
|
||
if (cycle < 0) {
|
||
result++; // negative cycles get the lower-most bit set
|
||
}
|
||
return result;
|
||
}
|
||
|
||
/**
|
||
* Decodes the cycle number from the key.
|
||
*/
|
||
static int CycleFromKey(const int key) {
|
||
int cycle = key >> 1; // shift so we can remove the "sign" bit
|
||
if ((key & 1) != 0) { // is the last bit set?
|
||
cycle *= -1; // then the cycle is negative
|
||
}
|
||
return cycle;
|
||
}
|
||
|
||
// Computes the encoded value of CycleCovariate's key for the given position at the read.
|
||
static int CycleKey(SamData& ad, const int baseNumber, const bool indel, const int maxCycle);
|
||
|
||
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||
};
|
||
|
||
// 协变量相关的工具类
|
||
struct CovariateUtils {
|
||
static constexpr int MAX_READ_LENGTH = 300; // 最大read长度
|
||
static constexpr int NUM_COVARIATES = 4;
|
||
|
||
// 初始化PerReadCovariateMatrix
|
||
static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) {
|
||
matrix.resize(EventType::EVENT_SIZE);
|
||
for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) {
|
||
matrix[event_type].resize(MAX_READ_LENGTH);
|
||
}
|
||
}
|
||
|
||
// 设置协变量
|
||
static inline void SetReadGroup(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
|
||
matrix[EventType::BASE_SUBSTITUTION.index][readOffset].readGroup = mismatch;
|
||
matrix[EventType::BASE_INSERTION.index][readOffset].readGroup = insertion;
|
||
matrix[EventType::BASE_DELETION.index][readOffset].readGroup = deletion;
|
||
}
|
||
|
||
static inline void SetBaseQual(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
|
||
matrix[EventType::BASE_SUBSTITUTION.index][readOffset].baseQuality = mismatch;
|
||
matrix[EventType::BASE_INSERTION.index][readOffset].baseQuality = insertion;
|
||
matrix[EventType::BASE_DELETION.index][readOffset].baseQuality = deletion;
|
||
}
|
||
|
||
static inline void SetContext(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
|
||
matrix[EventType::BASE_SUBSTITUTION.index][readOffset].context = mismatch;
|
||
matrix[EventType::BASE_INSERTION.index][readOffset].context = insertion;
|
||
matrix[EventType::BASE_DELETION.index][readOffset].context = deletion;
|
||
}
|
||
|
||
static inline void SetCycle(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
|
||
matrix[EventType::BASE_SUBSTITUTION.index][readOffset].cycle = mismatch;
|
||
matrix[EventType::BASE_INSERTION.index][readOffset].cycle = insertion;
|
||
matrix[EventType::BASE_DELETION.index][readOffset].cycle = deletion;
|
||
}
|
||
|
||
static inline void SetAllCovs(int m1, int i1, int d1, int m2, int i2, int d2, int m3, int i3, int d3, int m4, int i4, int d4, int readOffset,
|
||
PerReadCovariateMatrix& matrix) {
|
||
CovariateValues *cvp = &matrix[EventType::BASE_SUBSTITUTION.index][readOffset];
|
||
cvp->readGroup = m1;
|
||
cvp->baseQuality = m2;
|
||
cvp->context = m3;
|
||
cvp->cycle = m4;
|
||
cvp = &matrix[EventType::BASE_INSERTION.index][readOffset];
|
||
cvp->readGroup = i1;
|
||
cvp->baseQuality = i2;
|
||
cvp->context = i3;
|
||
cvp->cycle = i4;
|
||
cvp = &matrix[EventType::BASE_DELETION.index][readOffset];
|
||
cvp->readGroup = d1;
|
||
cvp->baseQuality = d2;
|
||
cvp->context = d3;
|
||
cvp->cycle = d4;
|
||
}
|
||
|
||
// 对一条read计算协变量(该协变量被上一个read用过)
|
||
static void ComputeCovariates(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues, int thid);
|
||
}; |