FastBQSR/src/bqsr/covariate.h

320 lines
12 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*
Description: 在bqsr过程中计算协变量相关的类和方法
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/08
*/
#pragma once
#include <spdlog/spdlog.h>
#include <cstdint>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>
#include "bqsr_args.h"
#include "util/bam_wrap.h"
#include "util/sam_data.h"
using std::map;
using std::string;
using std::vector;
// 协变量的值, 4个协变量
struct CovariateValues {
int readGroup = 0;
int baseQuality = 0;
int context = -1;
int cycle = -1;
void clear() {
readGroup = 0;
baseQuality = 0;
context = -1;
cycle = -1;
}
};
/**
* This is where we store the per-read covariates, also indexed by (event type) and (read position).
* Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }.
* For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle).
*/
// 三维数组第一维是event type第二维是read position第三维是协变量数组(结构体)(base quality, context, cycle)
typedef vector<vector<CovariateValues>> PerReadCovariateMatrix;
// 变异类型snp, insert, deletion
struct EventTypeValue {
int index; // 在协变量数组中对应的索引
char representation;
string longRepresentation;
bool operator==(const EventTypeValue& a) const { return a.index == index; }
};
struct EventType {
static constexpr int EVENT_SIZE = 3;
static EventTypeValue BASE_SUBSTITUTION;
static EventTypeValue BASE_INSERTION;
static EventTypeValue BASE_DELETION;
static vector<EventTypeValue> EVENTS;
};
// Read group协变量
struct ReadGroupCovariate {
static constexpr int index = 0; // 在协变量数组中的索引位置
static map<string, int> RgToId; // read group name到id的映射
static map<int, string> IdToRg; // id到read group name的映射
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Base quality协变量
struct BaseQualityCovariate {
static constexpr int index = 1; // 在协变量数组中的索引位置
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Context协变量
struct ContextCovariate {
static constexpr int index = 2; // 在协变量数组中的索引位置
static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1;
static constexpr int LENGTH_BITS = 4;
static constexpr int LENGTH_MASK = 15;
// the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are
// not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base.
static constexpr int MAX_DNA_CONTEXT = 13;
static int mismatchesContextSize;
static int indelsContextSize;
static int mismatchesKeyMask;
static int indelsKeyMask;
static uint8_t lowQualTail;
static int baseIndexMap[256];
static void InitContextCovariate(BQSRArg& p) {
mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE;
indelsContextSize = p.INDELS_CONTEXT_SIZE;
if (mismatchesContextSize > MAX_DNA_CONTEXT) {
spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize);
exit(1);
}
if (indelsContextSize > MAX_DNA_CONTEXT) {
spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize);
exit(1);
}
lowQualTail = p.LOW_QUAL_TAIL;
if (mismatchesContextSize <= 0 || indelsContextSize <= 0) {
spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize);
exit(1);
}
mismatchesKeyMask = CreateMask(mismatchesContextSize);
indelsKeyMask = CreateMask(indelsContextSize);
// init baseIndexMap
for (int i = 0; i < 256; ++i) {
baseIndexMap[i] = -1;
}
baseIndexMap['A'] = 0;
baseIndexMap['a'] = 0;
baseIndexMap['*'] = 0;
baseIndexMap['C'] = 1;
baseIndexMap['c'] = 1;
baseIndexMap['G'] = 2;
baseIndexMap['g'] = 2;
baseIndexMap['T'] = 3;
baseIndexMap['t'] = 3;
}
static int MaximumKeyValue() {
int length = max(mismatchesContextSize, indelsContextSize);
int key = length;
int bitOffset = LENGTH_BITS;
for (int i = 0; i < length; ++i) {
key |= (3 << bitOffset);
bitOffset += 2;
}
return key;
}
static int CreateMask(int contextSize) {
int mask = 0;
// create 2*contextSize worth of bits
for (int i = 0; i < contextSize; i++) {
mask = (mask << 2) | 3;
}
// shift 4 bits to mask out the bits used to encode the length
return mask << LENGTH_BITS;
}
/**
* Helper method: computes the correct offset to use in computations of covariate values.
* @param isNegativeStrand is the read on the negative strand
* @param offset 0-based index of the base in the read
* @param readLength length of the read
* @return
*/
static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) {
return isNegativeStrand ? (readLength - offset - 1) : offset;
}
static char baseIndexToSimpleBase(const int baseIndex) {
switch (baseIndex) {
case 0:
return 'A';
case 1:
return 'C';
case 2:
return 'G';
case 3:
return 'T';
default:
return '.';
}
}
/**
* Converts a key into the dna string representation.
*
* @param key the key representing the dna sequence
* @return the dna sequence represented by the key
*/
static string ContextFromKey(const int key) {
int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context
int mask = 48; // use the mask to pull out bases
int offset = LENGTH_BITS;
string dna;
for (int i = 0; i < length; i++) {
int baseIndex = (key & mask) >> offset;
dna.push_back(baseIndexToSimpleBase(baseIndex));
mask <<= 2; // move the mask over to the next 2 bits
offset += 2;
}
return dna;
}
// 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N
static void GetStrandedClippedBytes(SamData& ad, StableArray<char>& clippedBases, uint8_t lowQTail);
// Creates a int representation of a given dna string.
static int KeyFromContext(const StableArray<char>& dna, const int start, const int end);
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
static void GetReadContextAtEachPosition(const StableArray<char>& bases, const int contextSize, const int mask, StableArray<int>& keys);
// 设置协变量的值
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Cycle协变量
struct CycleCovariate {
static constexpr int index = 3; // 在协变量数组中的索引位置
static int MAXIMUM_CYCLE_VALUE;
static constexpr int CUSHION_FOR_INDELS = 4;
static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; }
static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }
/**
* Encodes the cycle number as a key.
*/
static int KeyFromCycle(const int cycle, const int maxCycle) {
// no negative values because values must fit into the first few bits of the long
int result = std::abs(cycle);
if (result > maxCycle) {
spdlog::error(
"The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected. Please use the --maximum-cycle-value argument "
"(when creating the recalibration table in "
"BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)",
maxCycle, result);
exit(1);
}
result <<= 1; // shift so we can add the "sign" bit
if (cycle < 0) {
result++; // negative cycles get the lower-most bit set
}
return result;
}
/**
* Decodes the cycle number from the key.
*/
static int CycleFromKey(const int key) {
int cycle = key >> 1; // shift so we can remove the "sign" bit
if ((key & 1) != 0) { // is the last bit set?
cycle *= -1; // then the cycle is negative
}
return cycle;
}
// Computes the encoded value of CycleCovariate's key for the given position at the read.
static int CycleKey(SamData& ad, const int baseNumber, const bool indel, const int maxCycle);
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// 协变量相关的工具类
struct CovariateUtils {
static constexpr int MAX_READ_LENGTH = 300; // 最大read长度
static constexpr int NUM_COVARIATES = 4;
// 初始化PerReadCovariateMatrix
static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) {
matrix.resize(EventType::EVENT_SIZE);
for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) {
matrix[event_type].resize(MAX_READ_LENGTH);
}
}
// 设置协变量
static inline void SetReadGroup(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
matrix[EventType::BASE_SUBSTITUTION.index][readOffset].readGroup = mismatch;
matrix[EventType::BASE_INSERTION.index][readOffset].readGroup = insertion;
matrix[EventType::BASE_DELETION.index][readOffset].readGroup = deletion;
}
static inline void SetBaseQual(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
matrix[EventType::BASE_SUBSTITUTION.index][readOffset].baseQuality = mismatch;
matrix[EventType::BASE_INSERTION.index][readOffset].baseQuality = insertion;
matrix[EventType::BASE_DELETION.index][readOffset].baseQuality = deletion;
}
static inline void SetContext(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
matrix[EventType::BASE_SUBSTITUTION.index][readOffset].context = mismatch;
matrix[EventType::BASE_INSERTION.index][readOffset].context = insertion;
matrix[EventType::BASE_DELETION.index][readOffset].context = deletion;
}
static inline void SetCycle(int mismatch, int insertion, int deletion, int readOffset, PerReadCovariateMatrix& matrix) {
matrix[EventType::BASE_SUBSTITUTION.index][readOffset].cycle = mismatch;
matrix[EventType::BASE_INSERTION.index][readOffset].cycle = insertion;
matrix[EventType::BASE_DELETION.index][readOffset].cycle = deletion;
}
static inline void SetAllCovs(int m1, int i1, int d1, int m2, int i2, int d2, int m3, int i3, int d3, int m4, int i4, int d4, int readOffset,
PerReadCovariateMatrix& matrix) {
CovariateValues *cvp = &matrix[EventType::BASE_SUBSTITUTION.index][readOffset];
cvp->readGroup = m1;
cvp->baseQuality = m2;
cvp->context = m3;
cvp->cycle = m4;
cvp = &matrix[EventType::BASE_INSERTION.index][readOffset];
cvp->readGroup = i1;
cvp->baseQuality = i2;
cvp->context = i3;
cvp->cycle = i4;
cvp = &matrix[EventType::BASE_DELETION.index][readOffset];
cvp->readGroup = d1;
cvp->baseQuality = d2;
cvp->context = d3;
cvp->cycle = d4;
}
// 对一条read计算协变量该协变量被上一个read用过
static void ComputeCovariates(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues, int thid);
};