2025-12-20 16:35:45 +08:00
|
|
|
|
/*
|
|
|
|
|
|
Description: 在bqsr过程中,计算协变量相关的类和方法
|
|
|
|
|
|
|
|
|
|
|
|
Copyright : All rights reserved by ICT
|
|
|
|
|
|
|
|
|
|
|
|
Author : Zhang Zhonghai
|
|
|
|
|
|
Date : 2025/12/08
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
|
|
#include <spdlog/spdlog.h>
|
|
|
|
|
|
|
|
|
|
|
|
#include <cstdint>
|
|
|
|
|
|
#include <cstdlib>
|
|
|
|
|
|
#include <map>
|
|
|
|
|
|
#include <string>
|
|
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
|
|
|
|
#include "bqsr_args.h"
|
|
|
|
|
|
#include "util/bam_wrap.h"
|
|
|
|
|
|
|
|
|
|
|
|
using std::map;
|
|
|
|
|
|
using std::string;
|
|
|
|
|
|
using std::vector;
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Storage for the covariate values of a single read.
 *
 * The matrix is indexed as [event type][read position (aka cycle)][covariate], i.e. it has
 * shape { event type } x { read position } x { covariate }.
 * By default the { covariate } dimension has 4 entries (read group, base quality, context, cycle).
 */
using PerReadCovariateMatrix = vector<vector<vector<int>>>;
|
|
|
|
|
|
|
|
|
|
|
|
// Describes one event (variant) type: SNP, insertion or deletion.
struct EventTypeValue {
    int index;                  // index of this event type in the covariate array
    char representation;        // single-character form of the event type
    string longRepresentation;  // long-form name of the event type

    // Two values are considered equal when their indices match; the textual
    // representations do not take part in the comparison.
    bool operator==(const EventTypeValue& other) const { return index == other.index; }
};
|
|
|
|
|
|
|
|
|
|
|
|
struct EventType {
|
|
|
|
|
|
static constexpr int EVENT_SIZE = 3;
|
|
|
|
|
|
static EventTypeValue BASE_SUBSTITUTION;
|
|
|
|
|
|
static EventTypeValue BASE_INSERTION;
|
|
|
|
|
|
static EventTypeValue BASE_DELETION;
|
2025-12-28 14:33:45 +08:00
|
|
|
|
static vector<EventTypeValue> EVENTS;
|
2025-12-20 16:35:45 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// 协变量相关的工具类
|
|
|
|
|
|
struct CovariateUtils {
|
|
|
|
|
|
static constexpr int MAX_READ_LENGTH = 300; // 最大read长度
|
|
|
|
|
|
static constexpr int NUM_COVARIATES = 4;
|
|
|
|
|
|
|
|
|
|
|
|
// 初始化PerReadCovariateMatrix
|
|
|
|
|
|
static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) {
|
|
|
|
|
|
matrix.resize(EventType::EVENT_SIZE);
|
|
|
|
|
|
for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) {
|
|
|
|
|
|
matrix[event_type].resize(MAX_READ_LENGTH);
|
|
|
|
|
|
for (int pos = 0; pos < MAX_READ_LENGTH; ++pos) {
|
|
|
|
|
|
matrix[event_type][pos].resize(NUM_COVARIATES, 0);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 设置协变量
|
|
|
|
|
|
static void SetCovariate(int mismatch, int insertion, int deletion, int readOffset, int covIndex, PerReadCovariateMatrix& matrix) {
|
|
|
|
|
|
matrix[EventType::BASE_SUBSTITUTION.index][readOffset][covIndex] = mismatch;
|
|
|
|
|
|
matrix[EventType::BASE_INSERTION.index][readOffset][covIndex] = insertion;
|
|
|
|
|
|
matrix[EventType::BASE_DELETION.index][readOffset][covIndex] = deletion;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 对一条read计算协变量(该协变量被上一个read用过)
|
2025-12-29 16:48:55 +08:00
|
|
|
|
static void ComputeCovariates(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
2025-12-20 16:35:45 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Read group协变量
|
|
|
|
|
|
struct ReadGroupCovariate {
|
|
|
|
|
|
static constexpr int index = 0; // 在协变量数组中的索引位置
|
|
|
|
|
|
static map<string, int> RgToId; // read group name到id的映射
|
|
|
|
|
|
static map<int, string> IdToRg; // id到read group name的映射
|
|
|
|
|
|
|
2025-12-29 16:48:55 +08:00
|
|
|
|
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
2025-12-20 16:35:45 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Base quality协变量
|
|
|
|
|
|
struct BaseQualityCovariate {
|
|
|
|
|
|
static constexpr int index = 1; // 在协变量数组中的索引位置
|
2025-12-29 16:48:55 +08:00
|
|
|
|
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
2025-12-20 16:35:45 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Context协变量
|
|
|
|
|
|
struct ContextCovariate {
|
|
|
|
|
|
static constexpr int index = 2; // 在协变量数组中的索引位置
|
|
|
|
|
|
static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1;
|
|
|
|
|
|
static constexpr int LENGTH_BITS = 4;
|
|
|
|
|
|
static constexpr int LENGTH_MASK = 15;
|
|
|
|
|
|
|
|
|
|
|
|
// the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are
|
|
|
|
|
|
// not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base.
|
|
|
|
|
|
static constexpr int MAX_DNA_CONTEXT = 13;
|
|
|
|
|
|
|
|
|
|
|
|
static int mismatchesContextSize;
|
|
|
|
|
|
static int indelsContextSize;
|
|
|
|
|
|
static int mismatchesKeyMask;
|
|
|
|
|
|
static int indelsKeyMask;
|
|
|
|
|
|
static uint8_t lowQualTail;
|
|
|
|
|
|
|
|
|
|
|
|
static int baseIndexMap[256];
|
|
|
|
|
|
|
|
|
|
|
|
static void InitContextCovariate(BQSRArg& p) {
|
|
|
|
|
|
mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE;
|
|
|
|
|
|
indelsContextSize = p.INDELS_CONTEXT_SIZE;
|
|
|
|
|
|
if (mismatchesContextSize > MAX_DNA_CONTEXT) {
|
|
|
|
|
|
spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize);
|
|
|
|
|
|
exit(1);
|
|
|
|
|
|
}
|
|
|
|
|
|
if (indelsContextSize > MAX_DNA_CONTEXT) {
|
|
|
|
|
|
spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize);
|
|
|
|
|
|
exit(1);
|
|
|
|
|
|
}
|
|
|
|
|
|
lowQualTail = p.LOW_QUAL_TAIL;
|
|
|
|
|
|
if (mismatchesContextSize <= 0 || indelsContextSize <= 0) {
|
|
|
|
|
|
spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize);
|
|
|
|
|
|
exit(1);
|
|
|
|
|
|
}
|
|
|
|
|
|
mismatchesKeyMask = CreateMask(mismatchesContextSize);
|
|
|
|
|
|
indelsKeyMask = CreateMask(indelsContextSize);
|
|
|
|
|
|
|
|
|
|
|
|
// init baseIndexMap
|
|
|
|
|
|
for (int i = 0; i < 256; ++i) {
|
|
|
|
|
|
baseIndexMap[i] = -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
baseIndexMap['A'] = 0;
|
|
|
|
|
|
baseIndexMap['a'] = 0;
|
|
|
|
|
|
baseIndexMap['*'] = 0;
|
|
|
|
|
|
baseIndexMap['C'] = 1;
|
|
|
|
|
|
baseIndexMap['c'] = 1;
|
|
|
|
|
|
baseIndexMap['G'] = 2;
|
|
|
|
|
|
baseIndexMap['g'] = 2;
|
|
|
|
|
|
baseIndexMap['T'] = 3;
|
|
|
|
|
|
baseIndexMap['t'] = 3;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-28 14:33:45 +08:00
|
|
|
|
static int MaximumKeyValue() {
|
|
|
|
|
|
int length = max(mismatchesContextSize, indelsContextSize);
|
|
|
|
|
|
int key = length;
|
|
|
|
|
|
int bitOffset = LENGTH_BITS;
|
|
|
|
|
|
for (int i = 0; i < length; ++i) {
|
|
|
|
|
|
key |= (3 << bitOffset);
|
|
|
|
|
|
bitOffset += 2;
|
|
|
|
|
|
}
|
|
|
|
|
|
return key;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-20 16:35:45 +08:00
|
|
|
|
static int CreateMask(int contextSize) {
|
|
|
|
|
|
int mask = 0;
|
|
|
|
|
|
// create 2*contextSize worth of bits
|
|
|
|
|
|
for (int i = 0; i < contextSize; i++) {
|
|
|
|
|
|
mask = (mask << 2) | 3;
|
|
|
|
|
|
}
|
|
|
|
|
|
// shift 4 bits to mask out the bits used to encode the length
|
|
|
|
|
|
return mask << LENGTH_BITS;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Helper method: computes the correct offset to use in computations of covariate values.
|
|
|
|
|
|
* @param isNegativeStrand is the read on the negative strand
|
|
|
|
|
|
* @param offset 0-based index of the base in the read
|
|
|
|
|
|
* @param readLength length of the read
|
|
|
|
|
|
* @return
|
|
|
|
|
|
*/
|
|
|
|
|
|
static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) {
|
|
|
|
|
|
return isNegativeStrand ? (readLength - offset - 1) : offset;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-28 14:33:45 +08:00
|
|
|
|
static char baseIndexToSimpleBase(const int baseIndex) {
|
|
|
|
|
|
switch (baseIndex) {
|
|
|
|
|
|
case 0:
|
|
|
|
|
|
return 'A';
|
|
|
|
|
|
case 1:
|
|
|
|
|
|
return 'C';
|
|
|
|
|
|
case 2:
|
|
|
|
|
|
return 'G';
|
|
|
|
|
|
case 3:
|
|
|
|
|
|
return 'T';
|
|
|
|
|
|
default:
|
|
|
|
|
|
return '.';
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Converts a key into the dna string representation.
|
|
|
|
|
|
*
|
|
|
|
|
|
* @param key the key representing the dna sequence
|
|
|
|
|
|
* @return the dna sequence represented by the key
|
|
|
|
|
|
*/
|
|
|
|
|
|
static string ContextFromKey(const int key) {
|
|
|
|
|
|
int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context
|
|
|
|
|
|
int mask = 48; // use the mask to pull out bases
|
|
|
|
|
|
int offset = LENGTH_BITS;
|
|
|
|
|
|
|
|
|
|
|
|
string dna;
|
|
|
|
|
|
for (int i = 0; i < length; i++) {
|
|
|
|
|
|
int baseIndex = (key & mask) >> offset;
|
|
|
|
|
|
dna.push_back(baseIndexToSimpleBase(baseIndex));
|
|
|
|
|
|
mask <<= 2; // move the mask over to the next 2 bits
|
|
|
|
|
|
offset += 2;
|
|
|
|
|
|
}
|
|
|
|
|
|
return dna;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-20 16:35:45 +08:00
|
|
|
|
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
2025-12-29 16:48:55 +08:00
|
|
|
|
static void GetStrandedClippedBytes(SamData& ad, string& clippedBases, uint8_t lowQTail);
|
2025-12-20 16:35:45 +08:00
|
|
|
|
// Creates a int representation of a given dna string.
|
|
|
|
|
|
static int KeyFromContext(const string& dna, const int start, const int end);
|
|
|
|
|
|
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
|
|
|
|
|
static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
|
|
|
|
|
|
|
|
|
|
|
|
// 设置协变量的值
|
2025-12-29 16:48:55 +08:00
|
|
|
|
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
2025-12-20 16:35:45 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Cycle协变量
|
|
|
|
|
|
struct CycleCovariate {
|
|
|
|
|
|
static constexpr int index = 3; // 在协变量数组中的索引位置
|
|
|
|
|
|
static int MAXIMUM_CYCLE_VALUE;
|
|
|
|
|
|
static constexpr int CUSHION_FOR_INDELS = 4;
|
|
|
|
|
|
|
|
|
|
|
|
static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; }
|
|
|
|
|
|
|
2025-12-28 14:33:45 +08:00
|
|
|
|
static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }
|
|
|
|
|
|
|
2025-12-20 16:35:45 +08:00
|
|
|
|
/**
|
|
|
|
|
|
* Encodes the cycle number as a key.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static int KeyFromCycle(const int cycle, const int maxCycle) {
|
|
|
|
|
|
// no negative values because values must fit into the first few bits of the long
|
|
|
|
|
|
int result = std::abs(cycle);
|
|
|
|
|
|
if (result > maxCycle) {
|
|
|
|
|
|
spdlog::error(
|
|
|
|
|
|
"The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected. Please use the --maximum-cycle-value argument "
|
|
|
|
|
|
"(when creating the recalibration table in "
|
|
|
|
|
|
"BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)",
|
|
|
|
|
|
maxCycle, result);
|
|
|
|
|
|
exit(1);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
result <<= 1; // shift so we can add the "sign" bit
|
|
|
|
|
|
if (cycle < 0) {
|
|
|
|
|
|
result++; // negative cycles get the lower-most bit set
|
|
|
|
|
|
}
|
|
|
|
|
|
return result;
|
2025-12-28 14:33:45 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Decodes the cycle number from the key.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static int CycleFromKey(const int key) {
|
|
|
|
|
|
int cycle = key >> 1; // shift so we can remove the "sign" bit
|
|
|
|
|
|
if ((key & 1) != 0) { // is the last bit set?
|
|
|
|
|
|
cycle *= -1; // then the cycle is negative
|
|
|
|
|
|
}
|
|
|
|
|
|
return cycle;
|
|
|
|
|
|
}
|
2025-12-20 16:35:45 +08:00
|
|
|
|
|
|
|
|
|
|
// Computes the encoded value of CycleCovariate's key for the given position at the read.
|
2025-12-29 16:48:55 +08:00
|
|
|
|
static int CycleKey(SamData& ad, const int baseNumber, const bool indel, const int maxCycle);
|
2025-12-20 16:35:45 +08:00
|
|
|
|
|
2025-12-29 16:48:55 +08:00
|
|
|
|
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
2025-12-28 14:33:45 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// 好像不需要
|
|
|
|
|
|
struct StandardCovariateList {
|
|
|
|
|
|
ReadGroupCovariate readGroupCovariate;
|
|
|
|
|
|
BaseQualityCovariate qualityScoreCovariate;
|
2025-12-20 16:35:45 +08:00
|
|
|
|
};
|