调整了计算read group的方式,当read group只有一个时,不需要计算

This commit is contained in:
zzh 2026-01-01 09:40:46 +08:00
parent 745963831d
commit 985875ebac
4 changed files with 20 additions and 20 deletions

View File

@ -5,5 +5,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
# set(CMAKE_BUILD_TYPE Debug)
# set(CMAKE_BUILD_TYPE Release)
# add_definitions(-DSHOW_PERF=1)
add_definitions(-DSHOW_PERF)
#add_definitions(-DSHOW_PERF=1)
add_subdirectory(src)

View File

@ -28,7 +28,8 @@ struct BQSRArg {
int NUM_THREADS = 1;
size_t MAX_MEM = ((size_t)1) << 30; // 1G
size_t MAX_MEM = ((size_t)1 << 30);
// + ((size_t)1 << 29); // 1G 应该按照每个线程多少内存来设置
bool DUPLEX_IO = true; //

View File

@ -121,18 +121,21 @@ void CovariateUtils::ComputeCovariates(SamData& sd, sam_hdr_t* header, PerReadCo
// ReadGroupCovariate 协变量的方法
void ReadGroupCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
uint8_t *rgStr = bam_aux_get(sd.bw->b, "RG");
char* rgVal = nullptr;
if (rgStr) rgVal = bam_aux2Z(rgStr);
int key = 0;
if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) {
spdlog::error("The RG tag value for read can not be found in header!");
} else {
key = RgToId[rgVal];
}
for (int i = 0; i < sd.read_len; ++i) {
CovariateUtils::SetReadGroup(key, key, key, i, values);
}
if (RgToId.size() > 1) { // 只有当bam文件有多个read group时再读取每个read的RG tag
uint8_t* rgStr = bam_aux_get(sd.bw->b, "RG");
char* rgVal = nullptr;
if (rgStr)
rgVal = bam_aux2Z(rgStr);
if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) {
spdlog::error("The RG tag value for read can not be found in header!");
} else {
key = RgToId[rgVal];
for (int i = 0; i < sd.read_len; ++i) {
CovariateUtils::SetReadGroup(key, key, key, i, values);
}
}
} // 因为read group默认就是0,所以只有一个read group时不需要设置
}
// BaseQualityCovariate 协变量的方法
@ -332,11 +335,9 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
const int originalReadLength = sd.read_len;
// store the original bases and then write Ns over low quality ones
// string strandedClippedBases(sd.bases.arr.data(), sd.read_len);
auto &strandedClippedBases = sd.strandedClippedBases;
strandedClippedBases.copy(sd.bases);
// GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值
GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 命名我之前看到过这个30的
GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 明明我之前看到过这个30的
// spdlog::info("bases: {}", strandedClippedBases);
auto& nBasePairContextAtEachCycle = sd.nBasePairContextAtEachCycle;
GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle);
@ -346,6 +347,7 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
// this is necessary to ensure that we don't keep historical data in the ReadCovariates values
// since the context covariate may not span the entire set of values in read covariates
// due to the clipping of the low quality bases
// 这段代码应该不会执行因为clip with N不会改变read长度
if (readLengthAfterClipping != originalReadLength) {
// don't bother zeroing out if we are going to overwrite the whole array
for (int i = 0; i < originalReadLength; i++) {

View File

@ -116,10 +116,6 @@ struct RecalFuncs {
// int idx = 0;
PROF_START(TP_read_vcf);
for (auto& vcf : vcfs) {
// 为啥多线程环境会出现deque的front和[0]不一样?好像是调试的时候的问题,实际运行时没再出现
// if (vcf.knownSites.front().left != vcf.knownSites[0].left || vcf.knownSites.front().right != vcf.knownSites[0].right)
// spdlog::error("front is different with [0]: {} - {}", vcf.knownSites.front().left, vcf.knownSites[0].left);
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam不需要读取
// 清理旧的interval只有此时才有清理的必要
while (!vcf.knownSites.empty()) {