diff --git a/CMakeLists.txt b/CMakeLists.txt index e0e4f84..541c84f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,5 +5,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") # set(CMAKE_BUILD_TYPE Debug) # set(CMAKE_BUILD_TYPE Release) -# add_definitions(-DSHOW_PERF=1) +add_definitions(-DSHOW_PERF) +#add_definitions(-DSHOW_PERF=1) add_subdirectory(src) diff --git a/src/bqsr/bqsr_args.h b/src/bqsr/bqsr_args.h index c2407d4..feb4299 100644 --- a/src/bqsr/bqsr_args.h +++ b/src/bqsr/bqsr_args.h @@ -28,7 +28,8 @@ struct BQSRArg { int NUM_THREADS = 1; - size_t MAX_MEM = ((size_t)1) << 30; // 1G + size_t MAX_MEM = ((size_t)1 << 30); + // + ((size_t)1 << 29); // 1G 应该按照每个线程多少内存来设置 bool DUPLEX_IO = true; // diff --git a/src/bqsr/covariate.cpp b/src/bqsr/covariate.cpp index ec58306..6a7e121 100644 --- a/src/bqsr/covariate.cpp +++ b/src/bqsr/covariate.cpp @@ -121,18 +121,21 @@ void CovariateUtils::ComputeCovariates(SamData& sd, sam_hdr_t* header, PerReadCo // ReadGroupCovariate 协变量的方法 void ReadGroupCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { - uint8_t *rgStr = bam_aux_get(sd.bw->b, "RG"); - char* rgVal = nullptr; - if (rgStr) rgVal = bam_aux2Z(rgStr); int key = 0; - if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) { - spdlog::error("The RG tag value for read can not be found in header!"); - } else { - key = RgToId[rgVal]; - } - for (int i = 0; i < sd.read_len; ++i) { - CovariateUtils::SetReadGroup(key, key, key, i, values); - } + if (RgToId.size() > 1) { // 只有当bam文件有多个read group时,再读取每个read的RG tag + uint8_t* rgStr = bam_aux_get(sd.bw->b, "RG"); + char* rgVal = nullptr; + if (rgStr) + rgVal = bam_aux2Z(rgStr); + if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) { + spdlog::error("The RG tag value for read can not be found in header!"); + } else { + key = RgToId[rgVal]; + for (int i = 0; i < sd.read_len; ++i) { + CovariateUtils::SetReadGroup(key, key, key, i, values); + } + } + } // 因为read group默认就是0,所以只有一个read group时,不需要设置 } // BaseQualityCovariate 协变量的方法 @@ -332,11 +335,9 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar const int originalReadLength = sd.read_len; // store the original bases and then write Ns over low quality ones - // string strandedClippedBases(sd.bases.arr.data(), sd.read_len); auto &strandedClippedBases = sd.strandedClippedBases; strandedClippedBases.copy(sd.bases); - // GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值 - GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 命名我之前看到过这个30的? + GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 明明我之前看到过这个30的? // spdlog::info("bases: {}", strandedClippedBases); auto& nBasePairContextAtEachCycle = sd.nBasePairContextAtEachCycle; GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle); @@ -346,6 +347,7 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar // this is necessary to ensure that we don't keep historical data in the ReadCovariates values // since the context covariate may not span the entire set of values in read covariates // due to the clipping of the low quality bases + // 这段代码应该不会执行,因为clip with N不会改变read长度 if (readLengthAfterClipping != originalReadLength) { // don't bother zeroing out if we are going to overwrite the whole array for (int i = 0; i < originalReadLength; i++) { diff --git a/src/bqsr/recal_funcs.h b/src/bqsr/recal_funcs.h index b7682eb..6b23da0 100644 --- a/src/bqsr/recal_funcs.h +++ b/src/bqsr/recal_funcs.h @@ -116,10 +116,6 @@ struct RecalFuncs { // int idx = 0; PROF_START(TP_read_vcf); for (auto& vcf : vcfs) { - // 为啥多线程环境会出现,deque的front和[0]不一样?好像是调试的时候的问题,实际运行时没再出现 - // if (vcf.knownSites.front().left != vcf.knownSites[0].left || vcf.knownSites.front().right != vcf.knownSites[0].right) - // spdlog::error("front is different with [0]: {} - {}", vcf.knownSites.front().left, vcf.knownSites[0].left); - if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam,不需要读取 // 清理旧的interval,只有此时才有清理的必要 while (!vcf.knownSites.empty()) {