From 65878bbf9609dcd08d80b1393e8cf60c253b2715 Mon Sep 17 00:00:00 2001 From: zzh Date: Wed, 31 Dec 2025 11:16:13 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AF=B95.1G=E7=9A=84=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=EF=BC=8C=E4=B8=B2=E8=A1=8C=E7=BB=93=E6=9E=9C=E6=AD=A3=E7=A1=AE?= =?UTF-8?q?=E4=BA=86=EF=BC=8C=E5=B9=B6=E8=A1=8C=E7=BB=93=E6=9E=9C=E8=BF=98?= =?UTF-8?q?=E6=B2=A1=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/bqsr/bqsr_entry.cpp | 41 ++++++++++++++++++++++------------------- src/bqsr/recal_funcs.h | 4 ++-- src/util/base_utils.cpp | 3 ++- src/util/base_utils.h | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 22 deletions(-) diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index e965ffc..66ff367 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -189,22 +189,24 @@ int SerialBQSR(AuxVar &aux) { sd.applyTransformations(); PROF_END(gprof[GP_clip_read], clip_read); - const char* qname = bam_get_qname(sd.bw->b); + //const char* qname = bam_get_qname(sd.bw->b); // fprintf(gf[4], "%ld %d %d %d\n", sd.rid, sd.read_len, 1 + BamWrap::bam_pos(sd.start_pos), 1 + BamWrap::bam_pos(sd.end_pos)); - // fprintf(gf[4], "%s %d %d %d %d\n", qname, sd.bw->b->core.flag, sd.read_len, 1 + BamWrap::bam_pos(sd.start_pos), 1 + BamWrap::bam_pos(sd.end_pos)); + //fprintf(gf[4], "%s %d %d %d %d\n", qname, sd.bw->b->core.flag, sd.read_len, 1 + BamWrap::bam_pos(sd.start_pos), 1 + BamWrap::bam_pos(sd.end_pos)); // 6. 更新每个read的platform信息,好像没啥用,暂时忽略 + // 这里计算snp和indel有点问题,snp和del结果不对,白天调试一下 const int nErrors = RecalFuncs::calculateIsSNPOrIndel(aux, sd, isSNP, isIns, isDel); - // fprintf(gf[0], "%d\t", sd.read_len); - // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]); - // fprintf(gf[0], "\n"); - // fprintf(gf[1], "%d\t", sd.read_len); - // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]); - // fprintf(gf[1], "\n"); - // fprintf(gf[2], "%d\t", sd.read_len); - // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]); - // fprintf(gf[2], "\n"); + // fprintf(gf[4], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); + // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[4], "%d ", skips[ii] ? 1 : 0); + // fprintf(gf[4], "\n"); + + // fprintf(gf[0], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); + // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]); fprintf(gf[0], "\n"); + // fprintf(gf[1], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); + // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]); fprintf(gf[1], "\n"); + // fprintf(gf[2], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); + // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]); fprintf(gf[2], "\n"); // 7. 计算baqArray // BAQ = base alignment quality @@ -257,9 +259,9 @@ int SerialBQSR(AuxVar &aux) { //spdlog::info("{}", ss.str()); PROF_GP_END(read_vcf); - // fprintf(gf[4], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); - // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[4], "%d ", skips[ii] ? 1 : 0); - // fprintf(gf[4], "\n"); + // fprintf(gf[3], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); + // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[3], "%d ", skips[ii] ? 1 : 0); + // fprintf(gf[3], "\n"); // fprintf(gf[0], "%ld %d\t", sd.rid, sd.read_len); // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0); @@ -290,7 +292,7 @@ int SerialBQSR(AuxVar &aux) { collapseQualityScoreTableToReadGroupTable(recalTables.readGroupTable, recalTables.qualityScoreTable); roundTableValues(recalTables); -#if 1 +#if 0 printRecalTables(recalTables); #endif @@ -304,8 +306,8 @@ int SerialBQSR(AuxVar &aux) { } // 多线程处理bam数据, tmd是乱序的? -// static void thread_worker(void* data, long idx, int tid, int steal) { -static void thread_worker(void* data, long idx, int tid) { +static void thread_worker(void* data, long idx, int tid, int steal) { +// static void thread_worker(void* data, long idx, int tid) { AuxVar& aux = (*(vector*)data)[tid]; auto& readCovariates = aux.readCovariates; RecalTables& recalTables = aux.recalTables; @@ -315,7 +317,7 @@ static void thread_worker(void* data, long idx, int tid) { StableArray&snpErrors = aux.snpErrors, &insErrors = aux.insErrors, &delErrors = aux.delErrors; StableArray& skips = aux.skips; // 该位置是否是已知位点 auto &bams = *aux.bamArr; - // if (steal) for (auto& vcf : aux.vcfArr) vcf.knownSites.clear(); + if (steal) for (auto& vcf : aux.vcfArr) vcf.knownSites.clear(); #if 1 int startIdx = idx * aux.BAM_BLOCK_NUM; int stopIdx = std::min((size_t)(idx + 1) * aux.BAM_BLOCK_NUM, bams.size()); @@ -405,7 +407,7 @@ int ParallelBQSR(vector& auxArr) { spdlog::info("{} reads processed in {} round", readNum, round); #if 1 - kt_for_no_steal(auxArr.size(), thread_worker, &auxArr, (readNum + AuxVar::BAM_BLOCK_NUM - 1) / AuxVar::BAM_BLOCK_NUM); + kt_for_steal(auxArr.size(), thread_worker, &auxArr, (readNum + AuxVar::BAM_BLOCK_NUM - 1) / AuxVar::BAM_BLOCK_NUM); #else kt_for_steal(auxArr.size(), thread_worker, &auxArr, auxArr.size()); #endif @@ -514,6 +516,7 @@ static void globalInit() { RecalDatum::StaticInit(); QualityUtils::StaticInit(); MathUtils::StaticInit(); + BaseUtils::StaticInit(); // 初始化需要计算的event types RecalUtils::initEventTypes(nsgv::gBqsrArg.computeIndelBQSRTables); diff --git a/src/bqsr/recal_funcs.h b/src/bqsr/recal_funcs.h index c5808c8..41a53f1 100644 --- a/src/bqsr/recal_funcs.h +++ b/src/bqsr/recal_funcs.h @@ -37,7 +37,7 @@ struct RecalFuncs { PROF_START(read_ref); read_ref_base(aux, interval.left, interval); PROF_GP_END(read_ref); - string refBases(aux.ref_seq); + const char *refBases = aux.ref_seq; // 2. 遍历cigar,计算每个碱基是否是SNP或Indel PROF_START(calc_snp); @@ -48,7 +48,7 @@ struct RecalFuncs { if (c == 'M' || c == '=' || c == 'X') { for (int j = 0; j < len; ++j) { // 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip - int snpInt = sd.bases[readPos] == refBases[refPos] ? 0 : 1; + int snpInt = BaseUtils::basesAreEqual(sd.bases[readPos], refBases[refPos]) ? 0 : 1; isSNP[readPos] = snpInt; nEvents += snpInt; readPos++; diff --git a/src/util/base_utils.cpp b/src/util/base_utils.cpp index 01ca770..01b9d00 100644 --- a/src/util/base_utils.cpp +++ b/src/util/base_utils.cpp @@ -1,3 +1,4 @@ #include "base_utils.h" -const char BaseUtils::cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; \ No newline at end of file +const char BaseUtils::cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; +int BaseUtils::baseIndexMap[256]; \ No newline at end of file diff --git a/src/util/base_utils.h b/src/util/base_utils.h index d179b82..6919f86 100644 --- a/src/util/base_utils.h +++ b/src/util/base_utils.h @@ -8,7 +8,28 @@ */ #pragma once +#include + struct BaseUtils { + + static void StaticInit() { + // init baseIndexMap + for (int i = 0; i < 256; ++i) { + baseIndexMap[i] = -1; + } + baseIndexMap['A'] = 0; + baseIndexMap['a'] = 0; + baseIndexMap['*'] = 0; + baseIndexMap['C'] = 1; + baseIndexMap['c'] = 1; + baseIndexMap['G'] = 2; + baseIndexMap['g'] = 2; + baseIndexMap['T'] = 3; + baseIndexMap['t'] = 3; + } + + static int baseIndexMap[256]; + // uint8_t转碱基字符 static const char cBaseToChar[16]; @@ -17,4 +38,19 @@ struct BaseUtils { // 该操作符是否消耗参考基因组的碱基 static bool consumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; } + + /** + * Converts a simple base to a base index + * + * @param base [AaCcGgTt] + * @return 0, 1, 2, 3, or -1 if the base can't be understood + */ + static inline int simpleBaseToBaseIndex(const char base) { + // we want to make sure we treat the base as an unsigned byte....so that we can access this array with it. + return baseIndexMap[(uint8_t)base]; + } + + static bool basesAreEqual(const char base1, const char base2) { + return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2); + } }; \ No newline at end of file