diff --git a/CMakeLists.txt b/CMakeLists.txt index 541c84f..a7a4153 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,5 +6,5 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) # set(CMAKE_BUILD_TYPE Debug) # set(CMAKE_BUILD_TYPE Release) add_definitions(-DSHOW_PERF) -#add_definitions(-DSHOW_PERF=1) +# add_definitions(-DSHOW_PERF=1) add_subdirectory(src) diff --git a/README.md b/README.md index 80a7c98..8d3ff4d 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,6 @@ 对GATK bqsr部分进行并行优化 +对应的GATK版本:The Genome Analysis Toolkit (GATK) v4.6.2.0-17-g2a1f41b-SNAPSHOT + Implement the same functions as GATK for BaseRecalibrator and ApplyBQSR \ No newline at end of file diff --git a/src/bqsr/apply_bqsr_entry.cpp b/src/bqsr/apply_bqsr_entry.cpp index e69de29..fa9dec0 100644 --- a/src/bqsr/apply_bqsr_entry.cpp +++ b/src/bqsr/apply_bqsr_entry.cpp @@ -0,0 +1,336 @@ +// 一. 读取并解析recalibrate tables + +// 二. 循环处理每一个read + +// 1. 计算read的协变量covs + +// 2. 根据read的协变量信息从recal tables获取对应的数据 + +// 3. 根据recal tables数据对read的每个碱基分别计算新的质量分数 + +// 4. 将计算后的质量分数赋值给read + +#include // in htslib +#include +#include + +#include "aux_arg.h" +#include "common_data.h" +#include "util/debug.h" +#include "util/profiling.h" +#include "recal_utils.h" +#include "recal_funcs.h" +#include "read_utils.h" + +using std::vector; + +namespace nsgv { +// 全局变量 for apply bqsr +samFile* gOutBamFp; // 输出文件, sam或者bam格式 +sam_hdr_t* gOutBamHeader; // 输出文件的header +RecalTables gRecalTables; // 保留一个全局的recalibrate tables就行了 +vector gQuantizedQuals; // 读取quantized info table信息得到的,第三列数据 +StableArray gRecalBams[2]; // 保存已经校正过质量分数的bam数据,双buffer +}; // namespace nsgv + +// 串行apply bqsr +int SerialApplyBQSR(AuxVar &aux) { + BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO); + inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::applyBqsrReadFilterOut); + int64_t readNumSum = 0; + int round = 0; + + PerReadCovariateMatrix& readCovariates = aux.readCovariates; + // 一. 读取并解析recalibrate tables + auto& recalTables = *aux.bqsrTable; + + // 全局的校正后的bam数组 + auto& recalBams = nsgv::gRecalBams[0]; + + auto& sd = aux.sd; + + while (true) { + ++round; + // 一. 读取bam数据 + size_t readNum = 0; + PROF_START(GP_read); + if (inBamBuf.ReadStat() >= 0) + readNum = inBamBuf.ReadBam(); + PROF_GP_END(GP_read); + if (readNum < 1) { + break; + } + auto bams = inBamBuf.GetBamArr(); + spdlog::info("{} reads processed in {} round", readNum, round); + + if (recalBams.size() < bams.size()) { + int start = recalBams.size(); + recalBams.resize(bams.size()); + for (int i = start; i < recalBams.size(); ++i) { + recalBams[i] = bam_init1(); + } + } + + // 二. 遍历每个bam(read)记录,进行处理 + for (int i = 0; i < bams.size(); ++i) { + if (bam_copy1(recalBams[i], bams[i]->b) == NULL) { + spdlog::error("Copy bam error"); + exit(1); + } + BamWrap* bw = bams[i]; + bw->b = recalBams[i]; // 注意这里的赋值,然后就可以对b进行更改了 + sd.init(); + sd.parseForApplyBQSR(bw); + sd.rid = i + aux.processedReads; + + // 1. 是否使用original quality来代替当前的quality + if (nsgv::gBqsrArg.useOriginalBaseQualities) + ReadUtils::resetOriginalBaseQualities(bw->b); + + // 2. 是否将当前的quality保存在tag OQ中 + if (nsgv::gBqsrArg.emitOriginalQuals) + ReadUtils::setOriginalBaseQualsIfNoOQ(bw->b); + + // 3. 计算read的协变量covs + PROF_START(GP_covariate); + CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0); + PROF_GP_END(GP_covariate); + + // clear indel qualities + ReadUtils::removeAttribute(bw->b, "BI"); + ReadUtils::removeAttribute(bw->b, "BD"); + + // 4. 检查read的readGroup tag是否包含在bqsr table里 + const char* readGroupId = ReadUtils::getReadGroupId(bw->b); + auto& covaritesForRead = readCovariates[EventType::BASE_SUBSTITUTION.index]; + uint8_t* recalibratedQuals = bam_get_qual(bw->b); + auto& preUpdateQuals = sd.base_quals; + int rgKey = -1; + if (ReadGroupCovariate::RgToId.find(std::string(readGroupId)) != ReadGroupCovariate::RgToId.end()) + rgKey = ReadGroupCovariate::RgToId[std::string(readGroupId)]; + if (rgKey == -1) { + if (nsgv::gBqsrArg.allowMissingReadGroups) { + // Given the way the recalibration code is implemented below, we cannot recalibrate a read with a + // read group that's not in the recal table. + for (int i = 0; i < sd.read_len; i++) { + //recalibratedQuals[i] = staticQuantizedMapping != null ? staticQuantizedMapping[preUpdateQuals[i]] : quantizedQuals.get(preUpdateQuals[i]); + recalibratedQuals[i] = nsgv::gQuantizedQuals[preUpdateQuals[i]]; + } + } else { + spdlog::error( + "Read group {} not found in the recalibration table. Use \"--allow-missing-read-group\" command line argument to ignore this " + "error.", + readGroupId); + exit(1); + } + } + + // 5. 根据recal tables数据对read的每个碱基分别计算新的质量分数 + auto& readGroupDatum = recalTables.readGroupTable(EventType::BASE_SUBSTITUTION.index, rgKey); + // Note: this loop is under very heavy use in applyBQSR. Keep it slim. + for (int offset = 0; offset < sd.read_len; offset++) { // recalibrate all bases in the read + // only recalibrate usable qualities (default: >= 6) (the original quality will come from the instrument -- reported quality) + if (recalibratedQuals[offset] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN) { + continue; + } + auto& covs = readCovariates[EventType::BASE_SUBSTITUTION.index][offset]; + // 根据read的协变量数据获取对应的bqsr数据 + auto& qualityScoreDatum = recalTables.qualityScoreTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality); + auto& contextDatum = recalTables.contextTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.context); + auto& cycleDatum = recalTables.cycleTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.cycle); + // 计算校正后的质量分数 + double priorQualityScore = + nsgv::gBqsrArg.globalQScorePrior > 0.0 ? nsgv::gBqsrArg.globalQScorePrior : readGroupDatum.getReportedQuality(); + double rawRecalibratedQualityScore = + RecalFuncs::hierarchicalBayesianQualityEstimate(priorQualityScore, readGroupDatum, qualityScoreDatum, contextDatum, cycleDatum); + uint8_t qualIdx = RecalFuncs::getBoundedIntegerQual(rawRecalibratedQualityScore); + uint8_t quantizedQualityScore = qualIdx; // nsgv::gQuantizedQuals.at(qualIdx); + // TODO: as written the code quantizes *twice* if the static binning is enabled (first time to the dynamic bin). It should be + // quantized once. + // recalibratedQuals[offset] = staticQuantizedMapping == null ? quantizedQualityScore : staticQuantizedMapping[quantizedQualityScore]; + recalibratedQuals[offset] = quantizedQualityScore; + } + + if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) { + spdlog::error("failed writing sam record to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE.c_str()); + sam_close(nsgv::gInBamFp); + sam_close(nsgv::gOutBamFp); + exit(1); + } + //break; + } + // break; + + readNumSum += readNum; + AuxVar::processedReads += readNum; + inBamBuf.ClearAll(); // 清空bam buf + } + + spdlog::info("read count: {}", readNumSum); + + return 0; +} + +// 并行 apply bqsr +int ParallelApplyBQSR(vector &auxArr) { + + return 0; +} + +// 在进行数据处理之前,初始化一些全局数据 +static void globalInit() { + open_debug_files(); + /* bam */ + nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr); + if (!nsgv::gInBamFp) { + spdlog::error("[{}] load sam/bam file failed.\n", __func__); + exit(1); + } + hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); + nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header + + /* 并行读取bam数据 */ + htsThreadPool htsPoolRead = {NULL, 0}; // + int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD); + htsPoolRead.pool = hts_tpool_init(readThreadNum); + if (!htsPoolRead.pool) { + spdlog::error("[{}] failed to set up thread pool", __LINE__); + sam_close(nsgv::gInBamFp); + exit(1); + } + hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead); + + if (!nsgv::gInBamHeader->hrecs) { + if (sam_hdr_fill_hrecs(nsgv::gInBamHeader) != 0) { + spdlog::error("[{}] failed to read sam header", __LINE__); + sam_close(nsgv::gInBamFp); + exit(1); + } + } + /* 并行写入bam文件 */ + char modeout[12] = "wb"; + sam_open_mode(modeout + 1, nsgv::gBqsrArg.OUTPUT_FILE.c_str(), NULL); + nsgv::gOutBamFp = sam_open(nsgv::gBqsrArg.OUTPUT_FILE.c_str(), modeout); + if (!nsgv::gOutBamFp) { + spdlog::error("[{}] create output sam/bam file failed.\n", __func__); + exit(1); + } + nsgv::gOutBamHeader = sam_hdr_dup(nsgv::gInBamHeader); + // 修改输出文件的header + sam_hdr_add_line(nsgv::gOutBamHeader, "PG", "ID", PROGRAM_NAME, "VN", FASTBQSR_VERSION, "CL", nsgv::gBqsrArg.CLI_STR.c_str(), NULL); + // 用不同的线程数量处理输出文件 + htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池 + int writeThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_WRITE_MAX_THREAD); + htsPoolWrite.pool = hts_tpool_init(writeThreadNum); + if (!htsPoolWrite.pool) { + spdlog::error("[{}] failed to set up thread pool", __LINE__); + sam_close(nsgv::gInBamFp); + exit(1); + } + hts_set_opt(nsgv::gOutBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); + hts_set_opt(nsgv::gOutBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); + + if (sam_hdr_write(nsgv::gOutBamFp, nsgv::gOutBamHeader) != 0) { + spdlog::error("failed writing header to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE); + sam_close(nsgv::gOutBamFp); + sam_close(nsgv::gInBamFp); + exit(1); + } + + // 输出index文件 + nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".bai"; // min_shift = 0 是bai格式 + if ("sam" == Utils::getFileExtension(nsgv::gBqsrArg.OUTPUT_FILE) || !nsgv::gBqsrArg.CREATE_INDEX) { + nsgv::gBqsrArg.INDEX_FILE = ""; + } + if (!nsgv::gBqsrArg.INDEX_FILE.empty()) { + int index_min_shift = 0; + if (nsgv::gBqsrArg.INDEX_FORMAT == nsbqsr::IndexFormat::CSI) { + nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".csi"; + index_min_shift = 14; + } + if (sam_idx_init(nsgv::gOutBamFp, nsgv::gOutBamHeader, 0 /*csi 14*/, nsgv::gBqsrArg.INDEX_FILE.c_str()) < 0) { + spdlog::error("failed to open index \"{}\" for writing", nsgv::gBqsrArg.INDEX_FILE); + sam_close(nsgv::gOutBamFp); + sam_close(nsgv::gInBamFp); + exit(1); + } + } + + // 1. 协变量数据相关初始化 + ContextCovariate::InitContextCovariate(nsgv::gBqsrArg); + CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg); + + // 注意初始化顺序,这个必须在协变量初始化之后,因为需要用到MaximumKeyValue + // nsgv::gRecalTables.init(nsgv::gInBamHeader->hrecs->nrg); + + // 初始化AuxVar + nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS); + for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) { + nsgv::gAuxVars[i].header = nsgv::gInBamHeader; + nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str()); + if (nsgv::gAuxVars[i].faidx == 0) + error("[%s] fail to load the fasta index.\n", __func__); + // 注意初始化顺序,这个必须在协变量初始化之后,因为需要用到MaximumKeyValue + nsgv::gAuxVars[i].bqsrTable = &nsgv::gRecalTables; + CovariateUtils::InitPerReadCovMat(nsgv::gAuxVars[i].readCovariates); + } + + // 0. 初始化一些全局数据 + RecalDatum::StaticInit(); + QualityUtils::StaticInit(); + MathUtils::StaticInit(); + BaseUtils::StaticInit(); + + // 初始化需要计算的event types + RecalUtils::initEventTypes(nsgv::gBqsrArg.computeIndelBQSRTables); + + // 2. 读取bam的read group + // if (nsgv::gInBamHeader->hrecs->nrg == 0) { + // spdlog::error("No RG tag found in the header!"); + // exit(1); + // } + + /* + // 应该是从bqsr table里读取read group信息 + for (int i = 0; i < nsgv::gInBamHeader->hrecs->nrg; ++i) { + // spdlog::info("rg: {}", nsgv::gInBamHeader->hrecs->rg[i].name); + ReadGroupCovariate::RgToId[nsgv::gInBamHeader->hrecs->rg[i].name] = i; + ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name; + } + */ + + // 读取并解析recalibrate tables + RecalUtils::readRecalTables(nsgv::gBqsrArg.BQSR_FILE, nsgv::gBqsrArg, nsgv::gQuantizedQuals, nsgv::gRecalTables); +} + +// 全局资源释放 +static void globalDestroy() { + close_debug_files(); + + if (!nsgv::gBqsrArg.INDEX_FILE.empty() && sam_idx_save(nsgv::gOutBamFp) < 0) { + spdlog::error("writing index failed"); + sam_close(nsgv::gOutBamFp); + sam_close(nsgv::gInBamFp); + exit(1); + } + + /* 关闭文件,收尾清理 */ + sam_close(nsgv::gOutBamFp); + sam_close(nsgv::gInBamFp); +} + +// entrance of BQSR phase-1 +int ApplyBQSR() { + int ret = 0; + + PROF_START(GP_whole_process); + globalInit(); +// if (nsgv::gBqsrArg.NUM_THREADS == 1) + ret = SerialApplyBQSR(nsgv::gAuxVars[0]); // 串行处理数据,生成recal bams +// else +// ret = ParallelApplyBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal bams + globalDestroy(); + PROF_GP_END(GP_whole_process); + + return ret; +} \ No newline at end of file diff --git a/src/bqsr/aux_arg.h b/src/bqsr/aux_arg.h index 1b48a4d..6541437 100644 --- a/src/bqsr/aux_arg.h +++ b/src/bqsr/aux_arg.h @@ -43,11 +43,14 @@ struct AuxVar { vector vcfArr; // 从vcf中获取已知位点 PerReadCovariateMatrix readCovariates; // 每个read对应的协变量矩阵 - RecalTables recalTables; // 每个线程对应一个recal tables + RecalTables recalTables; // 每个线程对应一个recal tables,计算table的时候需要每个线程都有一个,输出bam的时候不需要,因为只读取,不写入 SamData sd; StableArray isSNP, isIns, isDel; // 该位置是否是SNP, indel位置,0不是,1是 StableArray baqArray; StableArray snpErrors, insErrors, delErrors; StableArray skips; // 该位置是否是已知位点 + + // only for apply bqsr + RecalTables* bqsrTable; // 保留一份就够 }; \ No newline at end of file diff --git a/src/bqsr/bqsr_args.h b/src/bqsr/bqsr_args.h index feb4299..8971bf8 100644 --- a/src/bqsr/bqsr_args.h +++ b/src/bqsr/bqsr_args.h @@ -11,6 +11,8 @@ Date : 2025/10/10 #include #include +#include "qual_utils.h" + using std::string; using std::vector; @@ -36,19 +38,19 @@ struct BQSRArg { /* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */ bool CREATE_INDEX = true; + string INDEX_FILE; + nsbqsr::IndexFormat INDEX_FORMAT = nsbqsr::IndexFormat::BAI; /* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/ bool ADD_PG_TAG_TO_READS = true; - // + // 命令行字符串 string CLI_STR; - // + // 开始运行时间 string START_TIME; - string PROGRAM_RECORD_ID = "FastBQSR"; - // reference file string REFERENCE_FILE; @@ -137,14 +139,6 @@ struct BQSRArg { */ double BAQGOP = 40; - /** - * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in - * the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since base callers use these - * values to indicate random or bad bases. For example, Illumina writes Q2 bases when the machine has really gone wrong. - * This would be fine in and of itself, but when you select a subset of these reads based on their ability to align to the - * reference and their dinucleotide effect, your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. - */ - int PRESERVE_QSCORES_LESS_THAN = 6; /** * enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了,用不到了? @@ -152,7 +146,7 @@ struct BQSRArg { bool enableBAQ = false; /** - * compute-indel-bqsr-tables, compute indel BQSR tables" + * compute-indel-bqsr-tables, compute indel BQSR tables" hidden */ bool computeIndelBQSRTables = false; @@ -162,16 +156,93 @@ struct BQSRArg { // // -------------------------------------------------------------------------------------------------------------- - /** - * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which - * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ - * tag is present for a read, the standard qual score will be used. - */ - bool useOriginalBaseQualities = false; /** * If reads are missing some or all base quality scores, this value will be used for all base quality scores. * By default this is set to -1 to disable default base quality assignment. */ int8_t defaultBaseQualities = -1; + + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + // args for apply bqsr + string BQSR_FILE; + + /** + * Turns on the base quantization module. It requires a recalibration report. + * + * A value of 0 here means "do not quantize". + * Any value greater than zero will be used to recalculate the quantization using that many levels. + * Negative values mean that we should quantize using the recalibration report's quantization level. + */ + // @Argument(fullName = "quantize-quals", doc = "Quantize quality scores to a given number of levels", optional = true) + int quantizationLevels = 0; + + /** + * Static quantized quals are entirely separate from the quantize_qual option which uses dynamic binning. + * The two types of binning should not be used together. + * + * For example, the Warp germline pipeline uses the static bins { 10, 20, 30, 40 } + */ + //@Advanced @Argument(fullName = static-quantized-quals, + // doc = "Use static quantized quality scores to a given number of levels (with -" + + // StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME + ")", optional = true, mutex = "quantize-quals") + vector staticQuantizationQuals; + + /** + * Round down quantized only works with the static_quantized_quals option, and should not be used with + * the dynamic binning option provided by quantize_quals. When roundDown = false, rounding is done in + * probability space to the nearest bin. When roundDown = true, the value is rounded to the nearest bin + * that is smaller than the current bin. + */ + // @Advanced @Argument(fullName = "round-down-quantized", doc = "Round quals down to nearest quantized qual", optional = true, + // mutex = "quantize-quals") public + bool roundDown = false; + + /** + * The tool is capable of writing out the original quality scores of each read in the recalibrated output file + * under the "OQ" tag. By default, this behavior is disabled because emitting original qualities results in a + * significant increase of the file size. Use this flag to turn on emission of original qualities. + */ + //@Argument(fullName = "emit-original-quals", doc = "Emit original base qualities under the OQ tag", + // optional = true) + bool emitOriginalQuals = false; + + /** + * If specified, the value of this argument will be used as a flat prior for all mismatching quality scores instead + * of the reported quality score (assigned by the sequencer). + */ + // @Argument(fullName = "global-qscore-prior", doc = "Global Qscore Bayesian prior to use for BQSR", + // optional = true) + double globalQScorePrior = -1.0; + + /** + * If set to true, do not throw an error upon encountering a read with a read group that's not in the recalibration table. + * Instead, simply set the quantized original base qualities as the recalibrated base qualities. + */ + // @Argument(fullName = allow-missing-read-group, doc = "Do not throw an error when encountering a read group not in the recal table", + // optional = true) + bool allowMissingReadGroups = false; + + /** + * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out + * unmodified in the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since + * base callers use these values to indicate random or bad bases. For example, Illumina writes Q2 bases when the + * machine has really gone wrong. This would be fine in and of itself, but when you select a subset of these reads + * based on their ability to align to the reference and their dinucleotide effect, your Q2 bin can be elevated to + * Q8 or Q10, leading to issues downstream. + */ + // @Argument(fullName = "preserve-qscores-less-than", doc = "Don't recalibrate bases with quality scores less than this threshold", optional = true, + // minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) + int PRESERVE_QSCORES_LESS_THAN = QualityUtils::MIN_USABLE_Q_SCORE; + + /** + * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which + * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ + * tag is present for a read, the standard quality score will be used. + */ + // @Argument(fullName = use-original-qualities, shortName = "OQ", doc = "Use the base quality scores from the OQ tag", + // optional = true) + bool useOriginalBaseQualities = false; }; \ No newline at end of file diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index ee2d591..cce2f8b 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -1,13 +1,12 @@ /* -Description: -bam,bam,bam +Description: bqsr的第一阶段,生成协变量统计表的程序入口 Copyright : All right reserved by ICT Author : Zhang Zhonghai -Date : 2023/10/23 +Date : 2025/11/23 */ -#include +#include // in htslib #include #include #include @@ -44,11 +43,10 @@ Date : 2023/10/23 #include "util/stable_array.h" #include "util/utils.h" #include "util/vcf_parser.h" +#include "common_data.h" using std::deque; -#define BAM_BLOCK_SIZE 16L * 1024 * 1024 // 16M - namespace nsgv { // 全局变量 for bqsr @@ -58,29 +56,6 @@ sam_hdr_t* gInBamHeader; // input BAM header vector gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个 }; // namespace nsgv - -// 过滤掉bqsr过程不符合要求的bam数据 -bool bqsrReadFilterOut(const bam1_t *b) { - // 过滤掉unmapped的read - if (b->core.qual == 0) // mapping quality 0 - return true; - if (b->core.qual == 255) // mapping quality not available - return true; - if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped - return true; - } - if (b->core.flag & BAM_FSECONDARY) { // secondary alignment - return true; - } - if (b->core.flag & BAM_FDUP) { // secondary alignment - return true; - } - if (b->core.flag & BAM_FQCFAIL) { // Not passing quality controls - return true; - } - return false; -} - // 数据总结 void collapseQualityScoreTableToReadGroupTable(Array2D &byReadGroupTable, Array3D &byQualTable) { // 遍历quality table @@ -137,7 +112,7 @@ static void printRecalTables(const RecalTables& rt) { // 串行bqsr int SerialBQSR(AuxVar &aux) { BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO); - inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut); + inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut); int64_t readNumSum = 0; int round = 0; @@ -227,7 +202,7 @@ int SerialBQSR(AuxVar &aux) { // 8. 计算这条read对应的协变量 PROF_START(GP_covariate); - CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, 0); + CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0); PROF_GP_END(GP_covariate); // fprintf(gf[4], "%ld %d\n", sd.rid, sd.read_len); @@ -251,7 +226,7 @@ int SerialBQSR(AuxVar &aux) { // 9. 计算这条read需要跳过的位置 PROF_START(GP_read_vcf); - RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, 0); + RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, RecalFuncs::MAX_SITES_INTERVAL, skips, 0); for (int ii = 0; ii < sd.read_len; ++ii) { skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; @@ -336,6 +311,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) { int stopIdx = std::min((size_t)(idx + 1) * blockReadNums, bams.size()); #endif // spdlog::info("tid {}, index {}, steal {}", tid, idx, steal); + // spdlog::info("interval span: {}", bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos()); + int sitesStride = bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos(); + // sitesStride = sitesStride >= RecalFuncs::MAX_SITES_INTERVAL ? sitesStride : RecalFuncs::MAX_SITES_INTERVAL; + sitesStride = RecalFuncs::MAX_SITES_INTERVAL; aux.threadProcessedReads += stopIdx - startIdx; for (int i = startIdx; i < stopIdx; ++i) { // spdlog::info("Thread {} processing read idx: {}", tid, i); @@ -364,10 +343,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) { if (!baqCalculated) continue; PROF_START(TP_covariate); - CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, thid); + CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, thid); PROF_TP_END(TP_covariate); - RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, thid); + RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, sitesStride, skips, thid); for (int ii = 0; ii < sd.read_len; ++ii) { skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; @@ -390,7 +369,7 @@ static void thread_worker(void* data, long idx, int thid, int steal) { // 并行bqsr int ParallelBQSR(vector& auxArr) { BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO); - inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut); + inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut); int64_t readNumSum = 0; int round = 0; @@ -484,8 +463,8 @@ static void globalInit() { /* 并行读取bam数据 */ htsThreadPool htsPoolRead = {NULL, 0}; // - int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, 8); - htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS); + int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD); + htsPoolRead.pool = hts_tpool_init(readThreadNum); if (!htsPoolRead.pool ) { spdlog::error("[{}] failed to set up thread pool", __LINE__); sam_close(nsgv::gInBamFp); @@ -549,9 +528,10 @@ static void globalInit() { // 全局资源释放 static void globalDestroy() { close_debug_files(); + sam_close(nsgv::gInBamFp); } -// entrance of mark BQSR +// entrance of BQSR phase-1 int BaseRecalibrator() { int ret = 0; @@ -562,7 +542,6 @@ int BaseRecalibrator() { else ret = ParallelBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal table globalDestroy(); - sam_close(nsgv::gInBamFp); PROF_GP_END(GP_whole_process); return ret; diff --git a/src/bqsr/common_data.h b/src/bqsr/common_data.h new file mode 100644 index 0000000..38ba542 --- /dev/null +++ b/src/bqsr/common_data.h @@ -0,0 +1,30 @@ +/* + Description: 共用的一些宏定义,全局变量等 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2026/01/01 +*/ + +#pragma once + +#include "bqsr_args.h" +#include +#include "aux_arg.h" +#include "fastbqsr_version.h" + +#define BAM_BLOCK_SIZE 16L * 1024 * 1024 // 16M +#define BAM_READ_MAX_THREAD 8 +#define BAM_WRITE_MAX_THREAD 16 + +#define PROGRAM_NAME "FastBQSR" + +namespace nsgv { + +// 全局变量 for bqsr +extern BQSRArg gBqsrArg; // bqsr arguments +extern samFile* gInBamFp; // input BAM file pointer +extern sam_hdr_t* gInBamHeader; // input BAM header +extern vector gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个 +}; // namespace nsgv diff --git a/src/bqsr/covariate.cpp b/src/bqsr/covariate.cpp index 6a7e121..266c632 100644 --- a/src/bqsr/covariate.cpp +++ b/src/bqsr/covariate.cpp @@ -250,19 +250,20 @@ void ContextCovariate::GetStrandedClippedBytes(SamData& sd, StableArray& c * @param end the end position in the array (exclusive) * @return the key representing the dna sequence */ -int ContextCovariate::KeyFromContext(const StableArray& dna, const int start, const int end) { - int key = end - start; - int bitOffset = LENGTH_BITS; - for (int i = start; i < end; i++) { - const int baseIndex = baseIndexMap[dna[i] & 0xff]; - if (baseIndex == -1) { // ignore non-ACGT bases - return -1; - } - key |= (baseIndex << bitOffset); - bitOffset += 2; - } - return key; -} +// template // 好像是类的模板函数必须在头文件里实现 +// int ContextCovariate::KeyFromContext(const Arr& dna, const int start, const int end) { +// int key = end - start; +// int bitOffset = LENGTH_BITS; +// for (int i = start; i < end; i++) { +// const int baseIndex = baseIndexMap[dna[i] & 0xff]; +// if (baseIndex == -1) { // ignore non-ACGT bases +// return -1; +// } +// key |= (baseIndex << bitOffset); +// bitOffset += 2; +// } +// return key; +// } /** * For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context). @@ -334,6 +335,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const StableArray& bas void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { const int originalReadLength = sd.read_len; + const char* qname = bam_get_qname(sd.bw->b); // store the original bases and then write Ns over low quality ones auto &strandedClippedBases = sd.strandedClippedBases; strandedClippedBases.copy(sd.bases); @@ -348,13 +350,13 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar // since the context covariate may not span the entire set of values in read covariates // due to the clipping of the low quality bases // 这段代码应该不会执行,因为clip with N不会改变read长度 - if (readLengthAfterClipping != originalReadLength) { - // don't bother zeroing out if we are going to overwrite the whole array - for (int i = 0; i < originalReadLength; i++) { - // this base has been clipped off, so zero out the covariate values here - CovariateUtils::SetContext(0, 0, 0, i, values); - } - } + // if (readLengthAfterClipping != originalReadLength) { + // // don't bother zeroing out if we are going to overwrite the whole array + // for (int i = 0; i < originalReadLength; i++) { + // // this base has been clipped off, so zero out the covariate values here + // CovariateUtils::SetContext(0, 0, 0, i, values); + // } + // } const bool negativeStrand = sd.bw->GetReadNegativeStrandFlag(); // Note: duplicated the loop to avoid checking recordIndelValues on each iteration diff --git a/src/bqsr/covariate.h b/src/bqsr/covariate.h index 800ac19..a25b769 100644 --- a/src/bqsr/covariate.h +++ b/src/bqsr/covariate.h @@ -27,7 +27,7 @@ using std::vector; // 协变量的值, 4个协变量 struct CovariateValues { - int readGroup = 0; + int readGroup = 0; // 默认是read group 0 int baseQuality = 0; int context = -1; int cycle = -1; @@ -61,6 +61,16 @@ struct EventType { static EventTypeValue BASE_INSERTION; static EventTypeValue BASE_DELETION; static vector EVENTS; + static int GetIndexForEventRep(char eventType) { + if (eventType == BASE_SUBSTITUTION.representation) { + return BASE_SUBSTITUTION.index; + } else if (eventType == BASE_INSERTION.representation) { + return BASE_DELETION.index; + } else if (eventType == BASE_DELETION.representation) { + return BASE_DELETION.index; + } + return 0; + } }; // Read group协变量 @@ -201,7 +211,22 @@ struct ContextCovariate { // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) static void GetStrandedClippedBytes(SamData& ad, StableArray& clippedBases, uint8_t lowQTail); // Creates a int representation of a given dna string. - static int KeyFromContext(const StableArray& dna, const int start, const int end); + template + static int KeyFromContext(const Arr& dna, const int start, const int end) { + int key = end - start; + int bitOffset = LENGTH_BITS; + for (int i = start; i < end; i++) { + const int baseIndex = baseIndexMap[dna[i] & 0xff]; + if (baseIndex == -1) { // ignore non-ACGT bases + return -1; + } + key |= (baseIndex << bitOffset); + bitOffset += 2; + } + return key; + } + + static int KeyFromContext(const string& dna) { return KeyFromContext(dna, 0, dna.size()); } // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context). static void GetReadContextAtEachPosition(const StableArray& bases, const int contextSize, const int mask, StableArray& keys); @@ -219,6 +244,7 @@ struct CycleCovariate { static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; } + static int KeyFromCycle(const int cycle) { return KeyFromCycle(cycle, MAXIMUM_CYCLE_VALUE); } /** * Encodes the cycle number as a key. */ diff --git a/src/bqsr/read_utils.h b/src/bqsr/read_utils.h new file mode 100644 index 0000000..0d817a8 --- /dev/null +++ b/src/bqsr/read_utils.h @@ -0,0 +1,71 @@ +/* + Description: apply bqsr过程中对sam的一些修改函数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2026/01/02 +*/ +#pragma once + +#include +#include + +struct ReadUtils { + // sam/bam文件中的quality score要在原来的quality上加33,应该是映射到字符区,方便用字母表示 + // htslib解析之后已经减掉了,就是真实的质量分数 + static constexpr int QUALITY_SCORE_ADD_IN_FILE = 33; + /** + * Resets the quality scores of the reads to the orginal (pre-BQSR) ones. + */ + static void resetOriginalBaseQualities(bam1_t *b) { + uint8_t* oq = bam_aux_get(b, "OQ"); + char* oqVal = nullptr; + if (oq) + oqVal = bam_aux2Z(oq); + int crg = 0; + if (oqVal == nullptr) { + } else { + uint8_t* quals = bam_get_qual(b); + for (int i = 0; i < b->core.l_qseq; ++i) { + quals[i] = oqVal[i] - QUALITY_SCORE_ADD_IN_FILE; // 要减掉33 + } + } + } + + // get read group id + static const char* getReadGroupId(bam1_t *b) { + uint8_t* rgStr = bam_aux_get(b, "RG"); + char* rgVal = nullptr; + if (rgStr) + rgVal = bam_aux2Z(rgStr); + return rgVal; + } + + // 将当前quals保存在OQ tag中 (如果当前read不存在OQ tag的情况下) + static void setOriginalBaseQualsIfNoOQ(bam1_t *b) { + uint8_t* tagData = bam_aux_get(b, "OQ"); + if (tagData) + return; + char* tagVal = nullptr; + const char* oldQual = (char*)bam_get_qual(b); + string qual(b->core.l_qseq + 1, '\0'); + for (int i = 0; i < qual.size() - 1; ++i) { + qual[i] = oldQual[i] + QUALITY_SCORE_ADD_IN_FILE; + } + // bam_aux_append 最后一个字符必须是'\0' + if (bam_aux_append(b, "OQ", 'Z', qual.size(), (const uint8_t*)qual.c_str()) != 0) { + spdlog::error("Add OQ (original quality score) tag failed. "); + } + } + + // 移除给定的tag + static void removeAttribute(bam1_t *b, const string &tag) { + uint8_t* tagData = bam_aux_get(b, "OQ"); + if (!tagData) + return; + if (bam_aux_remove(b, tagData) == nullptr) { + spdlog::error("Remove tag {} failed. ", tag); + } + } +}; \ No newline at end of file diff --git a/src/bqsr/recal_datum.h b/src/bqsr/recal_datum.h index 76199cb..2339c25 100644 --- a/src/bqsr/recal_datum.h +++ b/src/bqsr/recal_datum.h @@ -147,14 +147,18 @@ struct RecalDatum { */ inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); } inline double getNumMismatches() const { return numMismatches / MULTIPLIER; } + inline void setNumMismatches(double val) { numMismatches = val * MULTIPLIER; } inline uint64_t getNumObservations() const { return numObservations; } + inline void setNumObservations(uint64_t val) { numObservations = val; } inline double getReportedQuality() const { return reportedQuality; } + inline void setReportedQuality(double val) { reportedQuality = val; } /** * Computes the empirical quality of the datum, using the reported quality as the prior. * @see #getEmpiricalQuality(double) below. */ double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); } + inline void setEmpiricalQuality(double val) { empiricalQuality = val; } /** * Computes the empirical base quality (roughly (num errors)/(num observations)) from the counts stored in this datum. diff --git a/src/bqsr/recal_funcs.h b/src/bqsr/recal_funcs.h index 6b23da0..bc81d8b 100644 --- a/src/bqsr/recal_funcs.h +++ b/src/bqsr/recal_funcs.h @@ -8,19 +8,55 @@ */ #pragma once -#include "util/stable_array.h" -#include "util/sam_data.h" +#include +#include + #include "bqsr/aux_arg.h" #include "util/bam_wrap.h" -#include "util/interval.h" -#include "util/vcf_parser.h" -#include "util/profiling.h" #include "util/debug.h" +#include "util/interval.h" +#include "util/profiling.h" +#include "util/sam_data.h" +#include "util/stable_array.h" +#include "util/vcf_parser.h" struct RecalFuncs { - static constexpr int MAX_SITES_INTERVAL = 100000; + //static constexpr int MAX_SITES_INTERVAL = 100000; + static constexpr int MAX_SITES_INTERVAL = 21500; // 经验值,这个数读取vcf和计算的性能最好 static constexpr uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@'; + // 过滤掉bqsr过程不符合要求的bam数据 + static bool bqsrReadFilterOut(const bam1_t* b) { + // 过滤掉unmapped的read + if (b->core.qual == 0) // mapping quality 0 + return true; + if (b->core.qual == 255) // mapping quality not available + return true; + if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped + return true; + } + if (b->core.flag & BAM_FSECONDARY) { // secondary alignment + return true; + } + if (b->core.flag & BAM_FDUP) { // secondary alignment + return true; + } + if (b->core.flag & BAM_FQCFAIL) { // Not passing quality controls + return true; + } + return false; + } + + // 过滤掉apply bqsr过程不符合要求的bam数据 + static bool applyBqsrReadFilterOut(const bam1_t* b) { + // 好像4.6.2的GATK版本的welformed filter啥也没过滤 + // b->core.flag & BAM_FUNMAP || + if ((b->core.tid == -1 || b->core.pos == -1) && !(b->core.flag & BAM_FMUNMAP)) { // unmapped + return true; + } + return false; + } + // 设置某个位置是indel static inline void updateIndel(StableArray& isIndel, int index) { if (index >= 0 && index < isIndel.size()) { @@ -94,18 +130,16 @@ struct RecalFuncs { } // 获取一行字符串 - static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* line) { - line->clear(); - if (*cur >= total) + static void get_line_from_buf(char* buf, int64_t total, int64_t* cur) { + if (*cur >= total) { + (*cur)++; return; - char b; - while (*cur < total && (b = buf[(*cur)++]) != '\n') { - line->push_back(b); } + while (*cur < total && buf[(*cur)++] != '\n'); } // 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取 - static void calculateKnownSites(SamData& sd, vector& vcfs, sam_hdr_t* samHdr, StableArray& knownSites, int thid) { + static void calculateKnownSites(SamData& sd, vector& vcfs, sam_hdr_t* samHdr, int sitesStride, StableArray& knownSites, int thid) { BamWrap* bw = sd.bw; int tid = bw->contig_id(); int64_t startPos = bw->start_pos(); // 闭区间,使用clip之前的read匹配的范围 @@ -113,7 +147,6 @@ struct RecalFuncs { knownSites.resize_fill(sd.read_len, 0); // update vcfs - // int idx = 0; PROF_START(TP_read_vcf); for (auto& vcf : vcfs) { if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam,不需要读取 @@ -130,7 +163,7 @@ struct RecalFuncs { vcf.knownSites.clear(); // 清空,因为后面会读入覆盖bam的所有vcf位点 // 读取新的interval int64_t fpos, flen; - endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos); + endPos = std::max(startPos + sitesStride, endPos); Interval readIntv(startPos, endPos); vcf.index.SearchInterval(startPos, endPos, &fpos, &flen); // fprintf(gf[thid * 2 + idx], "%s %d %ld %ld %ld\n", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid, fpos, flen); @@ -143,10 +176,12 @@ struct RecalFuncs { } char* buf = vcf.buf; vcf.inStm.read(buf, flen); - string line; int64_t cur = 0; - get_line_from_buf(buf, flen, &cur, &line); - while (line.size() > 0) { + int64_t start = 0; + get_line_from_buf(buf, flen, &cur); + while (cur > start + 1) { + const string line(buf + start, buf + cur - 1); + // spdlog::info("s: {}, e: {}, line: {}", start, cur, line); stringstream ss_line(line); string stid; int tid, pos; @@ -159,7 +194,8 @@ struct RecalFuncs { if (varIntv.overlaps(readIntv)) { vcf.knownSites.push_back(varIntv); // 闭区间 } - get_line_from_buf(buf, flen, &cur, &line); + start = cur; + get_line_from_buf(buf, flen, &cur); } } } @@ -195,7 +231,6 @@ struct RecalFuncs { knownSites[i] = true; } } - //idx += 1; } PROF_TP_END(TP_calc_skips); //fprintf(gf[0], "\n"); @@ -241,4 +276,53 @@ struct RecalFuncs { calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs); } } + + + // for apply bqsr + /** + * Quality score recalibration algorithm works as follows: + * - Start with the (approximate, or "estimated") reported quality score. (Approximation occurs when marginalizing/collapsing + * over the reported qualities for each read group). + * - Compute (retrieve?) the empirical quality score using the per-read group datum (i.e. counts). Call it y_1. + * - Use y_1 just computed as the prior for the empirical quality score for the datum for the 2-tuple ( read group, quality score). Call it y_2. + * - Use y_2 as the prior to compute the empirical quality for the 3-tuple ( read-group, quality-score, special covariate ). Call it y_3 for the + * context covariate. Similarly define y_4 for the cycle covariate. Let d_3 = y_3 - y_2, d_4 = y_4 - y_2. + * - (final recalibrated score) = y_2 + d_3 + d_4 = y_3 + y_4 - y_2. + * + * @param priorQualityScore the prior quality score (in log space). It is either the "estimated" or collapsed reported quality score + * for the read group, or the constant prior if given. This value has type double because of the "combine" (or collapse) + * operation that collapses the quality scores represented within the same read group. + * @param readGroupDatum the RecalDatum object for a particular read group at hand. May be null. + * @param qualityScoreDatum the RecalDatum object for a particular (read group, reported quality) tuple at hand. May be null. + * @param specialCovariateDatums the array of RecalDatum objects for the non-required covariates (cycle and context covariates by default). + * @return + */ + static double hierarchicalBayesianQualityEstimate(double priorQualityScore, RecalDatum& readGroupDatum, RecalDatum& qualityScoreDatum, + RecalDatum& contextDatum, RecalDatum& cycleDatum) { + double empiricalQualityForReadGroup = + readGroupDatum.getNumObservations() == 0 ? priorQualityScore : readGroupDatum.getEmpiricalQuality(priorQualityScore); + double posteriorEmpiricalQualityForReportedQuality = qualityScoreDatum.getNumObservations() == 0 + ? empiricalQualityForReadGroup + : qualityScoreDatum.getEmpiricalQuality(empiricalQualityForReadGroup); + + double deltaSpecialCovariates = 0.0; + // At this point we stop being iterative; the special covariates (context and cycle by default) are treated differently. + if (contextDatum.getNumObservations() > 0) { + // TODO: the prior is ignored if the empirical quality for the datum is already cached. + deltaSpecialCovariates += + contextDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality; + } + if (cycleDatum.getNumObservations() > 0) { + // TODO: the prior is ignored if the empirical quality for the datum is already cached. + deltaSpecialCovariates += + cycleDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality; + } + + return posteriorEmpiricalQualityForReportedQuality + deltaSpecialCovariates; + } + + // recalibrated quality is bound between 1 and MAX_QUAL + static uint8_t getBoundedIntegerQual(double recalibratedQualDouble) { + return QualityUtils::boundQual(MathUtils::fastRound(recalibratedQualDouble), RecalDatum::MAX_RECALIBRATED_Q_SCORE); + } }; \ No newline at end of file diff --git a/src/bqsr/recal_tables.h b/src/bqsr/recal_tables.h index d342791..103cf4f 100644 --- a/src/bqsr/recal_tables.h +++ b/src/bqsr/recal_tables.h @@ -11,9 +11,10 @@ #include "covariate.h" #include "nested_array.h" #include "recal_datum.h" +#include "qual_utils.h" struct RecalTables { - int qualDimension = 94; // MAX_SAM_QUAL_SCORE(93) + 1 + int qualDimension = QualityUtils::MAX_SAM_QUAL_SCORE + 1; // MAX_SAM_QUAL_SCORE(93) + 1 int eventDimension = EventType::EVENT_SIZE; int numReadGroups; @@ -41,4 +42,22 @@ struct RecalTables { contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1); cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1); } + + void initReadGroupTable(int _numReadGroups) { + numReadGroups = _numReadGroups; + readGroupTable.init(eventDimension, numReadGroups); + } + + // 必须在调用initReadGroupTable之后,或者设置了numReadGroups之后 + void initQualityScoreTable() { + qualityScoreTable.init(eventDimension, numReadGroups, qualDimension); + } + + void initContextTable() { + contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1); + } + + void initCycleTable() { + cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1); + } }; \ No newline at end of file diff --git a/src/bqsr/recal_utils.h b/src/bqsr/recal_utils.h index e9539ab..15ad673 100644 --- a/src/bqsr/recal_utils.h +++ b/src/bqsr/recal_utils.h @@ -11,6 +11,8 @@ #include #include +#include +#include #include "bqsr_args.h" #include "nested_array.h" @@ -79,25 +81,6 @@ struct RecalUtils { } } } - // fprintf(gf[4], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1); - // _Foreach3D(qualityScoreTable, val, { - // if (val.numObservations > 0) - // fprintf(gf[4], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality); - // }); - // fprintf(gf[4], "\n"); - - // fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1); - // for (auto& arr1 : contextTable.data) { - // for (size_t si = 0; si < arr1.size(); ++si) { - // for (auto &arr2 : arr1[si]) { - // for (auto& val : arr2) { - // if (val.numObservations > 0) - // fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality); - // } - // } - // } - // } - // fprintf(gf[3], "\n"); } // 输出bqsr报告 @@ -110,7 +93,7 @@ struct RecalUtils { exit(1); } // 输出version信息 - fprintf(fpout, "%s\n", REPORT_HEADER_VERSION); + fprintf(fpout, "%s:%s\n", REPORT_HEADER_VERSION, REPORT_HEADER_MINOR_VERSION); // 输出参数信息 outputArgsTable(RAC, fpout); // 输出量化质量分数信息 @@ -268,4 +251,220 @@ struct RecalUtils { }); table.write(fpout); } + + // 解析bqsr table文件 + static void readRecalTables(const string& recalTableFn, BQSRArg& RAC, vector& quantizedScore, RecalTables& recalTables) { + std::ifstream infs(recalTableFn, ifstream::in); + string line; + // 读取文件头 + std::getline(infs, line); + spdlog::info("header: {}", line); + if (line.find(REPORT_HEADER_VERSION) != 0) { + spdlog::error("BQSR table version not supported! {} {}", line, __LINE__); + exit(1); + } + + // 读取参数表 + readArgsTable(infs, RAC); + + readQuantTable(infs, quantizedScore); + + readReadGroupTable(infs, recalTables); + + readQualityScoreTable(infs, recalTables); + + readContextCycleTable(infs, recalTables); + + infs.close(); + } + + template + static void split(const string& str, Arr& dat) { + dat.clear(); + std::istringstream iss(str); + string item; + while (iss >> item) { + dat.push_back(item); + } + } + + template + static void split(const string& str, const char delimiter, Arr& dat) { + dat.clear(); + std::istringstream iss(str); + string token; + while (std::getline(iss, token, delimiter)) { + dat.push_back(token); + } + } + + // 读取参数表 + static void readArgsTable(std::ifstream& infs, BQSRArg& p) { + string line; + StableArray dat; + std::getline(infs, line); + while (line.empty() && std::getline(infs, line)); // 跳过空行 + split(line, ':', dat); + int argRowCnt = std::stoi(dat[3]); + std::getline(infs, line); // 忽略表格说明的一行 + std::getline(infs, line); // 忽略表头column说明 + for (int i = 0; i < argRowCnt; ++i) { + std::getline(infs, line); + split(line, dat); + if ("covariate" == dat[0]) { + } else if ("no_standard_covs" == dat[0]) { + p.DO_NOT_USE_STANDARD_COVARIATES = ReportUtil::ToBool(dat[1]); + } else if ("run_without_dbsnp" == dat[0]) { + p.RUN_WITHOUT_DBSNP = ReportUtil::ToBool(dat[1]); + } else if ("solid_recal_mode" == dat[0]) { + p.SOLID_RECAL_MODE = ReportUtil::ParseString(dat[1]); + } else if ("solid_nocall_strategy" == dat[0]) { + p.SOLID_NOCALL_STRATEGY = ReportUtil::ParseString(dat[1]); + } else if ("mismatches_context_size" == dat[0]) { + p.MISMATCHES_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]); + } else if ("indels_context_size" == dat[0]) { + p.INDELS_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]); + } else if ("mismatches_default_quality" == dat[0]) { + p.MISMATCHES_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]); + // spdlog::info("int8_t : {}", (int)p.MISMATCHES_DEFAULT_QUALITY); + } else if ("deletions_default_quality" == dat[0]) { + p.DELETIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]); + } else if ("insertions_default_quality" == dat[0]) { + p.INSERTIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]); + } else if ("maximum_cycle_value" == dat[0]) { + p.MAXIMUM_CYCLE_VALUE = ReportUtil::ToInt(dat[1]); + } else if ("low_quality_tail" == dat[0]) { + p.LOW_QUAL_TAIL = ReportUtil::ToInt(dat[1]); + } else if ("default_platform" == dat[0]) { + p.DEFAULT_PLATFORM = ReportUtil::ParseString(dat[1]); + } else if ("force_platform" == dat[0]) { + p.FORCE_PLATFORM = ReportUtil::ParseString(dat[1]); + } else if ("quantizing_levels" == dat[0]) { + p.QUANTIZING_LEVELS = ReportUtil::ToInt(dat[1]); + } else if ("recalibration_report" == dat[0]) { + p.existingRecalibrationReport = ReportUtil::ParseString(dat[1]); + } else if ("binary_tag_name" == dat[0]) { + p.BINARY_TAG_NAME = ReportUtil::ParseString(dat[1]); + // spdlog::info("BINARY_TAG_NAME : {}", p.BINARY_TAG_NAME); + } else { + spdlog::error("unknown argument : {} {}", dat[0], dat[1]); + } + } + } + + // 读取量化质量分数信息表 + static void readQuantTable(std::ifstream& infs, vector& quantizedScore) { + string line; + StableArray dat; + std::getline(infs, line); + while (line.empty() && std::getline(infs, line)); // 跳过空行 + split(line, ':', dat); + int argRowCnt = std::stoi(dat[3]); + std::getline(infs, line); // 忽略表格说明的一行 + std::getline(infs, line); // 忽略表头column说明 + quantizedScore.resize(argRowCnt); + for (int i = 0; i < argRowCnt; ++i) { + std::getline(infs, line); + // spdlog::info("{}", line); + split(line, dat); + quantizedScore[ReportUtil::ToInt(dat[0])] = ReportUtil::ToInt(dat[2]); + // spdlog::info("{}, {}", ReportUtil::ToInt(dat[0]), quantizedScore[ReportUtil::ToInt(dat[0])]); + } + } + + // 读取read group表 + static void readReadGroupTable(std::ifstream& infs, RecalTables& recalTables) { + string line; + StableArray dat; + std::getline(infs, line); + while (line.empty() && std::getline(infs, line)); // 跳过空行 + split(line, ':', dat); + int argRowCnt = std::stoi(dat[3]); + std::getline(infs, line); // 忽略表格说明的一行 + std::getline(infs, line); // 忽略表头column说明 + vector> dats(argRowCnt); + int rgId = 0; + for (int i = 0; i < argRowCnt; ++i) { + std::getline(infs, line); + split(line, dats[i]); + if (ReadGroupCovariate::RgToId.find(dats[i][0]) == ReadGroupCovariate::RgToId.end()) { + ReadGroupCovariate::RgToId[dats[i][0]] = rgId; + ReadGroupCovariate::IdToRg[rgId] = dats[i][0]; + ++rgId; + } + } + recalTables.initReadGroupTable(rgId); + // spdlog::info("read group num: {}, {}", rgId, ReadGroupCovariate::IdToRg[0]); + for (int i = 0; i < dats.size(); ++i) { + int k2 = ReadGroupCovariate::RgToId[dats[i][0]]; + int k1 = EventType::GetIndexForEventRep(dats[i][1][0]); + auto& datum = recalTables.readGroupTable(k1, k2); + datum.setEmpiricalQuality(ReportUtil::ToDouble(dats[i][2])); + datum.setReportedQuality(ReportUtil::ToDouble(dats[i][3])); + datum.setNumObservations(ReportUtil::ToUint64(dats[i][4])); + datum.setNumMismatches(ReportUtil::ToDouble(dats[i][5])); + } + } + + // 读取质量分数表 + static void readQualityScoreTable(std::ifstream& infs, RecalTables& recalTables) { + string line; + StableArray dat; + std::getline(infs, line); + while (line.empty() && std::getline(infs, line)); // 跳过空行 + split(line, ':', dat); + int argRowCnt = std::stoi(dat[3]); + std::getline(infs, line); // 忽略表格说明的一行 + std::getline(infs, line); // 忽略表头column说明 + recalTables.initQualityScoreTable(); + int rgId = 0; + for (int i = 0; i < argRowCnt; ++i) { + std::getline(infs, line); + split(line, dat); + int k2 = ReadGroupCovariate::RgToId[dat[0]]; + int k3 = ReportUtil::ToInt(dat[1]); + int k1 = EventType::GetIndexForEventRep(dat[2][0]); + auto& datum = recalTables.qualityScoreTable(k1, k2, k3); + datum.setEmpiricalQuality(ReportUtil::ToDouble(dat[3])); + datum.setReportedQuality((double)k3); + datum.setNumObservations(ReportUtil::ToUint64(dat[4])); + datum.setNumMismatches(ReportUtil::ToDouble(dat[5])); + } + } + + // 读取context和cycle表 + static void readContextCycleTable(std::ifstream& infs, RecalTables& recalTables) { + string line; + StableArray dat; + std::getline(infs, line); + while (line.empty() && std::getline(infs, line)); // 跳过空行 + split(line, ':', dat); + int argRowCnt = std::stoi(dat[3]); + std::getline(infs, line); // 忽略表格说明的一行 + std::getline(infs, line); // 忽略表头column说明 + recalTables.initContextTable(); + recalTables.initCycleTable(); + for (int i = 0; i < argRowCnt; ++i) { + std::getline(infs, line); + split(line, dat); + int k2 = ReadGroupCovariate::RgToId[dat[0]]; + int k3 = ReportUtil::ToInt(dat[1]); + int k4 = 0; + int k1 = EventType::GetIndexForEventRep(dat[4][0]); + RecalDatum* dp; + if (dat[3] == "Cycle") { + k4 = CycleCovariate::KeyFromCycle(ReportUtil::ToInt(dat[2])); + dp = &recalTables.cycleTable(k1, k2, k3, k4); + } else if (dat[3] == "Context") { + k4 = ContextCovariate::KeyFromContext(dat[2]); + dp = &recalTables.contextTable(k1, k2, k3, k4); + } else { + spdlog::error("Unknown CovariateName {}", dat[3]); + } + dp->setEmpiricalQuality(ReportUtil::ToDouble(dat[5])); + dp->setReportedQuality((double)k3); + dp->setNumObservations(ReportUtil::ToUint64(dat[6])); + dp->setNumMismatches(ReportUtil::ToDouble(dat[7])); + } + } }; \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 4f943d0..bacea35 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,10 +12,7 @@ #include "fastbqsr_version.h" #include "bqsr/bqsr_args.h" #include "util/profiling.h" - -namespace nsgv { -extern BQSRArg gBqsrArg; -}; +#include "bqsr/common_data.h" int BaseRecalibrator(); int ApplyBQSR(); @@ -30,28 +27,18 @@ string get_current_time_str() { return string(now); } -int main_BaseRecalibrator(int argc, char *argv[]) { - // init arg parser - argparse::ArgumentParser program(nsgv::gBqsrArg.PROGRAM_RECORD_ID, FASTBQSR_VERSION, argparse::default_arguments::none); - program.add_description( - "First pass of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n" - "user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)"); - +// 添加bqsr和apply bqsr的共同使用的参数 +static void addCommonArgs(argparse::ArgumentParser &program, const char *outputDesc) { program.add_argument("--input") .help("BAM/SAM/CRAM file containing reads This argument must be specified at least once.") .metavar("") .required(); program.add_argument("--output") - .help("The output recalibration table file to create.") + .help(outputDesc) .metavar("") .required(); - program.add_argument("--reference") - .help("Reference sequence file.") - .metavar("") - .required(); - program.add_argument("--num-threads") .help("Number of threads to use.") .scan<'i', int>() @@ -59,6 +46,66 @@ int main_BaseRecalibrator(int argc, char *argv[]) { .nargs(1) .metavar(""); + program.add_argument("--reference") + .help("Reference sequence file.") + .metavar("") + .required(); +} + +// 添加帮助和版本信息 +static void addHelpVersion(argparse::ArgumentParser& program) { + program.add_argument("-h", "--help") + .action([&](const auto& /*unused*/) { + std::cout << program.help().str(); + std::exit(0); + }) + .default_value(false) + .help("shows help message and exits") + .implicit_value(true) + .nargs(0); + + program.add_argument("-v", "--version") + .action([&](const auto& /*unused*/) { + std::cout << FASTBQSR_VERSION << std::endl; + std::exit(0); + }) + .default_value(false) + .help("prints version information and exits") + .implicit_value(true) + .nargs(0); +} + +// 添加运行时间信息和命令行参数列表 +static void addTimeCLI(int argc, char* argv[]) { + nsgv::gBqsrArg.START_TIME = get_current_time_str(); + nsgv::gBqsrArg.CLI_STR = argv[0]; + for (int i = 1; i < argc; ++i) { + nsgv::gBqsrArg.CLI_STR += " " + std::string(argv[i]); + } + // spdlog::info("cmd: {}", nsgv::gBqsrArg.CLI_STR); +} + +// 解析公共参数 +static void parseCommonArgs(argparse::ArgumentParser& program) { + nsgv::gBqsrArg.INPUT_FILE = program.get("--input"); + nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output"); + nsgv::gBqsrArg.NUM_THREADS = program.get("--num-threads"); + if (nsgv::gBqsrArg.NUM_THREADS < 1) { + spdlog::error("num-threads must be positive."); + exit(1); + } + nsgv::gBqsrArg.REFERENCE_FILE = program.get("--reference"); +} + +int main_BaseRecalibrator(int argc, char *argv[]) { + // init arg parser + argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none); + program.add_description( + "First phase of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n" + "user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)"); + + addCommonArgs(program, "The output recalibration table file to create."); + program.add_argument("--known-sites") .help( "One or more databases of known polymorphic sites used to exclude regions around known polymorphisms from " @@ -66,6 +113,59 @@ int main_BaseRecalibrator(int argc, char *argv[]) { .metavar("") .nargs(argparse::nargs_pattern::any); + program.add_argument("--enable-baq") + .help("Whether to do BAQ correction.") + .default_value(false) + .implicit_value(true) + .hidden(); + + program.add_argument("--compute-indel-bqsr-tables") + .help("Whether to compute indel BQSR tables.") + .default_value(false) + .implicit_value(true) + .hidden(); + + // add help and version args + addHelpVersion(program); + + try { + program.parse_args(argc, argv); + parseCommonArgs(program); + + nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get>("--known-sites"); + nsgv::gBqsrArg.enableBAQ = program.get("--enable-baq"); + nsgv::gBqsrArg.computeIndelBQSRTables = program.get("--compute-indel-bqsr-tables"); + // spdlog::info("known sites vcf files:"); + // for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) { + // spdlog::info(" {}", ks); + // } + } catch (const std::exception &err) { + spdlog::error(err.what()); + return 1; + } + + spdlog::info("fast base recalibration phase-1 start"); + BaseRecalibrator(); + spdlog::info("fast base recalibration phase-1 end"); + + DisplayProfilingBQSR(nsgv::gBqsrArg.NUM_THREADS); + + return 0; +} + +int main_ApplyBQSR(int argc, char* argv[]) { + // init arg parser + argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none); + program.add_description( + "Second phase of the Base Quality Score Recalibration (BQSR) -- Apply the bqsr table to generate recalibrated bam file.)"); + + addCommonArgs(program, "The output bam/sam file to create."); + + program.add_argument("--bqsr") + .help("Input recalibration table for BQSR.") + .metavar("") + .required(); + program.add_argument("--create-index") .help("Whether to create an index when writing coordinate sorted BAM output.") .default_value(false) @@ -78,83 +178,37 @@ int main_BaseRecalibrator(int argc, char *argv[]) { .nargs(1) .metavar(""); - program.add_argument("--enable-baq") - .help("Whether to do BAQ correction.") - .default_value(false) - .implicit_value(true) - .hidden(); - - // add help and version args - program.add_argument("-h", "--help") - .action([&](const auto & /*unused*/) { - std::cout << program.help().str(); - std::exit(0); - }) - .default_value(false) - .help("shows help message and exits") - .implicit_value(true) - .nargs(0); - - program.add_argument("-v", "--version") - .action([&](const auto & /*unused*/) { - std::cout << FASTBQSR_VERSION << std::endl; - std::exit(0); - }) - .default_value(false) - .help("prints version information and exits") - .implicit_value(true) - .nargs(0); - - // std::cout << program << std::endl; - - nsgv::gBqsrArg.START_TIME = get_current_time_str(); - nsgv::gBqsrArg.CLI_STR = argv[0]; - for (int i = 1; i < argc; ++i) { - nsgv::gBqsrArg.CLI_STR += " " + std::string(argv[i]); - } - + addHelpVersion(program); try { program.parse_args(argc, argv); - nsgv::gBqsrArg.INPUT_FILE = program.get("--input"); - nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output"); - nsgv::gBqsrArg.NUM_THREADS = program.get("--num-threads"); - if (nsgv::gBqsrArg.NUM_THREADS < 1) { - spdlog::error("num-threads must be positive."); - exit(1); - } + parseCommonArgs(program); + nsgv::gBqsrArg.BQSR_FILE = program.get("--bqsr"); nsgv::gBqsrArg.CREATE_INDEX = program.get("--create-index"); - nsgv::gBqsrArg.REFERENCE_FILE = program.get("--reference"); - nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get>("--known-sites"); - nsgv::gBqsrArg.enableBAQ = program.get("--enable-baq"); - // spdlog::info("known sites vcf files:"); - // for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) { - // spdlog::info(" {}", ks); - // } + string idxFormat = program.get("--index-format"); + std::transform(idxFormat.begin(), idxFormat.end(), idxFormat.begin(), ::toupper); + nsgv::gBqsrArg.INDEX_FORMAT = idxFormat == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI; - nsgv::gBqsrArg.INDEX_FORMAT = - program.get("--index-format") == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI; - - } catch (const std::exception &err) { + } catch (const std::exception& err) { spdlog::error(err.what()); return 1; } - spdlog::info("fast base recalibration phase-1 start"); - BaseRecalibrator(); - spdlog::info("fast base recalibration phase-1 end"); + spdlog::info("fast base recalibration phase-2 start"); + ApplyBQSR(); + spdlog::info("fast base recalibration phase-2 end"); - DisplayProfiling(nsgv::gBqsrArg.NUM_THREADS); + // DisplayProfilingApplyBQSR(nsgv::gBqsrArg.NUM_THREADS); + DisplayProfilingApplyBQSR(1); return 0; } -int main_ApplyBQSR(int argc, char* argv[]) { return 0; } - int main(int argc, char *argv[]) { // init log spdlog::set_default_logger(spdlog::stderr_color_st("fastbqsr")); spdlog::cfg::load_env_levels(); + addTimeCLI(argc, argv); string bqsr_prog = argv[1]; if (bqsr_prog == "BaseRecalibrator") { return main_BaseRecalibrator(argc - 1, argv + 1); diff --git a/src/util/profiling.cpp b/src/util/profiling.cpp index 91a4d01..0ae89bf 100644 --- a/src/util/profiling.cpp +++ b/src/util/profiling.cpp @@ -45,8 +45,7 @@ static int CalcThreadTime(uint64_t *a, int len, double *max, double *min, double fprintf(stderr, "time %-15s: avg %0.2lfs min %0.2lfs max %0.2lfs\n", #tpname, avgTime, minTime, maxTime); \ } -int DisplayProfiling(int nthread) { - +int DisplayProfilingBQSR(int nthread) { #ifdef SHOW_PERF fprintf(stderr, "\n"); PRINT_GP(GP_read); @@ -80,6 +79,27 @@ int DisplayProfiling(int nthread) { } PRINT_GP(GP_whole_process); + fprintf(stderr, "\n"); +#endif + + return 0; +} + +int DisplayProfilingApplyBQSR(int nthread) { +#ifdef SHOW_PERF + fprintf(stderr, "\n"); + PRINT_GP(GP_read); + if (nthread == 1) { + PRINT_GP(GP_covariate); + } else { + PRINT_TP(TP_covariate); + PRINT_TP(TP_readgroup); + PRINT_TP(TP_qualityscore); + PRINT_TP(TP_context); + PRINT_TP(TP_cycle); + } + PRINT_GP(GP_whole_process); + fprintf(stderr, "\n"); #endif diff --git a/src/util/profiling.h b/src/util/profiling.h index bc96123..8377479 100644 --- a/src/util/profiling.h +++ b/src/util/profiling.h @@ -79,7 +79,8 @@ enum { uint64_t RealtimeMsec(void); -int DisplayProfiling(int); +int DisplayProfilingBQSR(int); +int DisplayProfilingApplyBQSR(int); #ifdef __cplusplus } diff --git a/src/util/report_table.h b/src/util/report_table.h index 23ae6c0..6d937a4 100644 --- a/src/util/report_table.h +++ b/src/util/report_table.h @@ -22,24 +22,46 @@ using std::string; using std::stringstream; using std::vector; -#define REPORT_HEADER_VERSION "#:GATKReport.v1.1:5" +#define REPORT_HEADER_VERSION "#:GATKReport.v1.1" +#define REPORT_HEADER_MINOR_VERSION "5" struct ReportUtil { - static string ToString(const bool val) { return val ? "true" : "false"; } - static string ToString(const char val) { + static inline string ToString(const bool val) { return val ? "true" : "false"; } + static inline string ToString(const char val) { string s = ""; s += val; // spdlog::info("char: {}, str: {}", val, s); return s; } - static string ToString(const string& val) { return val == "" ? "null" : val; } - static string ToString(const double val, int precise) { + static inline string ToString(const string& val) { return val == "" ? "null" : val; } + static inline string ToString(const double val, int precise) { stringstream ss; ss << std::fixed << std::setprecision(precise) << val; return ss.str(); } template - static string ToString(const T val) { return std::to_string(val); } + static inline string ToString(const T val) { return std::to_string(val); } + + // 转换成bool + static inline bool ToBool(const string &val) { + if (val == "false" || val == "False" || val == "FALSE" || val.empty()) + return false; + return true; + } + + static inline string ParseString(const string &val) { + if (val == "null") + return ""; + return val; + } + + static inline int ToInt(const string& val) { return std::stoi(val); } + + static inline double ToDouble(const string& val) { return std::stod(val); } + + static inline int64_t ToInt64(const string& val) { return std::stoll(val); } + + static inline uint64_t ToUint64(const string& val) { return std::stoull(val); } }; struct ReportTable { diff --git a/src/util/sam_data.h b/src/util/sam_data.h index 9d6dc2c..77eb15b 100644 --- a/src/util/sam_data.h +++ b/src/util/sam_data.h @@ -54,7 +54,8 @@ struct SamData { StableArray base_quals; // 对应的质量分数 StableArray ins_quals; // insert质量分数, BI (大部分应该都没有) StableArray del_quals; // delete质量分数, BD (大部分应该都没有) - StableArray cigars; + StableArray cigars; // clip 之后的cigar + StableArray recaled_quals; // 保存校正之后的质量分数 // 用作临时buffer StableArray strandedClippedBases; // for context covariate @@ -87,6 +88,20 @@ struct SamData { end_pos = 0; } + // 解析apply bqsr里用到的信息 + void parseForApplyBQSR(BamWrap *_bw) { + bw = _bw; + read_len = bw->b->core.l_qseq; + bases.resize(read_len); + base_quals.resize(read_len); + uint8_t* seq = bam_get_seq(bw->b); + uint8_t* quals = bam_get_qual(bw->b); + for (int i = 0; i < read_len; ++i) { + bases[i] = BaseUtils::cBaseToChar[bam_seqi(seq, i)]; + base_quals[i] = quals[i]; + } + } + // 初步解析bam void parseBasic(BamWrap *_bw) { bw = _bw;