diff --git a/.gitignore b/.gitignore index 6b16b43..453cda5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ *.d /.vscode /build -/text +/test build.sh run.sh test.sh diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 35f5e43..47bb5df 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,7 @@ set(EXECUTABLE_OUTPUT_PATH "${PROJECT_BINARY_DIR}/bin") # source codes path aux_source_directory(${PROJECT_SOURCE_DIR}/src MAIN_SRC) aux_source_directory(${PROJECT_SOURCE_DIR}/src/util UTIL_SRC) +aux_source_directory(${PROJECT_SOURCE_DIR}/src/util/math UTIL_MATH_SRC) aux_source_directory(${PROJECT_SOURCE_DIR}/src/bqsr BQSR_SRC) set(KTHREAD_FILE ${PROJECT_SOURCE_DIR}/ext/klib/kthread.c) @@ -20,7 +21,12 @@ link_directories("${PROJECT_SOURCE_DIR}/ext/htslib") set(PG_NAME "fastbqsr") # dependency files -add_executable(${PG_NAME} ${MAIN_SRC} ${UTIL_SRC} ${BQSR_SRC} ${KTHREAD_FILE}) +add_executable(${PG_NAME} + ${MAIN_SRC} + ${UTIL_SRC} + ${UTIL_MATH_SRC} + ${BQSR_SRC} + ${KTHREAD_FILE}) # link htslib target_link_libraries(${PG_NAME} libhts.a) diff --git a/src/bqsr/bqsr_args.h b/src/bqsr/bqsr_args.h index dd3bd08..3f5a51a 100644 --- a/src/bqsr/bqsr_args.h +++ b/src/bqsr/bqsr_args.h @@ -56,6 +56,26 @@ struct BQSRArg { // end of common parameters + // We always use the same covariates. The field is retained for compatibility with GATK3 reports. + bool DO_NOT_USE_STANDARD_COVARIATES = false; + + //It makes no sense to run BQSR without sites. so we remove this option. + bool RUN_WITHOUT_DBSNP = false; + + // We don't support SOLID. The field is retained for compatibility with GATK3 reports. + string SOLID_RECAL_MODE = "SET_Q_ZERO"; + string SOLID_NOCALL_STRATEGY = "THROW_EXCEPTION"; + + //@Hidden @Argument(fullName = "default-platform", optional = true, + // doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String + string DEFAULT_PLATFORM = ""; + + // @Hidden @Argument(fullName = "force-platform", optional = true, + // doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and "solid.") + string FORCE_PLATFORM = ""; + + string existingRecalibrationReport = ""; + /** * The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be * between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size. diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index d39e5fa..90e9338 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -7,18 +7,18 @@ Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2023/10/23 */ +#include #include #include #include #include #include -#include #include #include #include -#include #include +#include #include "baq.h" #include "bqsr_args.h" @@ -28,10 +28,16 @@ Date : 2023/10/23 #include "dup_metrics.h" #include "fastbqsr_version.h" #include "read_name_parser.h" +#include "read_recal_info.h" +#include "recal_datum.h" +#include "recal_tables.h" +#include "recal_utils.h" #include "util/interval.h" +#include "util/linear_index.h" #include "util/profiling.h" #include "util/utils.h" -#include "util/linear_index.h" +#include "util/math/math_utils.h" +#include "quant_info.h" using std::deque; @@ -89,7 +95,8 @@ BQSRArg gBqsrArg; // bqsr arguments samFile* gInBamFp; // input BAM file pointer sam_hdr_t* gInBamHeader; // input BAM header vector gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个 - +RecalTables gRecalTables; // 记录bqsr所有的数据,输出table结果 +vector gEventTypes; // 需要真正计算的eventtype // 下面是需要删除或修改的变量 std::vector gNameParsers; // read name parser @@ -258,12 +265,12 @@ void calculateRefOffset(BamWrap *bw, SamData &ad) { // 计算clip处理之后,剩余的碱基 void calculateReadBases(BamWrap* bw, SamData& ad) { ad.bases.resize(ad.read_len); - ad.quals.resize(ad.read_len); + ad.base_quals.resize(ad.read_len); uint8_t* seq = bam_get_seq(bw->b); uint8_t* quals = bam_get_qual(bw->b); for (int i = 0; i < ad.read_len; ++i) { ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)]; - ad.quals[i] = quals[i]; + ad.base_quals[i] = quals[i]; } } @@ -402,6 +409,7 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector &is for (int j = 0; j < len; ++j) { // 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip int snpInt = cBaseToChar[bam_seqi(seq, readPos + ad.left_clip)] == refBases[refPos] ? 0 : 1; + // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); } isSNP[readPos] = snpInt; nEvents += snpInt; readPos++; @@ -429,6 +437,7 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector &is } } nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0); + // spdlog::info("nEvents: {}", nEvents); //spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0), // std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents); @@ -438,13 +447,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector &is } // 简单计算baq数组,就是全部赋值为'@' (64) -bool flatBAQArray(BamWrap* bw, SamData& ad, vector& baqArray) { - baqArray.resize(ad.read_len, (int)'@'); +bool flatBAQArray(BamWrap* bw, SamData& ad, vector& baqArray) { + baqArray.resize(ad.read_len, (uint8_t)'@'); return true; } // 计算真实的baq数组,耗时更多,好像enable-baq参数默认是关闭的,那就先不实现这个了 -bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, SamData& ad, vector& baqArray) { +bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, SamData& ad, vector& baqArray) { baqArray.resize(ad.read_len, 0); return true; } @@ -563,7 +572,8 @@ static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector& errorArr, vector& baqArr, vector& fracErrs) { +void calculateFractionalErrorArray(vector& errorArr, vector& baqArr, vector& fracErrs) { + // for (auto val : errorArr) { if (val > 0) spdlog::info("snp err val: {}", val); } fracErrs.resize(baqArr.size()); // errorArray和baqArray必须长度相同 const int BLOCK_START_UNSET = -1; @@ -592,6 +602,95 @@ void calculateFractionalErrorArray(vector& errorArr, vector& baqArr, v } } +/** + * Update the recalibration statistics using the information in recalInfo. + * + * Implementation detail: we only populate the quality score table and the optional tables. + * The read group table will be populated later by collapsing the quality score table. + * + * @param recalInfo data structure holding information about the recalibration values for a single read + */ +void updateRecalTablesForRead(ReadRecalInfo &info) { + SamData &read = info.read; + PerReadCovariateMatrix& readCovars = info.covariates; + Array3D& qualityScoreTable = nsgv::gRecalTables.qualityScoreTable; + Array4D& contextTable = nsgv::gRecalTables.contextTable; + Array4D& cycleTable = nsgv::gRecalTables.cycleTable; + + int readLength = read.read_len; + for (int offset = 0; offset < readLength; ++offset) { + if (!info.skips[offset]) { // 不跳过当前位置 + for (int idx = 0; idx < nsgv::gEventTypes.size(); ++idx) { + // 获取四个值,readgroup / qualityscore / context / cycle + EventTypeValue& event = nsgv::gEventTypes[idx]; + vector& covariatesAtOffset = readCovars[event.index][offset]; + uint8_t qual = info.getQual(event, offset); + double isError = info.getErrorFraction(event, offset); + + int readGroup = covariatesAtOffset[ReadGroupCovariate::index]; + int baseQuality = covariatesAtOffset[BaseQualityCovariate::index]; + + // 处理base quality score协变量 + // RecalUtils::IncrementDatum3keys(qualityScoreTable, qual, isError, readGroup, baseQuality, event.index); + qualityScoreTable[readGroup][baseQuality][event.index].increment(1, isError, baseQuality); + + auto &d = qualityScoreTable[readGroup][baseQuality][event.index]; + // spdlog::info("isError {} : {}, mis {}, obs {}", isError, info.snp_errs[offset], d.numMismatches, d.numObservations); + + // 处理context covariate + int contextCovariate = covariatesAtOffset[ContextCovariate::index]; + if (contextCovariate >= 0) + contextTable[readGroup][baseQuality][contextCovariate][event.index].increment(1, isError, baseQuality); + // RecalUtils::IncrementDatum4keys(nsgv::gRecalTables.contextTable, qual, isError, readGroup, baseQuality, contextCovariate, + // event.index); + // 处理cycle covariate + int cycleCovariate = covariatesAtOffset[CycleCovariate::index]; + if (cycleCovariate >= 0) + cycleTable[readGroup][baseQuality][cycleCovariate][event.index].increment(1, isError, baseQuality); + // RecalUtils::IncrementDatum4keys(nsgv::gRecalTables.cycleTable, qual, isError, readGroup, baseQuality, cycleCovariate, event.index); + } + } + } +} + +// 数据总结 +void collapseQualityScoreTableToReadGroupTable(Array2D &byReadGroupTable, Array3D &byQualTable) { + // 遍历quality table + for (int k1 = 0; k1 < byQualTable.data.size(); ++k1) { + for (int k2 = 0; k2 < byQualTable[k1].size(); ++k2) { + for (int k3 = 0; k3 < byQualTable[k1][k2].size(); ++k3) { + auto& qualDatum = byQualTable[k1][k2][k3]; + if (qualDatum.numObservations > 0) { + int rgKey = k1; + int eventIndex = k3; + // spdlog::info("k1 {}, k2 {}, k3 {}, numMis {}", k1, k2, k3, qualDatum.numMismatches); + byReadGroupTable[rgKey][eventIndex].combine(qualDatum); + // spdlog::info("rg {} {}, k3 {}", byReadGroupTable[rgKey][eventIndex].numMismatches, byReadGroupTable[rgKey][eventIndex].reportedQuality, k3); + } + } + } + } +} + +/** + * To replicate the results of BQSR whether or not we save tables to disk (which we need in Spark), + * we need to trim the numbers to a few decimal placed (that's what writing and reading does). + */ +void roundTableValues(RecalTables& rt) { +#define _round_val(val) \ + do { \ + if (val.numObservations > 0) { \ + val.numMismatches = MathUtils::RoundToNDecimalPlaces(val.numMismatches, RecalUtils::NUMBER_ERRORS_DECIMAL_PLACES); \ + val.reportedQuality = MathUtils::RoundToNDecimalPlaces(val.reportedQuality, RecalUtils::REPORTED_QUALITY_DECIMAL_PLACES); \ + } \ + } while (0) + + _Foreach2D(rt.readGroupTable, val, { _round_val(val); }); + _Foreach3D(rt.qualityScoreTable, val, { _round_val(val); }); + _Foreach4D(rt.contextTable, val, { _round_val(val); }); + _Foreach4D(rt.cycleTable, val, { _round_val(val); }); +} + // 串行bqsr int SerialBQSR() { int round = 0; @@ -601,6 +700,9 @@ int SerialBQSR() { int64_t readNumSum = 0; // 0. 初始化一些全局数据 // BAQ baq{BAQ::DEFAULT_GOP}; + RecalDatum::StaticInit(); + QualityUtils::StaticInit(); + MathUtils::StaticInit(); // 1. 协变量数据相关初始化 PerReadCovariateMatrix readCovariates; @@ -608,6 +710,14 @@ int SerialBQSR() { ContextCovariate::InitContextCovariate(nsgv::gBqsrArg); CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg); + // 注意初始化顺序,这个必须在协变量初始化之后 + nsgv::gRecalTables.init(nsgv::gInBamHeader->hrecs->nrg); + nsgv::gEventTypes.push_back(EventType::BASE_SUBSTITUTION); + if (nsgv::gBqsrArg.computeIndelBQSRTables) { + nsgv::gEventTypes.push_back(EventType::BASE_INSERTION); + nsgv::gEventTypes.push_back(EventType::BASE_DELETION); + } + // 2. 读取bam的read group if (nsgv::gInBamHeader->hrecs->nrg == 0) { spdlog::error("No RG tag found in the header!"); @@ -641,8 +751,10 @@ int SerialBQSR() { // 2. 对质量分数长度跟碱基长度不匹配的read,缺少的质量分数用默认值补齐,先忽略,后边有需要再处理 // 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略 // 4. 对read的两端进行检测,去除(hardclip)adapter - BamWrap *bw = bams[i]; + // spdlog::info("bam idx: {}", i); + BamWrap* bw = bams[i]; SamData ad; + ad.bw = bw; ad.read_len = BamWrap::BamEffectiveLength(bw->b); ad.cigar_end = bw->b->core.n_cigar; if (ad.read_len <= 0) continue; @@ -676,12 +788,16 @@ int SerialBQSR() { vector isIns(ad.read_len, 0); // 该位置是否是插入位置,0不是,1是 vector isDel(ad.read_len, 0); // 该位置是否是删除位置,0不是,1是 const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel); + // spdlog::info("nErrors: {}", nErrors); + // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); } + + //exit(0); // 7. 计算baqArray // BAQ = base alignment quality // note for efficiency reasons we don't compute the BAQ array unless we actually have // some error to marginalize over. For ILMN data ~85% of reads have no error - vector baqArray; + vector baqArray; bool baqCalculated = false; if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) { baqCalculated = flatBAQArray(bw, ad, baqArray); @@ -699,28 +815,49 @@ int SerialBQSR() { int end_pos = bw->contig_end_pos(); //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos, // bw->GetReadNegativeStrandFlag() ? "reverse" : "forward"); - + // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); } // 9. 计算这条read需要跳过的位置 - vector skip(ad.read_len, 0); + vector skips(ad.read_len, 0); PROF_START(known_sites); - calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr, skip); + calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr, skips); for (int ii = 0; ii < ad.read_len; ++ii) { - skip[ii] = - skip[ii] || (ContextCovariate::baseIndexMap[ad.bases[ii]] == -1) || ad.quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; + skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[ad.bases[ii]] == -1) || + ad.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; } PROF_END(gprof[GP_read_vcf], known_sites); // 10. 根据BAQ进一步处理snp,indel,得到处理后的数据 vector snpErrors, insErrors, delErrors; + // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-2: {}", val); } calculateFractionalErrorArray(isSNP, baqArray, snpErrors); calculateFractionalErrorArray(isIns, baqArray, insErrors); calculateFractionalErrorArray(isDel, baqArray, delErrors); + // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); } + //spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), ad.read_len); + //for (auto val : snpErrors) { if (val > 0) spdlog::info("snp err val: {}", val); } + // aggregate all of the info into our info object, and update the data // 11. 合并之前计算的数据,得到info,并更新bqsr table数据 - + ReadRecalInfo info(ad, readCovariates, skips, snpErrors, insErrors, delErrors); + int m = 0; + // for (auto err : snpErrors) { if (isSNP[m] > 0 || err > 0) spdlog::info("snp err: {} : {}", isSNP[m++], err); } + //exit(0); + PROF_START(update_info); + updateRecalTablesForRead(info); + PROF_END(gprof[GP_update_info], update_info); } + // exit(0); + // 12. 创建总结数据 + collapseQualityScoreTableToReadGroupTable(nsgv::gRecalTables.readGroupTable, nsgv::gRecalTables.qualityScoreTable); + roundTableValues(nsgv::gRecalTables); + + // 13. 量化质量分数 + QuantizationInfo quantInfo(nsgv::gRecalTables, nsgv::gBqsrArg.QUANTIZING_LEVELS); + + // 14. 输出结果 + RecalUtils::outputRecalibrationReport(nsgv::gBqsrArg, quantInfo, nsgv::gRecalTables); #if 0 // spdlog::info("region: {} - {}", bams[0]->global_softclip_start(), bams.back()->global_softclip_end()); // 1. 获取bams数组覆盖的region范围 diff --git a/src/bqsr/covariate.cpp b/src/bqsr/covariate.cpp index ede899c..2d994b3 100644 --- a/src/bqsr/covariate.cpp +++ b/src/bqsr/covariate.cpp @@ -4,6 +4,7 @@ EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"}; EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"}; EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"}; +vector EventType::EVENTS = {BASE_SUBSTITUTION, BASE_INSERTION, BASE_DELETION}; // static变量 for ContextCovariate int ContextCovariate::mismatchesContextSize; diff --git a/src/bqsr/covariate.h b/src/bqsr/covariate.h index 860d6f8..2c73aa0 100644 --- a/src/bqsr/covariate.h +++ b/src/bqsr/covariate.h @@ -25,7 +25,7 @@ using std::string; using std::vector; /** - * This is where we store the pre-read covariates, also indexed by (event type) and (read position). + * This is where we store the per-read covariates, also indexed by (event type) and (read position). * Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }. * For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle). */ @@ -36,6 +36,7 @@ struct EventTypeValue { int index; // 在协变量数组中对应的索引 char representation; string longRepresentation; + bool operator==(const EventTypeValue& a) const { return a.index == index; } }; struct EventType { @@ -43,6 +44,7 @@ struct EventType { static EventTypeValue BASE_SUBSTITUTION; static EventTypeValue BASE_INSERTION; static EventTypeValue BASE_DELETION; + static vector EVENTS; }; // 协变量相关的工具类 @@ -140,6 +142,17 @@ struct ContextCovariate { baseIndexMap['t'] = 3; } + static int MaximumKeyValue() { + int length = max(mismatchesContextSize, indelsContextSize); + int key = length; + int bitOffset = LENGTH_BITS; + for (int i = 0; i < length; ++i) { + key |= (3 << bitOffset); + bitOffset += 2; + } + return key; + } + static int CreateMask(int contextSize) { int mask = 0; // create 2*contextSize worth of bits @@ -161,6 +174,41 @@ struct ContextCovariate { return isNegativeStrand ? (readLength - offset - 1) : offset; } + static char baseIndexToSimpleBase(const int baseIndex) { + switch (baseIndex) { + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + return '.'; + } + } + /** + * Converts a key into the dna string representation. + * + * @param key the key representing the dna sequence + * @return the dna sequence represented by the key + */ + static string ContextFromKey(const int key) { + int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context + int mask = 48; // use the mask to pull out bases + int offset = LENGTH_BITS; + + string dna; + for (int i = 0; i < length; i++) { + int baseIndex = (key & mask) >> offset; + dna.push_back(baseIndexToSimpleBase(baseIndex)); + mask <<= 2; // move the mask over to the next 2 bits + offset += 2; + } + return dna; + } + // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) static void GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail); // Creates a int representation of a given dna string. @@ -180,6 +228,8 @@ struct CycleCovariate { static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; } + static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; } + /** * Encodes the cycle number as a key. */ @@ -200,10 +250,27 @@ struct CycleCovariate { result++; // negative cycles get the lower-most bit set } return result; -} + } + + /** + * Decodes the cycle number from the key. + */ + static int CycleFromKey(const int key) { + int cycle = key >> 1; // shift so we can remove the "sign" bit + if ((key & 1) != 0) { // is the last bit set? + cycle *= -1; // then the cycle is negative + } + return cycle; + } // Computes the encoded value of CycleCovariate's key for the given position at the read. static int CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle); static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); +}; + +// 好像不需要 +struct StandardCovariateList { + ReadGroupCovariate readGroupCovariate; + BaseQualityCovariate qualityScoreCovariate; }; \ No newline at end of file diff --git a/src/bqsr/nested_array.h b/src/bqsr/nested_array.h new file mode 100644 index 0000000..587e445 --- /dev/null +++ b/src/bqsr/nested_array.h @@ -0,0 +1,194 @@ +/* + Description: 多维度嵌套的数组 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/24 +*/ +#pragma once + +#include +#include +#include "spdlog/spdlog.h" + +using std::vector; + +template +struct Array2D { + vector> data; + Array2D() { } + Array2D(int dim1, int dim2) { init(dim1, dim2); } + void init(int dim1, int dim2) { data.resize(dim1); for (auto& v : data) v.resize(dim2); } + inline T& get(int k1, int k2) { return data[k1][k2]; } + // 根据关键字,在对应位置插入数据 + inline void put(const T& value, int k1, int k2) { data[k1][k2] = value; } + inline vector& operator[](size_t idx) { return data[idx]; } + inline const vector& operator[](size_t idx) const { return data[idx]; } +#define _Foreach2D(array, valName, codes) \ + for (auto& arr1 : array.data) { \ + for (auto& valName : arr1) { \ + codes; \ + } \ + } +#define _Foreach2DK(array, valName, codes) \ + do { \ + int k1 = 0; \ + for (auto& arr1 : array.data) { \ + int k2 = 0; \ + for (auto& valName : arr1) { \ + codes; \ + ++k2; \ + } \ + ++k1; \ + } \ + } while (0) +}; + +template +struct Array3D { + vector>> data; + Array3D() {} + Array3D(int dim1, int dim2, int dim3) { init(dim1, dim2, dim3); } + void init(int dim1, int dim2, int dim3) { + data.resize(dim1); + for (auto& v : data) v.resize(dim2); + for (auto& v1 : data) + for (auto& v2 : v1) v2.resize(dim3); + } + inline T& get(int k1, int k2, int k3) { return data[k1][k2][k3]; } + // 根据关键字,在对应位置插入数据 + inline void put(const T& value, int k1, int k2, int k3) { data[k1][k2][k3] = value; } + inline vector>& operator[](size_t idx) { return data[idx]; } +#define _Foreach3D(array, valName, codes) \ + for (auto& arr1 : array.data) { \ + for (auto& arr2 : arr1) { \ + for (auto& valName : arr2) { \ + codes; \ + } \ + } \ + } + +#define _Foreach3DK(array, valName, codes) \ + do { \ + int k1 = 0; \ + for (auto& arr1 : array.data) { \ + int k2 = 0; \ + for (auto& arr2 : arr1) { \ + int k3 = 0; \ + for (auto& valName : arr2) { \ + codes; \ + ++k3; \ + } \ + ++k2; \ + } \ + ++k1; \ + } \ + } while (0) +}; + +template +struct Array4D { + vector>>> data; + Array4D() {} + Array4D(int dim1, int dim2, int dim3, int dim4) { init(dim1, dim2, dim3, dim4); } + void init(int dim1, int dim2, int dim3, int dim4) { + data.resize(dim1); + for (auto& v : data) v.resize(dim2); + for (auto& v1 : data) + for (auto& v2 : v1) v2.resize(dim3); + for (auto& v1 : data) + for (auto& v2 : v1) + for (auto& v3 : v2) v3.resize(dim4); + } + inline T& get(int k1, int k2, int k3, int k4) { return data[k1][k2][k3][k4]; } + // 根据关键字,在对应位置插入数据 + inline void put(const T& value, int k1, int k2, int k3, int k4) { data[k1][k2][k3][k4] = value; } + inline vector>>& operator[](size_t idx) { return data[idx]; } +#define _Foreach4D(array, valName, codes) \ + for (auto& arr1 : array.data) { \ + for (auto& arr2 : arr1) { \ + for (auto& arr3 : arr2) { \ + for (auto& valName : arr3) { \ + codes; \ + } \ + } \ + } \ + } + +#define _Foreach4DK(array, valName, codes) \ + do { \ + int k1 = 0; \ + for (auto& arr1 : array.data) { \ + int k2 = 0; \ + for (auto& arr2 : arr1) { \ + int k3 = 0; \ + for (auto& arr3 : arr2) { \ + int k4 = 0; \ + for (auto& valName : arr3) { \ + codes; \ + ++k4; \ + } \ + ++k3; \ + } \ + ++k2; \ + } \ + ++k1; \ + } \ + } while (0) +}; + +// 类似一个tensor +template +struct NestedArray { + vector data; + vector dimensions; + vector dim_offset; + + NestedArray() { } + + template + NestedArray(Args... dims) { + init(dims...); + } + + template + void init(Args... dims) { + (dimensions.emplace_back(std::forward(dims)), ...); + // spdlog::info("dimensions: {}", dimensions.size()); + // for (auto& val : dimensions) spdlog::info("dim: {}", val); + dim_offset.resize(dimensions.size(), 1); + for (int i = dimensions.size() - 2; i >= 0; --i) { + dim_offset[i] = dim_offset[i + 1] * dimensions[i + 1]; + } + data.resize(dimensions[0] * dim_offset[0]); + // for (int i = 0; i < data.size(); ++i) data[i] = i; + // for (auto& val : dim_offset) spdlog::info("dim offset: {}", val); + } + + // 根据索引位置获取数据 + template + T& get(Args... keys) { + vector keyArr; + (keyArr.emplace_back(std::forward(keys)), ...); + assert(keyArr.size() == dimensions.size()); + int idx = 0; + for (int i = 0; i < keyArr.size(); ++i) { + idx += keyArr[i] * dim_offset[i]; + } + return data[idx]; + } + + // 根据关键字,在对应位置插入数据 + template + void put(T value, Args... keys) { + vector keyArr; + (keyArr.emplace_back(std::forward(keys)), ...); + assert(keyArr.size() == dimensions.size()); + int idx = 0; + for (int i = 0; i < keyArr.size(); ++i) { + idx += keyArr[i] * dim_offset[i]; + } + data[idx] = value; + } +}; \ No newline at end of file diff --git a/src/bqsr/qual_quantizer.h b/src/bqsr/qual_quantizer.h new file mode 100644 index 0000000..53651b5 --- /dev/null +++ b/src/bqsr/qual_quantizer.h @@ -0,0 +1,315 @@ +/* + Description: 量化质量分数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/26 +*/ +#pragma once + +#include +#include +#include +#include "qual_utils.h" + +using std::set; +using std::vector; + +/** + * A general algorithm for quantizing quality score distributions to use a specific number of levels + * + * Takes a histogram of quality scores and a desired number of levels and produces a + * map from original quality scores -> quantized quality scores. + * + * Note that this data structure is fairly heavy-weight, holding lots of debugging and + * calculation information. If you want to use it efficiently at scale with lots of + * read groups the right way to do this: + * + * Map> map + * for each read group rg: + * hist = getQualHist(rg) + * QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual) + * map.set(rg, qq.getOriginalToQuantizedMap()) + * + * This map would then be used to look up the appropriate original -> quantized + * quals for each read as it comes in. + */ + +struct QualQuantizer { + + /** + * Represents a contiguous interval of quality scores. + * + * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2 + */ + struct QualInterval { + int qStart, qEnd, fixedQual, level; + int64_t nObservations, nErrors; + set subIntervals; + + /** for debugging / visualization. When was this interval created? */ + int mergeOrder; + + void init(const int _qStart, const int _qEnd, const int64_t _nObservations, const int64_t _nErrors, const int _level, const int _fixedQual) { + qStart = _qStart; + qEnd = _qEnd; + nObservations = _nObservations; + nErrors = _nErrors; + level = _level; + fixedQual = _fixedQual; + } + + QualInterval() { + qStart = -1; + qEnd = -1; + nObservations = -1; + nErrors = -1; + fixedQual = -1; + level = -1; + mergeOrder = 0; + } + + QualInterval(const int qStart, const int qEnd, const int64_t nObservations, const int64_t nErrors, const int level) { + init(qStart, qEnd, nObservations, nErrors, level, -1); + } + + QualInterval(const int qStart, const int qEnd, const int64_t nObservations, const int64_t nErrors, const int level, + const set& _subIntervals) { + init(qStart, qEnd, nObservations, nErrors, level, -1); + subIntervals = _subIntervals; + } + + QualInterval(const int qStart, const int qEnd, const int64_t nObservations, const int64_t nErrors, const int level, const int fixedQual) { + init(qStart, qEnd, nObservations, nErrors, level, fixedQual); + } + + QualInterval(const int qStart, const int qEnd, const int64_t nObservations, const int64_t nErrors, const int level, const int fixedQual, + const set& _subIntervals) { + init(qStart, qEnd, nObservations, nErrors, level, fixedQual); + subIntervals = _subIntervals; + } + + /** + * @return Human readable name of this interval: e.g., 10-12 + */ + string getName() const { return std::to_string(qStart) + "-" + std::to_string(qEnd); } + + string toString() const { + return "QQ:" + getName(); + } + + /** + * @return true if this bin is using a fixed qual + */ + bool hasFixedQual() const { return fixedQual != -1; } + + /** + * @return the error rate (in real space) of this interval, or 0 if there are no observations + */ + double getErrorRate() const { + if (hasFixedQual()) + return QualityUtils::qualToErrorProb((uint8_t)fixedQual); + else if (nObservations == 0) + return 0.0; + else + return (nErrors + 1) / (1.0 * (nObservations + 1)); + } + + /** + * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. + */ + uint8_t getQual() const { + if (!hasFixedQual()) + return QualityUtils::errorProbToQual(getErrorRate()); + else + return (uint8_t)fixedQual; + } + + int compareTo(const QualInterval& qi) const { return qStart < qi.qStart ? -1 : (qStart == qi.qStart ? 0 : 1); } + + /** + * Create a interval representing the merge of this interval and toMerge + * + * Errors and observations are combined + * Subintervals updated in order of left to right (determined by qStart) + * Level is 1 + highest level of this and toMerge + * Order must be updated elsewhere + * + * @param toMerge + * @return newly created merged QualInterval + */ + QualInterval merge(const QualInterval& toMerge) const { + const QualInterval &left = this->compareTo(toMerge) < 0 ? *this : toMerge; + const QualInterval &right = this->compareTo(toMerge) < 0 ? toMerge : *this; + + if (left.qEnd + 1 != right.qStart) { + // throw new GATKException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); + std::cerr << "Attempting to merge non-contiguous intervals: left = " + left.toString() + " right = " + right.toString() << std::endl; + exit(1); + } + const int64_t nCombinedObs = left.nObservations + right.nObservations; + const int64_t nCombinedErr = left.nErrors + right.nErrors; + + const int level = std::max(left.level, right.level) + 1; + set subIntervals; + subIntervals.insert(left); + subIntervals.insert(right); + QualInterval merged(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals); + + return merged; + } + + double getPenalty(const int minInterestingQual) const { return calcPenalty(getErrorRate(), minInterestingQual); } + + /** + * Calculate the penalty of this interval, given the overall error rate for the interval + * + * If the globalErrorRate is e, this value is: + * + * sum_i |log10(e_i) - log10(e)| * nObservations_i + * + * each the index i applies to all leaves of the tree accessible from this interval + * (found recursively from subIntervals as necessary) + * + * @param globalErrorRate overall error rate in real space against which we calculate the penalty + * @return the cost of approximating the bins in this interval with the globalErrorRate + */ + double calcPenalty(const double globalErrorRate, const int minInterestingQual) const { + if (globalErrorRate == 0.0) // there were no observations, so there's no penalty + return 0.0; + + if (subIntervals.empty()) { + // this is leave node + if (this->qEnd <= minInterestingQual) + // It's free to merge up quality scores below the smallest interesting one + return 0; + else { + return (std::abs(std::log10(getErrorRate()) - std::log10(globalErrorRate))) * nObservations; + } + } else { + double sum = 0; + for (const QualInterval interval : subIntervals) sum += interval.calcPenalty(globalErrorRate, minInterestingQual); + return sum; + } + } + bool operator<(const QualInterval& o) const { + return qStart < o.qStart; + } + + QualInterval& operator=(const QualInterval& o) { + if (this == &o) return *this; + init(o.qStart, o.qEnd, o.nObservations, o.nErrors, o.level, o.fixedQual); + mergeOrder = o.mergeOrder; + subIntervals.clear(); + for (auto& val : o.subIntervals) subIntervals.insert(val); + return *this; + } + }; + + /** + * Inputs to the QualQuantizer + */ + const int nLevels, minInterestingQual; + vector& nObservationsPerQual; + + QualQuantizer(vector& _nObservationsPerQual, const int _nLevels, const int _minInterestingQual) + : nObservationsPerQual(_nObservationsPerQual), nLevels(_nLevels), minInterestingQual(_minInterestingQual) { + quantize(); + } + + /** Sorted set of qual intervals. + * + * After quantize() this data structure contains only the top-level qual intervals + */ + set quantizedIntervals; + /** + * Represents a contiguous interval of quality scores. + * + * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2 + */ + + void getOriginalToQuantizedMap(vector& quantMap) { + quantMap.resize(getNQualsInHistogram(), UINT8_MAX); + + for (auto& interval : quantizedIntervals) { + for (int q = interval.qStart; q <= interval.qEnd; q++) { + quantMap[q] = interval.getQual(); + } + } + // if (Collections.min(map) == Byte.MIN_VALUE) throw new GATKException("quantized quality score map contains an un-initialized value"); + } + + int getNQualsInHistogram() { return nObservationsPerQual.size(); } + + /** + * Main method for computing the quantization intervals. + * + * Invoked in the constructor after all input variables are initialized. Walks + * over the inputs and builds the min. penalty forest of intervals with exactly nLevel + * root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed + * to find the optimal combination. + * + * TODO: develop a smarter algorithm + * + * @return the forest of intervals with size == nLevels + */ + void quantize() { + // create intervals for each qual individually + auto& intervals = quantizedIntervals; + for (int qStart = 0; qStart < getNQualsInHistogram(); qStart++) { + const int64_t nObs = nObservationsPerQual.at(qStart); + const double errorRate = QualityUtils::qualToErrorProb((uint8_t)qStart); + const double nErrors = nObs * errorRate; + const QualInterval qi(qStart, qStart, nObs, (int)std::floor(nErrors), 0, (uint8_t)qStart); + intervals.insert(qi); + } + + // greedy algorithm: + // while ( n intervals >= nLevels ): + // find intervals to merge with least penalty + // merge it + while (intervals.size() > nLevels) { + mergeLowestPenaltyIntervals(intervals); + } + } + + /** + * Helper function that finds and merges together the lowest penalty pair of intervals + * @param intervals + */ + void mergeLowestPenaltyIntervals(set& intervals) { + // setup the iterators + auto it1 = intervals.begin(); + auto it1p = intervals.begin(); + ++it1p; // skip one + + // walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty + QualInterval minMerge; + // if (logger.isDebugEnabled()) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size()); + int lastMergeOrder = 0; + while (it1p != intervals.end()) { + const QualInterval& left = *it1; + const QualInterval& right = *it1p; + const QualInterval merged = left.merge(right); + lastMergeOrder = std::max(std::max(lastMergeOrder, left.mergeOrder), right.mergeOrder); + if (minMerge.qStart == -1 || (merged.getPenalty(minInterestingQual) < minMerge.getPenalty(minInterestingQual))) { + // if (logger.isDebugEnabled()) logger.debug(" Updating merge " + minMerge); + minMerge = merged; // merge two bins that when merged incur the lowest "penalty" + } + ++it1; + ++it1p; + } + // now actually go ahead and merge the minMerge pair + // if (logger.isDebugEnabled()) logger.debug(" => const min merge " + minMerge); + minMerge.mergeOrder = lastMergeOrder + 1; + // intervals.removeAll(minMerge.subIntervals); + for (auto &itr : minMerge.subIntervals) { + intervals.erase(itr); + } + intervals.insert(minMerge); + // if (logger.isDebugEnabled()) logger.debug("updated intervals: " + intervals); + } + + +}; \ No newline at end of file diff --git a/src/bqsr/qual_utils.cpp b/src/bqsr/qual_utils.cpp new file mode 100644 index 0000000..6c3123a --- /dev/null +++ b/src/bqsr/qual_utils.cpp @@ -0,0 +1,12 @@ +#include "qual_utils.h" + +const double QualityUtils::PHRED_TO_LOG_PROB_MULTIPLIER = -std::log(10) / 10.0; + +const double QualityUtils::LOG_PROB_TO_PHRED_MULTIPLIER = 1 / PHRED_TO_LOG_PROB_MULTIPLIER; + +const double QualityUtils::MIN_LOG10_SCALED_QUAL = std::log10(DBL_MIN); + +const double QualityUtils::MIN_PHRED_SCALED_QUAL = -10.0 * MIN_LOG10_SCALED_QUAL; + +double QualityUtils::qualToErrorProbCache[MAX_QUAL + 1]; +double QualityUtils::qualToProbLog10Cache[MAX_QUAL + 1]; \ No newline at end of file diff --git a/src/bqsr/qual_utils.h b/src/bqsr/qual_utils.h new file mode 100644 index 0000000..35d4c05 --- /dev/null +++ b/src/bqsr/qual_utils.h @@ -0,0 +1,406 @@ +/* + Description: 质量分数相关的工具函数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/25 +*/ +#pragma once + +#include +#include +#include +#include +#include + +#include "util/math/math_utils.h" + +using std::vector; + +struct QualityUtils { + + static void StaticInit() { + for (int i = 0; i <= MAX_QUAL; i++) { + qualToErrorProbCache[i] = qualToErrorProb((double)i); + qualToProbLog10Cache[i] = std::log10(1.0 - qualToErrorProbCache[i]); + } + } + + /** + * Maximum quality score that can be encoded in a SAM/BAM file + */ + + static constexpr uint8_t MAX_SAM_QUAL_SCORE = 93; + /** + * bams containing quals above this value are extremely suspicious and we should warn the user + */ + static constexpr uint8_t MAX_REASONABLE_Q_SCORE = 60; + + /** + * conversion factor from phred scaled quality to log error probability and vice versa + */ + + static const double PHRED_TO_LOG_PROB_MULTIPLIER; + + static const double LOG_PROB_TO_PHRED_MULTIPLIER; + + static const double MIN_LOG10_SCALED_QUAL; + + static const double MIN_PHRED_SCALED_QUAL; + + + /** + * The lowest quality score for a base that is considered reasonable for statistical analysis. This is + * because Q 6 => you stand a 25% of being right, which means all bases are equally likely + */ + + static constexpr uint8_t MIN_USABLE_Q_SCORE = 6; + + static constexpr int MAPPING_QUALITY_UNAVAILABLE = 255; + + /** + * Maximum sense quality value. + */ + + static constexpr int MAX_QUAL = 254; + + /** + * Cached values for qual as uint8_t calculations so they are very fast + */ + static double qualToErrorProbCache[MAX_QUAL + 1]; + static double qualToProbLog10Cache[MAX_QUAL + 1]; + + // ---------------------------------------------------------------------- + // + // These are all functions to convert a phred-scaled quality score to a probability + // + // ---------------------------------------------------------------------- + + /** + * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a discretized byte value, this function uses a cache so is very efficient + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a quality score (0-255) + * @return a probability (0.0-1.0) + */ + static double qualToProb(const uint8_t qual) { return 1.0 - qualToErrorProb(qual); } + + /** + * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a double value, this function must call Math.pow so can be quite expensive + * + * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) + * @return a probability (0.0-1.0) + */ + static double qualToProb(const double qual) { + // Utils.validateArg(qual >= 0.0, ()->"qual must be >= 0.0 but got " + qual); + return 1.0 - qualToErrorProb(qual); + } + + /** + * Convert a phred-scaled quality score to its log10 probability of being true (Q30 => log10(0.999)) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) + * @return a probability (0.0-1.0) + */ + static double qualToProbLog10(const uint8_t qual) { + return qualToProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. + } + + /** + * Convert a log-probability to a phred-scaled value ( log(0.001) => 30 ) + * + * @param prob a log-probability + * @return a phred-scaled value, not necessarily integral or bounded by {@code MAX_QUAL} + */ + static double logProbToPhred(const double prob) { return prob * LOG_PROB_TO_PHRED_MULTIPLIER; } + + /** + * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a double value, this function must call Math.pow so can be quite expensive + * + * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) + * @return a probability (0.0-1.0) + */ + + static double qualToErrorProb(const double qual) { + assert(qual >= 0.0); + return std::pow(10.0, qual / -10.0); + } + + /**v + * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a byte value, this function uses a cache so is very efficient + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a phred-scaled quality score encoded as a byte + * @return a probability (0.0-1.0) + */ + static double qualToErrorProb(const uint8_t qual) { + return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. + } + + /** + * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * The calculation is extremely efficient + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a phred-scaled quality score encoded as a byte + * @return a probability (0.0-1.0) + */ + static double qualToErrorProbLog10(const uint8_t qual) { return qualToErrorProbLog10((double)(qual & 0xFF)); } + + /** + * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * The calculation is extremely efficient + * + * @param qual a phred-scaled quality score encoded as a double + * @return log of probability (0.0-1.0) + */ + static double qualToErrorProbLog10(const double qual) { + // Utils.validateArg(qual >= 0.0, ()->"qual must be >= 0.0 but got " + qual); + return qual * -0.1; + } + + // ---------------------------------------------------------------------- + // + // Functions to convert a probability to a phred-scaled quality score + // + // ---------------------------------------------------------------------- + + /** + * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). + * + * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE + * and by 1 at the low-end. + * + * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% change of being wrong) + * @return a quality score (0-MAX_SAM_QUAL_SCORE) + */ + static uint8_t errorProbToQual(const double errorRate) { return errorProbToQual(errorRate, MAX_SAM_QUAL_SCORE); } + + /** + * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). + * + * Note, this function caps the resulting quality score by the public static value MIN_REASONABLE_ERROR + * and by 1 at the low-end. + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% change of being wrong) + * @return a quality score (0-maxQual) + */ + static uint8_t errorProbToQual(const double errorRate, const uint8_t maxQual) { + // Utils.validateArg(MathUtils.isValidProbability(errorRate), ()->"errorRate must be good probability but got " + errorRate); + const double d = std::round(-10.0 * std::log10(errorRate)); + return boundQual((int)d, maxQual); + } + + /** + * @see #errorProbToQual(double, byte) with proper conversion of maxQual integer to a byte + */ + static uint8_t errorProbToQual(const double prob, const int maxQual) { + // Utils.validateArg(maxQual >= 0 && maxQual <= 255, ()->"maxQual must be between 0-255 but got " + maxQual); + return errorProbToQual(prob, (uint8_t)(maxQual & 0xFF)); + } + + /** + * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). + * + * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE + * and by 1 at the low-end. + * + * @param prob a probability (0.0-1.0) of being right + * @return a quality score (0-MAX_SAM_QUAL_SCORE) + */ + static uint8_t trueProbToQual(const double prob) { return trueProbToQual(prob, MAX_SAM_QUAL_SCORE); } + + /** + * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). + * + * Note, this function caps the resulting quality score by the min probability allowed (EPS). + * So for example, if prob is 1e-6, which would imply a Q-score of 60, and EPS is 1e-4, + * the result of this function is actually Q40. + * + * Note that the resulting quality score, regardless of EPS, is capped by MAX_SAM_QUAL_SCORE and + * bounded on the low-side by 1. + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param trueProb a probability (0.0-1.0) of being right + * @param maxQual the maximum quality score we are allowed to emit here, regardless of the error rate + * @return a phred-scaled quality score (0-maxQualScore) as a byte + */ + static uint8_t trueProbToQual(const double trueProb, const uint8_t maxQual) { + // Utils.validateArg(MathUtils.isValidProbability(trueProb), ()->"trueProb must be good probability but got " + trueProb); + const double lp = std::round(-10.0 * MathUtils::log10OneMinusX(trueProb)); + return boundQual((int)lp, maxQual); + } + + /** + * @see #trueProbToQual(double, byte) with proper conversion of maxQual to a byte + */ + static uint8_t trueProbToQual(const double prob, const int maxQual) { + // Utils.validateArg(maxQual >= 0 && maxQual <= 255, ()->"maxQual must be between 0-255 but got " + maxQual); + return trueProbToQual(prob, (uint8_t)(maxQual & 0xFF)); + } + + /** + * Convert a probability of being right to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param trueRate the probability of being right (0.0-1.0) + * @return a phred-scaled version of the error rate implied by trueRate + */ + static double phredScaleCorrectRate(const double trueRate) { return phredScaleLog10ErrorRate(MathUtils::log10OneMinusX(trueRate)); } + + /** + * Convert a probability of being wrong to a phred-scaled quality score as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param errorRate the probability of being wrong (0.0-1.0) + * @return a phred-scaled version of the error rate + */ + static double phredScaleErrorRate(const double errorRate) { return phredScaleLog10ErrorRate(std::log10(errorRate)); } + + /** + * Convert a log10 probability of being wrong to a phred-scaled quality score as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param errorRateLog10 the log10 probability of being wrong (0.0-1.0). Can be -Infinity, in which case + * the result is MIN_PHRED_SCALED_QUAL + * @return a phred-scaled version of the error rate + */ + static double phredScaleLog10ErrorRate(const double errorRateLog10) { + // Utils.validateArg(MathUtils.isValidLog10Probability(errorRateLog10), ()->"errorRateLog10 must be good probability but got " + errorRateLog10); + + // abs is necessary for edge base with errorRateLog10 = 0 producing -0.0 doubles + return std::abs(-10.0 * std::max(errorRateLog10, MIN_LOG10_SCALED_QUAL)); + } + + /** + * Convert a log10 probability of being right to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param trueRateLog10 the log10 probability of being right (0.0-1.0). Can be -Infinity to indicate + * that the result is impossible in which MIN_PHRED_SCALED_QUAL is returned + * @return a phred-scaled version of the error rate implied by trueRate + */ + static double phredScaleLog10CorrectRate(const double trueRateLog10) { return phredScaleCorrectRate(std::pow(10.0, trueRateLog10)); } + + // ---------------------------------------------------------------------- + // + // Routines to bound a quality score to a reasonable range + // + // ---------------------------------------------------------------------- + + /** + * Return a quality score that bounds qual by MAX_SAM_QUAL_SCORE and 1 + * + * @param qual the uncapped quality score as an integer + * @return the bounded quality score + */ + static uint8_t boundQual(int qual) { return boundQual(qual, MAX_SAM_QUAL_SCORE); } + + /** + * Return a quality score that bounds qual by maxQual and 1 + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual the uncapped quality score as an integer. Can be < 0 (which may indicate an error in the + * client code), which will be brought back to 1, but this isn't an error, as some + * routines may use this functionality (BaseRecalibrator, for example) + * @param maxQual the maximum quality score, must be less < 255 + * @return the bounded quality score + */ + static uint8_t boundQual(const int qual, const uint8_t maxQual) { return (uint8_t)(std::max(std::min(qual, maxQual & 0xFF), 1) & 0xFF); } + + /** + * Calculate the sum of phred scores. + * @param phreds the phred score values. + * @return the sum. + */ + static double phredSum(const vector& phreds) { + switch (phreds.size()) { + case 0: + return DBL_MAX; + case 1: + return phreds[0]; + case 2: + return phredSum(phreds[0], phreds[1]); + case 3: + return phredSum(phreds[0], phreds[1], phreds[2]); + default: + vector log10Vals(phreds.size()); // todo: 优化加速 + for (int i = 0; i < log10Vals.size(); i++) { + log10Vals[i] = phreds[i] * -0.1; + } + return -10 * MathUtils::log10SumLog10(log10Vals); + } + } + + /** + * Calculate the sum of two phred scores. + *

+ * As any sum, this operation is commutative. + *

+ * @param a first phred score value. + * @param b second phred score value. + * @return the sum. + */ + static double phredSum(const double a, const double b) { return -10 * MathUtils::log10SumLog10(a * -.1, b * -.1); } + + /** + * Calculate the sum of three phred scores. + *

+ * As any sum, this operation is commutative. + *

+ * @param a first phred score value. + * @param b second phred score value. + * @param c thrid phred score value. + * @return the sum. + */ + static double phredSum(const double a, const double b, const double c) { return -10 * MathUtils::log10SumLog10(a * -.1, b * -.1, c * -.1); } +}; \ No newline at end of file diff --git a/src/bqsr/quant_info.h b/src/bqsr/quant_info.h new file mode 100644 index 0000000..968a5ac --- /dev/null +++ b/src/bqsr/quant_info.h @@ -0,0 +1,41 @@ +/* + Description: 做一些量化处理 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/24 +*/ +#pragma once + +#include +#include +#include "recal_tables.h" +#include "qual_utils.h" +#include "util/math/math_utils.h" +#include "qual_quantizer.h" + +using std::vector; + +struct QuantizationInfo { + vector quantizedQuals; + vector empiricalQualCounts; // 直方图 + int quantizationLevels; // 量化等级 + + QuantizationInfo() {} + + QuantizationInfo(RecalTables &recalTables, const int levels) { + quantizationLevels = levels; + empiricalQualCounts.resize(QualityUtils::MAX_SAM_QUAL_SCORE + 1, 0); + auto& qualTable = recalTables.qualityScoreTable; + _Foreach3D(qualTable, val, { + // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) + const int empiricalQual = MathUtils::fastRound(val.getEmpiricalQuality()); + empiricalQualCounts[empiricalQual] += val.getNumObservations(); // add the number of observations for every key + }); + + // quantizeQualityScores + QualQuantizer quantizer(empiricalQualCounts, levels, QualityUtils::MIN_USABLE_Q_SCORE); + quantizer.getOriginalToQuantizedMap(quantizedQuals); + } +}; \ No newline at end of file diff --git a/src/bqsr/read_recal_info.h b/src/bqsr/read_recal_info.h new file mode 100644 index 0000000..1915a89 --- /dev/null +++ b/src/bqsr/read_recal_info.h @@ -0,0 +1,76 @@ +/* + Description: 单个sam记录需要保存的数据 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/24 +*/ + +#pragma once + +#include "util/bam_wrap.h" +#include "covariate.h" + +struct ReadRecalInfo { + SamData& read; + int length; + PerReadCovariateMatrix& covariates; + vector& skips; + FastArray&base_quals, &ins_quals, &del_quals; + vector&snp_errs, &ins_errs, &del_errs; + + ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, vector& _skips, vector& _snp_errs, vector& _ins_errs, + vector& _del_errs) + : read(_read), + covariates(_covariates), + skips(_skips), + base_quals(_read.base_quals), + ins_quals(_read.ins_quals), + del_quals(_read.del_quals), + snp_errs(_snp_errs), + ins_errs(_ins_errs), + del_errs(_del_errs) { + length = _read.read_len; + } + + /** + * Get the qual score for event type at offset + * + * @param eventType the type of event we want the qual for + * @param offset the offset into this read for the qual + * @return a valid quality score for event at offset + */ + // 获取跟eventType对应的quality + uint8_t getQual(const EventTypeValue &event, const int offset) { + if (event == EventType::BASE_SUBSTITUTION) { + return base_quals[offset]; + } else if (event == EventType::BASE_INSERTION) { + return ins_quals[offset]; + } else { + return del_quals[offset]; + } + } + + /** + * Get the error fraction for event type at offset + * + * The error fraction is a value between 0 and 1 that indicates how much certainty we have + * in the error occurring at offset. A value of 1 means that the error definitely occurs at this + * site, a value of 0.0 means it definitely doesn't happen here. 0.5 means that half the weight + * of the error belongs here + * + * @param eventType the type of event we want the qual for + * @param offset the offset into this read for the qual + * @return a fractional weight for an error at this offset + */ + double getErrorFraction(const EventTypeValue& event, const int offset) { + if (event == EventType::BASE_SUBSTITUTION) { + return snp_errs[offset]; + } else if (event == EventType::BASE_INSERTION) { + return ins_errs[offset]; + } else { + return del_errs[offset]; + } + } +}; \ No newline at end of file diff --git a/src/bqsr/recal_datum.cpp b/src/bqsr/recal_datum.cpp new file mode 100644 index 0000000..cda0500 --- /dev/null +++ b/src/bqsr/recal_datum.cpp @@ -0,0 +1,3 @@ +#include "recal_datum.h" + +double RecalDatum::logPriorCache[MAX_GATK_USABLE_Q_SCORE + 1]; \ No newline at end of file diff --git a/src/bqsr/recal_datum.h b/src/bqsr/recal_datum.h new file mode 100644 index 0000000..976743b --- /dev/null +++ b/src/bqsr/recal_datum.h @@ -0,0 +1,246 @@ +/* + Description: bqsr计算过程中需要记录的一些数据 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/24 +*/ +#pragma once + +#include +#include +#include "qual_utils.h" +#include "util/math/normal_dist.h" +#include "util/math/math_utils.h" + +/** + * The container for the 4-tuple + * + * ( reported quality, empirical quality, num observations, num mismatches/errors ) + * + * for a given set of covariates. + * + */ + +struct RecalDatum { + static constexpr uint8_t MAX_RECALIBRATED_Q_SCORE = 93; // SAMUtils.MAX_PHRED_SCORE; + static constexpr int UNINITIALIZED_EMPIRICAL_QUALITY = -1; + static constexpr double MULTIPLIER = 100000.0; // See discussion in numMismatches about what the multiplier is. + /** + * used when calculating empirical qualities to avoid division by zero + */ + static constexpr int SMOOTHING_CONSTANT = 1; + + static constexpr uint64_t MAX_NUMBER_OF_OBSERVATIONS = INT_MAX - 1; + + /** + * Quals above this value should be capped down to this value (because they are too high) + * in the base quality score recalibrator + */ + static constexpr uint8_t MAX_GATK_USABLE_Q_SCORE = 40; + + static double logPriorCache[MAX_GATK_USABLE_Q_SCORE + 1]; + + + static void StaticInit() { + // normal distribution describing P(empiricalQuality - reportedQuality). Its mean is zero because a priori we expect + // no systematic bias in the reported quality score + const double mean = 0.0; + const double sigma = 0.5; // with these parameters, deltas can shift at most ~20 Q points + const NormalDistribution gaussian(mean, sigma); + for (int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++) { + logPriorCache[i] = gaussian.logDensity(i); + } + } + + /** + * Estimated reported quality score based on combined data's individual q-reporteds and number of observations. + * The estimating occurs when collapsing counts across different reported qualities. + */ + // 测序仪给出的原始质量分数 + double reportedQuality = 0.0; + + /** + * The empirical quality for datums that have been collapsed together (by read group and reported quality, for example). + * + * This variable was historically a double, but {@link #bayesianEstimateOfEmpiricalQuality} has always returned an integer qual score. + * Thus the type has been changed to integer in February 2025 to highlight this implementation detail. It does not change the output. + */ + // 计算出来的真实质量分数 + int empiricalQuality = 0; + + /** + * Number of bases seen in total + */ + // 这个字段也用来判断当前datum的有效性,只有到numObservations > 0时,这个datum才有效,因为如果为0,说明这个datum都没有出现过 + uint64_t numObservations = 0; + + /** + * Number of bases seen that didn't match the reference + * (actually sum of the error weights - so not necessarily a whole number) + * Stored with an internal multiplier to keep it closer to the floating-point sweet spot and avoid numerical error + * (see https://github.com/broadinstitute/gatk/wiki/Numerical-errors ). + * However, the value of the multiplier influences the results. + * For example, you get different results for 1000.0 and 10000.0 + * See MathUtilsUnitTest.testAddDoubles for a demonstration. + * The value of the MULTIPLIER that we found to give consistent results insensitive to sorting is 10000.0; + */ + double numMismatches = 0.0; + + RecalDatum() {} + + RecalDatum(const uint64_t _numObservations, const double _numMismatches, const uint8_t _reportedQuality) { + numObservations = _numObservations; + numMismatches = _numMismatches * MULTIPLIER; + reportedQuality = _reportedQuality; + empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY; + } + + void increment(const uint64_t incObservations, const double incMismatches) { + numObservations += incObservations; + numMismatches += (incMismatches * MULTIPLIER); // the multiplier used to avoid underflow, or something like that. + empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY; + } + + void increment(const uint64_t incObservations, const double incMismatches, int baseQuality) { + numObservations += incObservations; + numMismatches += (incMismatches * MULTIPLIER); // the multiplier used to avoid underflow, or something like that. + reportedQuality = baseQuality; + empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY; + } + + /** + * Add in all of the data from other into this object, updating the reported quality from the expected + * error rate implied by the two reported qualities. + * + * For example (the only example?), this method is called when collapsing the counts across reported quality scores within + * the same read group. + * + * @param other RecalDatum to combine + */ + void combine(const RecalDatum& other) { + // this is the *expected* (or theoretical) number of errors given the reported qualities and the number of observations. + double expectedNumErrors = this->calcExpectedErrors() + other.calcExpectedErrors(); + + // increment the counts + increment(other.getNumObservations(), other.getNumMismatches()); + + // we use the theoretical count above to compute the "estimated" reported quality + // after combining two datums with different reported qualities. + reportedQuality = -10 * log10(expectedNumErrors / getNumObservations()); + empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY; + } + + /** + * calculate the expected number of errors given the estimated Q reported and the number of observations + * in this datum. + * + * @return a positive (potentially fractional) estimate of the number of errors + */ + inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); } + inline double getNumMismatches() const { return numMismatches / MULTIPLIER; } + inline uint64_t getNumObservations() const { return numObservations; } + inline double getReportedQuality() const { return reportedQuality; } + + /** + * Computes the empirical quality of the datum, using the reported quality as the prior. + * @see #getEmpiricalQuality(double) below. + */ + double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); } + + /** + * Computes the empirical base quality (roughly (num errors)/(num observations)) from the counts stored in this datum. + */ + double getEmpiricalQuality(const double priorQualityScore) { + if (empiricalQuality == UNINITIALIZED_EMPIRICAL_QUALITY) { + calcEmpiricalQuality(priorQualityScore); + } + return empiricalQuality; + } + + /** + * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) + */ + void calcEmpiricalQuality(const double priorQualityScore) { + // smoothing is one error and one non-error observation + const uint64_t mismatches = (uint64_t)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; // TODO: why add 0.5? + const uint64_t observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; + + const int empiricalQual = bayesianEstimateOfEmpiricalQuality(observations, mismatches, priorQualityScore); + + empiricalQuality = std::min(empiricalQual, (int)MAX_RECALIBRATED_Q_SCORE); + } + + /** + * Compute the maximum a posteriori (MAP) estimate of the probability of sequencing error under the following model. + * + * Let + * X = number of sequencing errors, + * n = number of observations, + * theta = probability of sequencing error as a quality score, + * theta_rep = probability of sequencing error reported by the sequencing machine as a quality score. + * + * The prior and the likelihood are: + * + * P(theta|theta_rep) ~ Gaussian(theta - theta_rep| 0, 0.5) (Note this is done in log space) + * P(X|n, theta) ~ Binom(X|n,theta) + * + * Note the prior is equivalent to + * + * P(theta|theta_rep) ~ Gaussian(theta | theta_rep, 0.5) + * + * TODO: use beta prior to do away with the search. + * + * @param nObservations n in the model above. + * @param nErrors the observed number of sequencing errors. + * @param priorMeanQualityScore the prior quality score, often the reported quality score. + * + * @return phredScale quality score that maximizes the posterior probability. + */ + static int bayesianEstimateOfEmpiricalQuality(const uint64_t nObservations, const uint64_t nErrors, const double priorMeanQualityScore) { + const int numQualityScoreBins = (QualityUtils::MAX_REASONABLE_Q_SCORE + 1); + + double logPosteriors[numQualityScoreBins]; + for (int i = 0; i < numQualityScoreBins; ++i) { + logPosteriors[i] = getLogPrior(i, priorMeanQualityScore) + getLogBinomialLikelihood(i, nObservations, nErrors); + } + return MathUtils::maxElementIndex(logPosteriors, 0, numQualityScoreBins); + } + + static double getLogPrior(const double qualityScore, const double priorQualityScore) { + const int difference = std::min(std::abs((int)(qualityScore - priorQualityScore)), (int)MAX_GATK_USABLE_Q_SCORE); + return logPriorCache[difference]; + } + + /** + * Given: + * - n, the number of observations, + * - k, the number of sequencing errors, + * - p, the probability of error, encoded as the quality score. + * + * Return the binomial probability Bin(k|n,p). + * + * The method handles the case when the counts of type long are higher than the maximum allowed integer value, + * Integer.MAX_VALUE = (2^31)-1 ~= 2*10^9, since the library we use for binomial probability expects integer input. + * + */ + static double getLogBinomialLikelihood(const double qualityScore, uint64_t nObservations, uint64_t nErrors) { + if (nObservations == 0) + return 0.0; + + // the binomial code requires ints as input (because it does caching). This should theoretically be fine because + // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow + // before casting down to an int. + if (nObservations > MAX_NUMBER_OF_OBSERVATIONS) { + // we need to decrease nErrors by the same fraction that we are decreasing nObservations + const double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations; + nErrors = std::round((double)nErrors * fraction); + nObservations = MAX_NUMBER_OF_OBSERVATIONS; + } + + // this is just a straight binomial PDF + const double logLikelihood = MathUtils::logBinomialProbability((int)nObservations, (int)nErrors, QualityUtils::qualToErrorProb(qualityScore)); + return (std::isinf(logLikelihood) || std::isnan(logLikelihood)) ? -DBL_MAX : logLikelihood; + } +}; \ No newline at end of file diff --git a/src/bqsr/recal_tables.h b/src/bqsr/recal_tables.h new file mode 100644 index 0000000..7edd7fb --- /dev/null +++ b/src/bqsr/recal_tables.h @@ -0,0 +1,44 @@ +/* + Description: 保存bqsr计算的各种数据,根据这些数据生成bqsr table + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/24 +*/ +#pragma once + +#include "covariate.h" +#include "nested_array.h" +#include "recal_datum.h" + +struct RecalTables { + int qualDimension = 94; // MAX_SAM_QUAL_SCORE(93) + 1 + int eventDimension = EventType::EVENT_SIZE; + int numReadGroups; + + // These two tables are special + Array2D readGroupTable; + Array3D qualityScoreTable; + + // additional tables + Array4D contextTable; + Array4D cycleTable; + + // NestedArray testArr; + + RecalTables() {} + + RecalTables(int _numReadGroups) { init(_numReadGroups); } + + void init(int _numReadGroups) { + numReadGroups = _numReadGroups; + // 初始化readgroup和quality两个table + readGroupTable.init(numReadGroups, eventDimension); + qualityScoreTable.init(numReadGroups, qualDimension, eventDimension); + + // 初始化context和cycle两个table + contextTable.init(numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1, eventDimension); + cycleTable.init(numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1, eventDimension); + } +}; \ No newline at end of file diff --git a/src/bqsr/recal_utils.cpp b/src/bqsr/recal_utils.cpp new file mode 100644 index 0000000..2dfc1e0 --- /dev/null +++ b/src/bqsr/recal_utils.cpp @@ -0,0 +1,56 @@ +/* + Description: bqsr计算过程的辅助类 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/24 +*/ + +#include "recal_utils.h" + +/** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Note: we intentionally do not use varargs here to avoid the performance cost of allocating an array on every call. It showed on the profiler. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param key0, key1, key2 location in table of our item + */ +void RecalUtils::IncrementDatum3keys(Array3D table, uint8_t qual, double isError, int key0, int key1, int key2) { + RecalDatum &existingDatum = table.get(key0, key1, key2); + + if (existingDatum.numObservations == 0) { + // No existing item, put a new one + table.put(RecalDatum(1, qual, isError), key0, key1, key2); + } else { + // Easy case: already an item here, so increment it + existingDatum.increment(1, isError); + } +} + +/** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Note: we intentionally do not use varargs here to avoid the performance cost of allocating an array on every call. It showed on the profiler. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param key0, key1, key2, key3 location in table of our item + */ +void RecalUtils::IncrementDatum4keys(Array4D table, uint8_t qual, double isError, int key0, int key1, int key2, int key3) { + RecalDatum& existingDatum = table.get(key0, key1, key2, key3); + + if (existingDatum.numObservations == 0) { + // No existing item, put a new one + table.put(RecalDatum(1, qual, isError), key0, key1, key2, key3); + } else { + // Easy case: already an item here, so increment it + existingDatum.increment(1, isError); + } +} \ No newline at end of file diff --git a/src/bqsr/recal_utils.h b/src/bqsr/recal_utils.h new file mode 100644 index 0000000..072eeff --- /dev/null +++ b/src/bqsr/recal_utils.h @@ -0,0 +1,207 @@ +/* + Description: bqsr计算过程的辅助类 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/24 +*/ +#pragma once + +#include + +#include + +#include "bqsr_args.h" +#include "nested_array.h" +#include "quant_info.h" +#include "recal_datum.h" +#include "recal_tables.h" +#include "util/report_table.h" +#include "util/utils.h" +#include "covariate.h" + +struct RecalUtils { + + static constexpr int EMPIRICAL_QUAL_DECIMAL_PLACES = 4; + static constexpr int REPORTED_QUALITY_DECIMAL_PLACES = 4; + static constexpr int NUMBER_ERRORS_DECIMAL_PLACES = 2; + + // 根据每个read的key,在recalTable中添加对应的数据 + static void IncrementDatum3keys(Array3D table, uint8_t qual, double isError, int key0, int key1, int key2); + static void IncrementDatum4keys(Array4D table, uint8_t qual, double isError, int key0, int key1, int key2, int key3); + + // 输出bqsr报告 + static void outputRecalibrationReport(const BQSRArg& RAC, const QuantizationInfo& quantInfo, const RecalTables& recalTables) { + spdlog::info("output tables"); + // 输出文件 + FILE* fpout = fopen(RAC.OUTPUT_FILE.c_str(), "w"); + if (!fpout) { + fprintf(stderr, "Failed to open file %s.\n", RAC.OUTPUT_FILE.c_str()); + exit(1); + } + // 输出version信息 + fprintf(fpout, "%s\n", REPORT_HEADER_VERSION); + // 输出参数信息 + outputArgsTable(RAC, fpout); + // 输出量化质量分数信息 + outputQuantTable(quantInfo, fpout); + // 输出协变量信息 + outputCovariateTable(recalTables, fpout); + + fclose(fpout); + } + + // 输出运行时使用的参数信息 + static void outputArgsTable(const BQSRArg & p, FILE * fpout) { + ReportTable table("Arguments", "Recalibration argument collection values used in this run"); + table.addColumn({"Argument", "%s"}); + table.addColumn({"Value", ""}); + // 添加行数据 + table.addRowData({"covariate", "ReadGroupCovariate,QualityScoreCovariate,ContextCovariate,CycleCovariate"}); + table.addRowData({"no_standard_covs", ReportUtil::ToString(p.DO_NOT_USE_STANDARD_COVARIATES)}); + table.addRowData({"run_without_dbsnp", ReportUtil::ToString(p.RUN_WITHOUT_DBSNP)}); + table.addRowData({"solid_recal_mode", ReportUtil::ToString(p.SOLID_RECAL_MODE)}); + table.addRowData({"solid_nocall_strategy", ReportUtil::ToString(p.SOLID_NOCALL_STRATEGY)}); + table.addRowData({"mismatches_context_size", ReportUtil::ToString(p.MISMATCHES_CONTEXT_SIZE)}); + table.addRowData({"indels_context_size", ReportUtil::ToString(p.INDELS_CONTEXT_SIZE)}); + table.addRowData({"mismatches_default_quality", ReportUtil::ToString(p.MISMATCHES_DEFAULT_QUALITY)}); + table.addRowData({"deletions_default_quality", ReportUtil::ToString(p.DELETIONS_DEFAULT_QUALITY)}); + table.addRowData({"insertions_default_quality", ReportUtil::ToString(p.INSERTIONS_DEFAULT_QUALITY)}); + table.addRowData({"maximum_cycle_value", ReportUtil::ToString(p.MAXIMUM_CYCLE_VALUE)}); + table.addRowData({"low_quality_tail", ReportUtil::ToString(p.LOW_QUAL_TAIL)}); + table.addRowData({"default_platform", ReportUtil::ToString(p.DEFAULT_PLATFORM)}); + table.addRowData({"force_platform", ReportUtil::ToString(p.FORCE_PLATFORM)}); + table.addRowData({"quantizing_levels", ReportUtil::ToString(p.QUANTIZING_LEVELS)}); + table.addRowData({"recalibration_report", ReportUtil::ToString(p.existingRecalibrationReport)}); + table.addRowData({"binary_tag_name", ReportUtil::ToString(p.BINARY_TAG_NAME)}); + // 输出到文件 + table.write(fpout); + } + + // 输出量化质量分数table + static void outputQuantTable(const QuantizationInfo& q, FILE* fpout) { + ReportTable table("Quantized", "Quality quantization map"); + table.addColumn({"QualityScore", "%d"}); + table.addColumn({"Count", "%d"}); + table.addColumn({"QuantizedScore", "%d"}); + + for (int qual = 0; qual <= QualityUtils::MAX_SAM_QUAL_SCORE; ++qual) { + table.addRowData({ReportUtil::ToString(qual), ReportUtil::ToString(q.empiricalQualCounts[qual]), ReportUtil::ToString(q.quantizedQuals[qual])}); + } + + table.write(fpout); + } + + // 输出协变量table + static void outputCovariateTable(const RecalTables &r, FILE* fpout) { + // 1. read group covariates + outputReadGroupTable(r, fpout); + // 2. quality score covariates + outputQualityScoreTable(r, fpout); + // 3. context and cycle covariates + outputContextCycleTable(r, fpout); + } + + // read group table + static void outputReadGroupTable(const RecalTables& r, FILE* fpout) { + ReportTable table("RecalTable0"); + table.addColumn({"ReadGroup", "%s"}); + table.addColumn({"EventType", "%s"}); + table.addColumn({"EmpiricalQuality", "%.4f"}); + table.addColumn({"EstimatedQReported", "%.4f"}); + table.addColumn({"Observations", "%d"}); + table.addColumn({"Errors", "%.2f"}); + + spdlog::info("rg0: {}, {}, {}, {}", r.readGroupTable[0][0].numObservations, r.readGroupTable[0][0].numMismatches, + r.readGroupTable[0][0].reportedQuality, r.readGroupTable[0][0].empiricalQuality); + + _Foreach2DK(r.readGroupTable, datum, { + RecalDatum &dat = const_cast(datum); + spdlog::info("errors: {}, {}", datum.numMismatches, dat.getNumMismatches()); + spdlog::info("obs: {}, {}", datum.numObservations, dat.getNumObservations()); + if (dat.getNumObservations() > 0) { + table.addRowData({ + ReadGroupCovariate::IdToRg[k1], + ReportUtil::ToString(EventType::EVENTS[k2].representation), + ReportUtil::ToString(dat.getEmpiricalQuality(), 4), + ReportUtil::ToString(dat.getReportedQuality(), 4), + ReportUtil::ToString(dat.getNumObservations()), + ReportUtil::ToString(dat.getNumMismatches(), 2) + }); + } + }); + table.write(fpout); + } + + // quality table + static void outputQualityScoreTable(const RecalTables& r, FILE* fpout) { + ReportTable table("RecalTable1"); + table.addColumn({"ReadGroup", "%s"}); + table.addColumn({"QualityScore", "%d"}); + table.addColumn({"EventType", "%s"}); + table.addColumn({"EmpiricalQuality", "%.4f"}); + table.addColumn({"Observations", "%d"}); + table.addColumn({"Errors", "%.2f"}); + + _Foreach3DK(r.qualityScoreTable, datum, { + RecalDatum &dat = const_cast(datum); + if (dat.getNumObservations() > 0) { + table.addRowData({ + ReadGroupCovariate::IdToRg[k1], + ReportUtil::ToString(k2), + ReportUtil::ToString(EventType::EVENTS[k3].representation), + ReportUtil::ToString(dat.getEmpiricalQuality(), 4), + ReportUtil::ToString(dat.getNumObservations()), + ReportUtil::ToString(dat.getNumMismatches(), 2) + }); + } + }); + table.write(fpout); + } + + static void outputContextCycleTable(const RecalTables& r, FILE* fpout) { + ReportTable table("RecalTable2"); + table.addColumn({"ReadGroup", "%s"}); + table.addColumn({"QualityScore", "%d"}); + table.addColumn({"CovariateValue", "%s"}); + table.addColumn({"CovariateName", "%s"}); + table.addColumn({"EventType", "%s"}); + table.addColumn({"EmpiricalQuality", "%.4f"}); + table.addColumn({"Observations", "%d"}); + table.addColumn({"Errors", "%.2f"}); + + _Foreach4DK(r.contextTable, datum, { + RecalDatum &dat = const_cast(datum); + if (dat.getNumObservations() > 0) { + table.addRowData({ + ReadGroupCovariate::IdToRg[k1], + ReportUtil::ToString(k2), + ReportUtil::ToString(ContextCovariate::ContextFromKey(k3)), + "Context", + ReportUtil::ToString(EventType::EVENTS[k4].representation), + ReportUtil::ToString(dat.getEmpiricalQuality(), 4), + ReportUtil::ToString(dat.getNumObservations()), + ReportUtil::ToString(dat.getNumMismatches(), 2) + }); + } + }); + + _Foreach4DK(r.cycleTable, datum, { + RecalDatum &dat = const_cast(datum); + if (dat.getNumObservations() > 0) { + table.addRowData({ + ReadGroupCovariate::IdToRg[k1], + ReportUtil::ToString(k2), + ReportUtil::ToString(CycleCovariate::CycleFromKey(k3)), + "Cycle", + ReportUtil::ToString(EventType::EVENTS[k4].representation), + ReportUtil::ToString(dat.getEmpiricalQuality(), 4), + ReportUtil::ToString(dat.getNumObservations()), + ReportUtil::ToString(dat.getNumMismatches(), 2) + }); + } + }); + table.write(fpout); + } +}; \ No newline at end of file diff --git a/src/util/bam_wrap.h b/src/util/bam_wrap.h index 9175f15..611b67d 100644 --- a/src/util/bam_wrap.h +++ b/src/util/bam_wrap.h @@ -42,6 +42,12 @@ struct FastArray { size_t idx; void clear() { idx = 0; } size_t size() { return idx; } + bool empty() { return idx == 0; } + void reserve(size_t _size) { arr.reserve(_size); } + void resize(size_t _size) { + arr.resize(_size); + idx = _size; + } void push_back(const T& val) { if (idx < arr.size()) { arr[idx++] = val; @@ -50,6 +56,7 @@ struct FastArray { idx++; } } + inline T& operator[](size_t pos) { return arr[pos]; } struct iterator { typename std::vector::iterator it; iterator(typename std::vector::iterator _it) : it(_it) {} @@ -65,6 +72,7 @@ struct FastArray { }; // 对原始bam数据的补充,比如对两端进行hardclip等 +class BamWrap; struct SamData { int read_len = 0; // read长度,各种clip之后的长度 int cigar_start = 0; // cigar起始位置,闭区间 @@ -77,38 +85,43 @@ struct SamData { // 记录一下bqsr运算过程中用到的数据,回头提前计算一下,修正现在的复杂逻辑 static constexpr int READ_INDEX_NOT_FOUND = -1; - string bases; // 处理之后的read的碱基 - vector quals; // 对应的质量分数 - int64_t start_pos; // 因为soft clip都被切掉了,这里的softstart应该就是切掉之后的匹配位点,闭区间 - int64_t end_pos; // 同上,闭区间 - FastArray cigars; - int64_t& softStart() { return start_pos; } - int64_t& softEnd() { return end_pos; } - // functions - ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) { - ReadIdxCigar rc; - if (refPos < start_pos) - return rc; - int firstReadPosOfElement = 0; // inclusive - int firstRefPosOfElement = start_pos; // inclusive - int lastReadPosOfElement = 0; // exclusive - int lastRefPosOfElement = start_pos; // exclusive - // advance forward through all the cigar elements until we bracket the reference coordinate - for (auto& cigar : cigars) { - firstReadPosOfElement = lastReadPosOfElement; - firstRefPosOfElement = lastRefPosOfElement; - lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0; - lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0; - if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element - int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0); - rc.cigarOp = cigar.op; - rc.readIdx = readPosAtRefCoord; - return rc; - } - } + BamWrap* bw; + int64_t start_pos; // 因为soft clip都被切掉了,这里的softstart应该就是切掉之后的匹配位点,闭区间 + int64_t end_pos; // 同上,闭区间 + string bases; // 处理之后的read的碱基 + FastArray base_quals; // 对应的质量分数 + FastArray ins_quals; // insert质量分数, BI (大部分应该都没有) + FastArray del_quals; // delete质量分数, BD (大部分应该都没有) + + FastArray cigars; + int64_t& softStart() { return start_pos; } + int64_t& softEnd() { return end_pos; } + + // functions + ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) { + ReadIdxCigar rc; + if (refPos < start_pos) return rc; + int firstReadPosOfElement = 0; // inclusive + int firstRefPosOfElement = start_pos; // inclusive + int lastReadPosOfElement = 0; // exclusive + int lastRefPosOfElement = start_pos; // exclusive + // advance forward through all the cigar elements until we bracket the reference coordinate + for (auto& cigar : cigars) { + firstReadPosOfElement = lastReadPosOfElement; + firstRefPosOfElement = lastRefPosOfElement; + lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0; + lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0; + if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element + int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0); + rc.cigarOp = cigar.op; + rc.readIdx = readPosAtRefCoord; + return rc; + } } + return rc; + } }; /* diff --git a/src/util/math/binomial_dist.h b/src/util/math/binomial_dist.h new file mode 100644 index 0000000..0ec4870 --- /dev/null +++ b/src/util/math/binomial_dist.h @@ -0,0 +1,39 @@ +/* + Description: 二项式分布相关函数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/26 +*/ +#pragma once + +#include +#include +#include +#include "saddle_pe.h" + +struct BinomialDistribution { + /** The number of trials. */ + int numberOfTrials; + /** The probability of success. */ + double probabilityOfSuccess; + + BinomialDistribution(int trials, double p) { + probabilityOfSuccess = p; + numberOfTrials = trials; + } + + double logProbability(int x) { + if (numberOfTrials == 0) { + return (x == 0) ? 0. : -std::numeric_limits::infinity(); + } + double ret; + if (x < 0 || x > numberOfTrials) { + ret = -std::numeric_limits::infinity(); + } else { + ret = SaddlePointExpansion::logBinomialProbability(x, numberOfTrials, probabilityOfSuccess, 1.0 - probabilityOfSuccess); + } + return ret; + } +}; diff --git a/src/util/math/continued_frac.h b/src/util/math/continued_frac.h new file mode 100644 index 0000000..e5fc0fe --- /dev/null +++ b/src/util/math/continued_frac.h @@ -0,0 +1,162 @@ +/* + Description: ContinuedFraction + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/25 +*/ +#pragma once + +#include + +#include +#include + +#include "math_const.h" +#include "precision.h" + +//using ContinuedFractionGetFunc = std::function(); +// using ContinuedFractionGetFunc = double (*)(int n, double x); + +struct ContinuedFraction { + /** Maximum allowed numerical error. */ + static constexpr double DEFAULT_EPSILON = 10e-9; + + // ContinuedFraction(ContinuedFractionGetFunc _getA, ContinuedFractionGetFunc _getB) { + // getA = _getA; + // getB = _getB; + // } + /** + * Access the n-th a coefficient of the continued fraction. Since a can be + * a function of the evaluation point, x, that is passed in as well. + * @param n the coefficient index to retrieve. + * @param x the evaluation point. + * @return the n-th a coefficient. + */ + //ContinuedFractionGetFunc getA; + virtual double getA(int n, double x) = 0; + + /** + * Access the n-th b coefficient of the continued fraction. Since b can be + * a function of the evaluation point, x, that is passed in as well. + * @param n the coefficient index to retrieve. + * @param x the evaluation point. + * @return the n-th b coefficient. + */ + //ContinuedFractionGetFunc getB; + virtual double getB(int n, double x) = 0; + + /** + * Evaluates the continued fraction at the value x. + * @param x the evaluation point. + * @return the value of the continued fraction evaluated at x. + * @throws ConvergenceException if the algorithm fails to converge. + */ + double evaluate(double x) { return evaluate(x, DEFAULT_EPSILON, MathConst::INT_MAX_VALUE); } + + /** + * Evaluates the continued fraction at the value x. + * @param x the evaluation point. + * @param epsilon maximum error allowed. + * @return the value of the continued fraction evaluated at x. + * @throws ConvergenceException if the algorithm fails to converge. + */ + double evaluate(double x, double epsilon) { return evaluate(x, epsilon, MathConst::INT_MAX_VALUE); } + + /** + * Evaluates the continued fraction at the value x. + * @param x the evaluation point. + * @param maxIterations maximum number of convergents + * @return the value of the continued fraction evaluated at x. + * @throws ConvergenceException if the algorithm fails to converge. + * @throws MaxCountExceededException if maximal number of iterations is reached + */ + double evaluate(double x, int maxIterations) { + return evaluate(x, DEFAULT_EPSILON, maxIterations); + } + + /** + * Evaluates the continued fraction at the value x. + *

+ * The implementation of this method is based on the modified Lentz algorithm as described + * on page 18 ff. in: + *

+ * Note: the implementation uses the terms ai and bi as defined in + * Continued Fraction @ MathWorld. + *

+ * + * @param x the evaluation point. + * @param epsilon maximum error allowed. + * @param maxIterations maximum number of convergents + * @return the value of the continued fraction evaluated at x. + * @throws ConvergenceException if the algorithm fails to converge. + * @throws MaxCountExceededException if maximal number of iterations is reached + */ + double evaluate(double x, double epsilon, int maxIterations) { + const double small = 1e-50; + double hPrev = getA(0, x); + + // use the value of small as epsilon criteria for zero checks + if (Precision::equals(hPrev, 0.0, small)) { + hPrev = small; + } + + int n = 1; + double dPrev = 0.0; + double cPrev = hPrev; + double hN = hPrev; + + while (n < maxIterations) { + const double a = getA(n, x); + const double b = getB(n, x); + + double dN = a + b * dPrev; + if (Precision::equals(dN, 0.0, small)) { + dN = small; + } + double cN = a + b / cPrev; + if (Precision::equals(cN, 0.0, small)) { + cN = small; + } + + dN = 1 / dN; + const double deltaN = cN * dN; + hN = hPrev * deltaN; + + if (std::isinf(hN)) { + std::cerr << "ConvergenceException INFINITY" << std::endl; + exit(1); + // throw new ConvergenceException(LocalizedFormats.CONTINUED_FRACTION_INFINITY_DIVERGENCE, x); + } + if (std::isnan(hN)) { + std::cerr << "ConvergenceException NAN" << std::endl; + exit(1); + // throw new ConvergenceException(LocalizedFormats.CONTINUED_FRACTION_NAN_DIVERGENCE, x); + } + + if (std::abs(deltaN - 1.0) < epsilon) { + break; + } + + dPrev = dN; + cPrev = cN; + hPrev = hN; + n++; + } + + if (n >= maxIterations) { + std::cerr << "MaxCountExceededException max iterations" << std::endl; + exit(1); + // throw new MaxCountExceededException(LocalizedFormats.NON_CONVERGENT_CONTINUED_FRACTION, maxIterations, x); + } + + return hN; + } +}; \ No newline at end of file diff --git a/src/util/math/gamma.cpp b/src/util/math/gamma.cpp new file mode 100644 index 0000000..401adaf --- /dev/null +++ b/src/util/math/gamma.cpp @@ -0,0 +1,11 @@ +#include "gamma.h" + +const double Gamma::LANCZOS[] = { + 0.99999999999999709182, 57.156235665862923517, -59.597960355475491248, 14.136097974741747174, -0.49191381609762019978, + .33994649984811888699e-4, .46523628927048575665e-4, -.98374475304879564677e-4, .15808870322491248884e-3, -.21026444172410488319e-3, + .21743961811521264320e-3, -.16431810653676389022e-3, .84418223983852743293e-4, -.26190838401581408670e-4, .36899182659531622704e-5, +}; + +const int Gamma::LANCZOS_LEN = sizeof(LANCZOS) / sizeof(LANCZOS[0]); + +const double Gamma::HALF_LOG_2_PI = 0.5 * std::log(TWO_PI); \ No newline at end of file diff --git a/src/util/math/gamma.h b/src/util/math/gamma.h new file mode 100644 index 0000000..c5c453b --- /dev/null +++ b/src/util/math/gamma.h @@ -0,0 +1,644 @@ +/* + Description: Gamma相关的工具函数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/25 +*/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "math_const.h" +#include "continued_frac.h" + +struct Gamma { + /** + * Euler-Mascheroni constant + * @since 2.0 + */ + static constexpr double GAMMA = 0.577215664901532860606512090082; + + static constexpr double TWO_PI = 2 * (105414357.0 / 33554432.0 + 1.984187159361080883e-9); + + /** + * The value of the {@code g} constant in the Lanczos approximation, see + * {@link #lanczos(double)}. + * @since 3.1 + */ + static constexpr double LANCZOS_G = 607.0 / 128.0; + + /** Maximum allowed numerical error. */ + static constexpr double DEFAULT_EPSILON = 10e-15; + + /** Lanczos coefficients */ + static const double LANCZOS[]; + static const int LANCZOS_LEN; + + /** Avoid repeated computation of log of 2 PI in logGamma */ + + static const double HALF_LOG_2_PI; + + /** The constant value of √(2π). */ + + static constexpr double SQRT_TWO_PI = 2.506628274631000502; + + // limits for switching algorithm in digamma + /** C limit. */ + + static constexpr double C_LIMIT = 49; + + /** S limit. */ + + static constexpr double S_LIMIT = 1e-5; + + /* + * Constants for the computation of double invGamma1pm1(double). + * Copied from DGAM1 in the NSWC library. + */ + + /** The constant {@code A0} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_A0 = .611609510448141581788E-08; + + /** The constant {@code A1} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_A1 = .624730830116465516210E-08; + + /** The constant {@code B1} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_B1 = .203610414066806987300E+00; + + /** The constant {@code B2} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_B2 = .266205348428949217746E-01; + + /** The constant {@code B3} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_B3 = .493944979382446875238E-03; + + /** The constant {@code B4} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_B4 = -.851419432440314906588E-05; + + /** The constant {@code B5} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_B5 = -.643045481779353022248E-05; + + /** The constant {@code B6} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_B6 = .992641840672773722196E-06; + + /** The constant {@code B7} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_B7 = -.607761895722825260739E-07; + + /** The constant {@code B8} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_B8 = .195755836614639731882E-09; + + /** The constant {@code P0} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_P0 = .6116095104481415817861E-08; + + /** The constant {@code P1} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_P1 = .6871674113067198736152E-08; + + /** The constant {@code P2} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_P2 = .6820161668496170657918E-09; + + /** The constant {@code P3} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_P3 = .4686843322948848031080E-10; + + /** The constant {@code P4} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_P4 = .1572833027710446286995E-11; + + /** The constant {@code P5} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_P5 = -.1249441572276366213222E-12; + + /** The constant {@code P6} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_P6 = .4343529937408594255178E-14; + + /** The constant {@code Q1} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_Q1 = .3056961078365221025009E+00; + + /** The constant {@code Q2} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_Q2 = .5464213086042296536016E-01; + + /** The constant {@code Q3} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_Q3 = .4956830093825887312020E-02; + + /** The constant {@code Q4} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_Q4 = .2692369466186361192876E-03; + + /** The constant {@code C} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C = -.422784335098467139393487909917598E+00; + + /** The constant {@code C0} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C0 = .577215664901532860606512090082402E+00; + + /** The constant {@code C1} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C1 = -.655878071520253881077019515145390E+00; + + /** The constant {@code C2} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C2 = -.420026350340952355290039348754298E-01; + + /** The constant {@code C3} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C3 = .166538611382291489501700795102105E+00; + + /** The constant {@code C4} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C4 = -.421977345555443367482083012891874E-01; + + /** The constant {@code C5} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C5 = -.962197152787697356211492167234820E-02; + + /** The constant {@code C6} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C6 = .721894324666309954239501034044657E-02; + + /** The constant {@code C7} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C7 = -.116516759185906511211397108401839E-02; + + /** The constant {@code C8} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C8 = -.215241674114950972815729963053648E-03; + + /** The constant {@code C9} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C9 = .128050282388116186153198626328164E-03; + + /** The constant {@code C10} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C10 = -.201348547807882386556893914210218E-04; + + /** The constant {@code C11} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C11 = -.125049348214267065734535947383309E-05; + + /** The constant {@code C12} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C12 = .113302723198169588237412962033074E-05; + + /** The constant {@code C13} defined in {@code DGAM1}. */ + static constexpr double INV_GAMMA1P_M1_C13 = -.205633841697760710345015413002057E-06; + + /** + * Default constructor. Prohibit instantiation. + */ +private: + Gamma() {} + +public: + + /** + *

+ * Returns the value of log Γ(x) for x > 0. + *

+ *

+ * For x ≤ 8, the implementation is based on the double precision + * implementation in the NSWC Library of Mathematics Subroutines, + * {@code DGAMLN}. For x > 8, the implementation is based on + *

+ * + * + * @param x Argument. + * @return the value of {@code log(Gamma(x))}, {@code Double.NaN} if + * {@code x <= 0.0}. + */ + static double logGamma(double x) { + double ret; + + if (std::isnan(x) || (x <= 0.0)) { + ret = MathConst::DOUBLE_NAN; + } else if (x < 0.5) { + return logGamma1p(x) - std::log(x); + } else if (x <= 2.5) { + return logGamma1p((x - 0.5) - 0.5); + } else if (x <= 8.0) { + const int n = (int)std::floor(x - 1.5); + double prod = 1.0; + for (int i = 1; i <= n; i++) { + prod *= x - i; + } + return logGamma1p(x - (n + 1)) + std::log(prod); + } else { + double sum = lanczos(x); + double tmp = x + LANCZOS_G + .5; + ret = ((x + .5) * std::log(tmp)) - tmp + HALF_LOG_2_PI + std::log(sum / x); + } + + return ret; + } + + /** + * Returns the regularized gamma function P(a, x). + * + * @param a Parameter. + * @param x Value. + * @return the regularized gamma function P(a, x). + * @throws MaxCountExceededException if the algorithm fails to converge. + */ + static double regularizedGammaP(double a, double x) { return regularizedGammaP(a, x, DEFAULT_EPSILON, INT_MAX); } + + /** + * Returns the regularized gamma function P(a, x). + * + * The implementation of this method is based on: + * + * + * @param a the a parameter. + * @param x the value. + * @param epsilon When the absolute value of the nth item in the + * series is less than epsilon the approximation ceases to calculate + * further elements in the series. + * @param maxIterations Maximum number of "iterations" to complete. + * @return the regularized gamma function P(a, x) + * @throws MaxCountExceededException if the algorithm fails to converge. + */ + static double regularizedGammaP(double a, double x, double epsilon, int maxIterations) { + double ret; + + if (std::isnan(a) || std::isnan(x) || (a <= 0.0) || (x < 0.0)) { + ret = std::numeric_limits::quiet_NaN(); + } else if (x == 0.0) { + ret = 0.0; + } else if (x >= a + 1) { + // use regularizedGammaQ because it should converge faster in this + // case. + ret = 1.0 - regularizedGammaQ(a, x, epsilon, maxIterations); + } else { + // calculate series + double n = 0.0; // current element index + double an = 1.0 / a; // n-th element in the series + double sum = an; // partial sum + while (std::abs(an / sum) > epsilon && n < maxIterations && sum < std::numeric_limits::infinity()) { + // compute next element in the series + n += 1.0; + an *= x / (a + n); + + // update partial sum + sum += an; + } + if (n >= maxIterations) { + std::cerr << "Reach max iterations!" << std::endl; + exit(1); + } else if (std::isinf(sum)) { + ret = 1.0; + } else { + ret = std::exp(-x + (a * std::log(x)) - logGamma(a)) * sum; + } + } + return ret; + } + + /** + * Returns the regularized gamma function Q(a, x) = 1 - P(a, x). + * + * @param a the a parameter. + * @param x the value. + * @return the regularized gamma function Q(a, x) + * @throws MaxCountExceededException if the algorithm fails to converge. + */ + static double regularizedGammaQ(double a, double x) { return regularizedGammaQ(a, x, DEFAULT_EPSILON, INT_MAX); } + + struct ContinuedFractionInGamma : ContinuedFraction { + double a; + ContinuedFractionInGamma(double _a) : a(_a) {} + double getA(int n, double x) { return ((2.0 * n) + 1.0) - a + x; }; + double getB(int n, double x) { return n * (a - n); }; + }; + /** + * Returns the regularized gamma function Q(a, x) = 1 - P(a, x). + * + * The implementation of this method is based on: + * + * + * @param a the a parameter. + * @param x the value. + * @param epsilon When the absolute value of the nth item in the + * series is less than epsilon the approximation ceases to calculate + * further elements in the series. + * @param maxIterations Maximum number of "iterations" to complete. + * @return the regularized gamma function P(a, x) + * @throws MaxCountExceededException if the algorithm fails to converge. + */ + static double regularizedGammaQ(const double a, double x, double epsilon, int maxIterations) { + double ret; + + if (std::isnan(a) || std::isnan(x) || (a <= 0.0) || (x < 0.0)) { + ret = MathConst::DOUBLE_NAN; + } else if (x == 0.0) { + ret = 1.0; + } else if (x < a + 1.0) { + // use regularizedGammaP because it should converge faster in this + // case. + ret = 1.0 - regularizedGammaP(a, x, epsilon, maxIterations); + } else { + // create continued fraction + ContinuedFractionInGamma cf(a); + ret = 1.0 / cf.evaluate(x, epsilon, maxIterations); + ret = std::exp(-x + (a * std::log(x)) - logGamma(a)) * ret; + } + + return ret; + } + + /** + *

Computes the digamma function of x.

+ * + *

This is an independently written implementation of the algorithm described in + * Jose Bernardo, Algorithm AS 103: Psi (Digamma) Function, Applied Statistics, 1976.

+ * + *

Some of the constants have been changed to increase accuracy at the moderate expense + * of run-time. The result should be accurate to within 10^-8 absolute tolerance for + * x >= 10^-5 and within 10^-8 relative tolerance for x > 0.

+ * + *

Performance for large negative values of x will be quite expensive (proportional to + * |x|). Accuracy for negative values of x should be about 10^-8 absolute for results + * less than 10^5 and 10^-8 relative for results larger than that.

+ * + * @param x Argument. + * @return digamma(x) to within 10-8 relative or absolute error whichever is smaller. + * @see Digamma + * @see Bernardo's original article + * @since 2.0 + */ + static double digamma(double x) { + if (x > 0 && x <= S_LIMIT) { + // use method 5 from Bernardo AS103 + // accurate to O(x) + return -GAMMA - 1 / x; + } + + if (x >= C_LIMIT) { + // use method 4 (accurate to O(1/x^8) + double inv = 1 / (x * x); + // 1 1 1 1 + // log(x) - --- - ------ + ------- - ------- + // 2 x 12 x^2 120 x^4 252 x^6 + return std::log(x) - 0.5 / x - inv * ((1.0 / 12) + inv * (1.0 / 120 - inv / 252)); + } + + return digamma(x + 1) - 1 / x; + } + + /** + * Computes the trigamma function of x. + * This function is derived by taking the derivative of the implementation + * of digamma. + * + * @param x Argument. + * @return trigamma(x) to within 10-8 relative or absolute error whichever is smaller + * @see Trigamma + * @see Gamma#digamma(double) + * @since 2.0 + */ + static double trigamma(double x) { + if (x > 0 && x <= S_LIMIT) { + return 1 / (x * x); + } + + if (x >= C_LIMIT) { + double inv = 1 / (x * x); + // 1 1 1 1 1 + // - + ---- + ---- - ----- + ----- + // x 2 3 5 7 + // 2 x 6 x 30 x 42 x + return 1 / x + inv / 2 + inv / x * (1.0 / 6 - inv * (1.0 / 30 + inv / 42)); + } + + return trigamma(x + 1) + 1 / (x * x); + } + + /** + *

+ * Returns the Lanczos approximation used to compute the gamma function. + * The Lanczos approximation is related to the Gamma function by the + * following equation + *

+ * {@code gamma(x) = sqrt(2 * pi) / x * (x + g + 0.5) ^ (x + 0.5) + * * exp(-x - g - 0.5) * lanczos(x)}, + *
+ * where {@code g} is the Lanczos constant. + *

+ * + * @param x Argument. + * @return The Lanczos approximation. + * @see Lanczos Approximation + * equations (1) through (5), and Paul Godfrey's + * Note on the computation + * of the convergent Lanczos complex Gamma approximation + * @since 3.1 + */ + static double lanczos(const double x) { + double sum = 0.0; + for (int i = LANCZOS_LEN - 1; i > 0; --i) { + sum += LANCZOS[i] / (x + i); + } + return sum + LANCZOS[0]; + } + + /** + * Returns the value of 1 / Γ(1 + x) - 1 for -0.5 ≤ x ≤ + * 1.5. This implementation is based on the double precision + * implementation in the NSWC Library of Mathematics Subroutines, + * {@code DGAM1}. + * + * @param x Argument. + * @return The value of {@code 1.0 / Gamma(1.0 + x) - 1.0}. + * @throws NumberIsTooSmallException if {@code x < -0.5} + * @throws NumberIsTooLargeException if {@code x > 1.5} + * @since 3.1 + */ + static double invGamma1pm1(const double x) { + // if (x < -0.5) { + // throw new NumberIsTooSmallException(x, -0.5, true); + // } + // if (x > 1.5) { + // throw new NumberIsTooLargeException(x, 1.5, true); + // } + assert(x >= -0.5 && x <= 1.5); + + double ret; + double t = x <= 0.5 ? x : (x - 0.5) - 0.5; + if (t < 0.0) { + double a = INV_GAMMA1P_M1_A0 + t * INV_GAMMA1P_M1_A1; + double b = INV_GAMMA1P_M1_B8; + b = INV_GAMMA1P_M1_B7 + t * b; + b = INV_GAMMA1P_M1_B6 + t * b; + b = INV_GAMMA1P_M1_B5 + t * b; + b = INV_GAMMA1P_M1_B4 + t * b; + b = INV_GAMMA1P_M1_B3 + t * b; + b = INV_GAMMA1P_M1_B2 + t * b; + b = INV_GAMMA1P_M1_B1 + t * b; + b = 1.0 + t * b; + + double c = INV_GAMMA1P_M1_C13 + t * (a / b); + c = INV_GAMMA1P_M1_C12 + t * c; + c = INV_GAMMA1P_M1_C11 + t * c; + c = INV_GAMMA1P_M1_C10 + t * c; + c = INV_GAMMA1P_M1_C9 + t * c; + c = INV_GAMMA1P_M1_C8 + t * c; + c = INV_GAMMA1P_M1_C7 + t * c; + c = INV_GAMMA1P_M1_C6 + t * c; + c = INV_GAMMA1P_M1_C5 + t * c; + c = INV_GAMMA1P_M1_C4 + t * c; + c = INV_GAMMA1P_M1_C3 + t * c; + c = INV_GAMMA1P_M1_C2 + t * c; + c = INV_GAMMA1P_M1_C1 + t * c; + c = INV_GAMMA1P_M1_C + t * c; + if (x > 0.5) { + ret = t * c / x; + } else { + ret = x * ((c + 0.5) + 0.5); + } + } else { + double p = INV_GAMMA1P_M1_P6; + p = INV_GAMMA1P_M1_P5 + t * p; + p = INV_GAMMA1P_M1_P4 + t * p; + p = INV_GAMMA1P_M1_P3 + t * p; + p = INV_GAMMA1P_M1_P2 + t * p; + p = INV_GAMMA1P_M1_P1 + t * p; + p = INV_GAMMA1P_M1_P0 + t * p; + + double q = INV_GAMMA1P_M1_Q4; + q = INV_GAMMA1P_M1_Q3 + t * q; + q = INV_GAMMA1P_M1_Q2 + t * q; + q = INV_GAMMA1P_M1_Q1 + t * q; + q = 1.0 + t * q; + + double c = INV_GAMMA1P_M1_C13 + (p / q) * t; + c = INV_GAMMA1P_M1_C12 + t * c; + c = INV_GAMMA1P_M1_C11 + t * c; + c = INV_GAMMA1P_M1_C10 + t * c; + c = INV_GAMMA1P_M1_C9 + t * c; + c = INV_GAMMA1P_M1_C8 + t * c; + c = INV_GAMMA1P_M1_C7 + t * c; + c = INV_GAMMA1P_M1_C6 + t * c; + c = INV_GAMMA1P_M1_C5 + t * c; + c = INV_GAMMA1P_M1_C4 + t * c; + c = INV_GAMMA1P_M1_C3 + t * c; + c = INV_GAMMA1P_M1_C2 + t * c; + c = INV_GAMMA1P_M1_C1 + t * c; + c = INV_GAMMA1P_M1_C0 + t * c; + + if (x > 0.5) { + ret = (t / x) * ((c - 0.5) - 0.5); + } else { + ret = x * c; + } + } + + return ret; + } + + /** + * Returns the value of log Γ(1 + x) for -0.5 ≤ x ≤ 1.5. + * This implementation is based on the double precision implementation in + * the NSWC Library of Mathematics Subroutines, {@code DGMLN1}. + * + * @param x Argument. + * @return The value of {@code log(Gamma(1 + x))}. + * @throws NumberIsTooSmallException if {@code x < -0.5}. + * @throws NumberIsTooLargeException if {@code x > 1.5}. + * @since 3.1 + */ + static double logGamma1p(const double x) { + // if (x < -0.5) { + // throw new NumberIsTooSmallException(x, -0.5, true); + // } + // if (x > 1.5) { + // throw new NumberIsTooLargeException(x, 1.5, true); + // } + assert(x >= -0.5 && x <= 1.5); + return -std::log1p(invGamma1pm1(x)); + } + + /** + * Returns the value of Γ(x). Based on the NSWC Library of + * Mathematics Subroutines double precision implementation, + * {@code DGAMMA}. + * + * @param x Argument. + * @return the value of {@code Gamma(x)}. + * @since 3.1 + */ + static double gamma(const double x) { + if ((x == std::rint(x)) && (x <= 0.0)) { + return std::numeric_limits::quiet_NaN(); + } + + double ret; + const double absX = std::abs(x); + if (absX <= 20.0) { + if (x >= 1.0) { + /* + * From the recurrence relation + * Gamma(x) = (x - 1) * ... * (x - n) * Gamma(x - n), + * then + * Gamma(t) = 1 / [1 + invGamma1pm1(t - 1)], + * where t = x - n. This means that t must satisfy + * -0.5 <= t - 1 <= 1.5. + */ + double prod = 1.0; + double t = x; + while (t > 2.5) { + t -= 1.0; + prod *= t; + } + ret = prod / (1.0 + invGamma1pm1(t - 1.0)); + } else { + /* + * From the recurrence relation + * Gamma(x) = Gamma(x + n + 1) / [x * (x + 1) * ... * (x + n)] + * then + * Gamma(x + n + 1) = 1 / [1 + invGamma1pm1(x + n)], + * which requires -0.5 <= x + n <= 1.5. + */ + double prod = x; + double t = x; + while (t < -0.5) { + t += 1.0; + prod *= t; + } + ret = 1.0 / (prod * (1.0 + invGamma1pm1(t))); + } + } else { + const double y = absX + LANCZOS_G + 0.5; + const double gammaAbs = SQRT_TWO_PI / x * std::pow(y, absX + 0.5) * std::exp(-y) * lanczos(absX); + if (x > 0.0) { + ret = gammaAbs; + } else { + /* + * From the reflection formula + * Gamma(x) * Gamma(1 - x) * sin(pi * x) = pi, + * and the recurrence relation + * Gamma(1 - x) = -x * Gamma(-x), + * it is found + * Gamma(x) = -pi / [x * sin(pi * x) * Gamma(-x)]. + */ + ret = -MathConst::PI / (x * std::sin(MathConst::PI * x) * gammaAbs); + } + } + return ret; + } +}; diff --git a/src/util/math/math_const.h b/src/util/math/math_const.h new file mode 100644 index 0000000..8477aec --- /dev/null +++ b/src/util/math/math_const.h @@ -0,0 +1,26 @@ +/* + Description: 数学常用的一些常数,如pi,e等 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/26 +*/ +#pragma once + +#include +#include + +struct MathConst { + static constexpr double PI = 105414357.0 / 33554432.0 + 1.984187159361080883e-9; + + static constexpr double TWO_PI = 2 * (105414357.0 / 33554432.0 + 1.984187159361080883e-9); + + static constexpr double DOUBLE_NAN = std::numeric_limits::quiet_NaN(); + + static constexpr double DOUBLE_NEGATIVE_INFINITY = -std::numeric_limits::infinity(); + + static constexpr double DOUBLE_POSITIVE_INFINITY = std::numeric_limits::infinity(); + + static constexpr int INT_MAX_VALUE = INT_MAX; +}; \ No newline at end of file diff --git a/src/util/math/math_func.h b/src/util/math/math_func.h new file mode 100644 index 0000000..9e4b021 --- /dev/null +++ b/src/util/math/math_func.h @@ -0,0 +1,41 @@ +/* + Description: 数学常用的工具函数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/26 +*/ +#pragma once + +#include +#include +#include +#include + +struct MathFunc { + // 1.1 使用memcpy(最安全,无未定义行为) + static int64_t doubleToRawLongBits(double value) { + int64_t result; + std::memcpy(&result, &value, sizeof(result)); + return result; + } + + static double longBitsToDouble(int64_t bits) { + double result; + std::memcpy(&result, &bits, sizeof(result)); + return result; + } + + static int floatToRawIntBits(float value) { + int result; + std::memcpy(&result, &value, sizeof(result)); + return result; + } + + static float intBitsToFloat(int bits) { + float result; + std::memcpy(&result, &bits, sizeof(result)); + return result; + } +}; \ No newline at end of file diff --git a/src/util/math/math_ulp.h b/src/util/math/math_ulp.h new file mode 100644 index 0000000..d709d09 --- /dev/null +++ b/src/util/math/math_ulp.h @@ -0,0 +1,195 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// 2.1 基础模板实现 +template +class MathUlp { + static_assert(std::is_floating_point_v, "MathUlp only works with floating-point types"); + +private: + using UIntType = std::conditional_t, uint32_t, std::conditional_t, uint64_t, void> >; + + static constexpr int BITS = sizeof(T) * 8; + static constexpr int MANTISSA_BITS = std::numeric_limits::digits - 1; + static constexpr int EXPONENT_BITS = BITS - MANTISSA_BITS - 1; + static constexpr UIntType SIGN_BIT = UIntType(1) << (BITS - 1); + static constexpr UIntType EXPONENT_MASK = ((UIntType(1) << EXPONENT_BITS) - 1) << MANTISSA_BITS; + static constexpr UIntType MANTISSA_MASK = (UIntType(1) << MANTISSA_BITS) - 1; + static constexpr int EXPONENT_BIAS = (1 << (EXPONENT_BITS - 1)) - 1; + static constexpr int MAX_EXPONENT = (1 << EXPONENT_BITS) - 1; + + // 按位访问浮点数 + static UIntType get_bits(T value) { + UIntType bits; + std::memcpy(&bits, &value, sizeof(T)); + return bits; + } + + static T from_bits(UIntType bits) { + T value; + std::memcpy(&value, &bits, sizeof(T)); + return value; + } + + // 获取指数部分 + static int get_exponent(T value) { + UIntType bits = get_bits(value); + UIntType exp_bits = (bits >> MANTISSA_BITS) & ((UIntType(1) << EXPONENT_BITS) - 1); + return static_cast(exp_bits) - EXPONENT_BIAS; + } + + // 获取尾数部分 + static UIntType get_mantissa(T value) { + UIntType bits = get_bits(value); + return bits & MANTISSA_MASK; + } + + // 判断是否为规格化数 + static bool is_normal(T value) { + if (value == 0 || std::isinf(value) || std::isnan(value)) + return false; + int exp = get_exponent(value); + return exp > -EXPONENT_BIAS && exp <= EXPONENT_BIAS; + } + + // 判断是否为非规格化数(次正规数) + static bool is_subnormal(T value) { + if (value == 0 || std::isinf(value) || std::isnan(value)) + return false; + int exp = get_exponent(value); + return exp == -EXPONENT_BIAS && get_mantissa(value) != 0; + } + +public: + // 主函数:计算ulp + static T ulp(T x) { + // 处理特殊值 + if (std::isnan(x)) { + return std::numeric_limits::quiet_NaN(); + } + + if (std::isinf(x)) { + return std::numeric_limits::infinity(); + } + + // 处理0 + if (x == 0.0 || x == -0.0) { + return std::numeric_limits::min(); // 最小正规格化数 + } + + // 获取x的绝对值 + T abs_x = std::fabs(x); + + // 对于非规格化数,ulp是固定的 + if (is_subnormal(abs_x)) { + // 非规格化数的ulp是最小正非规格化数 + return std::numeric_limits::denorm_min(); + } + + // 对于规格化数 + int exp = get_exponent(abs_x); + + // ulp = 2^(exp - mantissa_bits) + // 对于float: ulp = 2^(exp - 23) + // 对于double: ulp = 2^(exp - 52) + if constexpr (std::is_same_v) { + // 使用ldexp计算2的幂次 + return std::ldexp(1.0f, exp - 23); + } else if constexpr (std::is_same_v) { + return std::ldexp(1.0, exp - 52); + } else if constexpr (std::is_same_v) { + // long double的实现可能因平台而异 + constexpr int LD_MANTISSA_BITS = std::numeric_limits::digits - 1; + return std::ldexp(1.0L, exp - LD_MANTISSA_BITS); + } + } + + // 使用nextafter的实现(更简单但可能有性能开销) + static T ulp_simple(T x) { + // 处理特殊值 + if (std::isnan(x)) { + return std::numeric_limits::quiet_NaN(); + } + + if (std::isinf(x)) { + return std::numeric_limits::infinity(); + } + + // 处理0 + if (x == 0.0 || x == -0.0) { + return std::numeric_limits::min(); + } + + // 使用nextafter计算ulp + T next = std::nextafter(std::fabs(x), std::numeric_limits::infinity()); + T prev = std::nextafter(std::fabs(x), -std::numeric_limits::infinity()); + + // 取最小的正差值 + T diff1 = next - std::fabs(x); + T diff2 = std::fabs(x) - prev; + + // 返回较小的正差值 + if (diff1 > 0 && diff2 > 0) { + return std::min(diff1, diff2); + } else if (diff1 > 0) { + return diff1; + } else { + return diff2; + } + } + + // 获取浮点数的内部表示信息 + static void print_float_info(T value) { + std::cout << "Value: " << std::setprecision(15) << value << "\n"; + std::cout << "Hex bits: "; + + if constexpr (std::is_same_v) { + uint32_t bits; + std::memcpy(&bits, &value, sizeof(float)); + std::cout << std::hex << bits << std::dec; + std::cout << "\nSign: " << ((bits >> 31) & 1); + std::cout << "\nExponent: " << ((bits >> 23) & 0xFF); + std::cout << " (bias-adjusted: " << ((bits >> 23) & 0xFF) - 127 << ")"; + std::cout << "\nMantissa: " << (bits & 0x7FFFFF); + } else { + uint64_t bits; + std::memcpy(&bits, &value, sizeof(double)); + std::cout << std::hex << bits << std::dec; + std::cout << "\nSign: " << ((bits >> 63) & 1); + std::cout << "\nExponent: " << ((bits >> 52) & 0x7FF); + std::cout << " (bias-adjusted: " << ((bits >> 52) & 0x7FF) - 1023 << ")"; + std::cout << "\nMantissa: " << (bits & 0xFFFFFFFFFFFFF); + } + + std::cout << "\nIs normal: " << std::boolalpha << is_normal(value); + std::cout << "\nIs subnormal: " << is_subnormal(value); + std::cout << "\nULP (our): " << ulp(value); + std::cout << "\nULP (simple): " << ulp_simple(value); + std::cout << "\n---\n"; + } +}; + +// 2.2 用户友好接口 +template +inline T math_ulp(T x) { + return MathUlp::ulp(x); +} + +template +inline T math_ulp_simple(T x) { + return MathUlp::ulp_simple(x); +} + +// 2.3 类型别名 +using Ulp32 = MathUlp; +using Ulp64 = MathUlp; +using Ulp128 = MathUlp; \ No newline at end of file diff --git a/src/util/math/math_utils.h b/src/util/math/math_utils.h new file mode 100644 index 0000000..9a7571d --- /dev/null +++ b/src/util/math/math_utils.h @@ -0,0 +1,138 @@ +/* + Description: 数学计算相关的工具函数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/25 +*/ +#pragma once + +#include +#include +#include +#include + +#include "binomial_dist.h" +#include "math_const.h" +#include "math_ulp.h" + +using std::vector; + +struct MathUtils { + + // 初始化math库里所有需要静态初始化的类静态变量 + static void StaticInit() { + + } + + /** + * Rounds the double to the given number of decimal places. + * For example, rounding 3.1415926 to 3 places would give 3.142. + * The requirement is that it works exactly as writing a number down with string.format and reading back in. + */ + static double RoundToNDecimalPlaces(const double in, const int n) { + assert(n > 0); + const double mult = std::pow(10, n); + return std::round((in + Ulp64::ulp(in)) * mult) / mult; + } + + /** + * binomial Probability(int, int, double) with log applied to result + */ + static double logBinomialProbability(const int n, const int k, const double p) { + BinomialDistribution binomial(n, p); + return binomial.logProbability(k); + } + + template + static int maxElementIndex(const ArrayType& array, const int start, const int endIndex) { + int maxI = start; + for (int i = (start + 1); i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + return maxI; + } + + static int fastRound(double d) { return (d > 0.0) ? (int)(d + 0.5) : (int)(d - 0.5); } + + /** + * Compute in a numerically correct way the quantity log10(1-x) + * + * Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow + * in 1-x when x is very small + * + * log10(1-x) = log10( x * (1-x) / x ) + * = log10(x) + log10( (1-x) / x) + * = log10(x) + log10(1/x - 1) + * + * @param x a positive double value between 0.0 and 1.0 + * @return an estimate of log10(1-x) + */ + static double log10OneMinusX(const double x) { + if (x == 1.0) + return MathConst::DOUBLE_NEGATIVE_INFINITY; + else if (x == 0.0) + return 0.0; + else { + const double d = std::log10(1 / x - 1) + std::log10(x); + return std::isinf(d) || d > 0.0 ? 0.0 : d; + } + } + + static double log10SumLog10(const vector& log10Values, const int start) { + return log10SumLog10(log10Values, start, log10Values.size()); + } + + static double log10SumLog10(const vector& log10Values) { return log10SumLog10(log10Values, 0); } + + static double log10SumLog10(const vector& log10Values, const int start, const int finish) { + // Utils.nonNull(log10Values); + if (start >= finish) { + return MathConst::DOUBLE_NEGATIVE_INFINITY; + } + const int maxElementIndex = MathUtils::maxElementIndex(log10Values, start, finish); + + + const double maxValue = log10Values[maxElementIndex]; + if (maxValue == MathConst::DOUBLE_NEGATIVE_INFINITY) { + return maxValue; + } + double sum = 1.0; + for (int i = start; i < finish; i++) { + const double curVal = log10Values[i]; + if (i == maxElementIndex || curVal == MathConst::DOUBLE_NEGATIVE_INFINITY) { + continue; + } else { + const double scaled_val = curVal - maxValue; + sum += std::pow(10.0, scaled_val); + } + } + if (std::isinf(sum) || sum == MathConst::DOUBLE_POSITIVE_INFINITY) { + // throw new IllegalArgumentException("log10 p: Values must be non-infinite and non-NAN"); + } + return maxValue + (sum != 1.0 ? std::log10(sum) : 0.0); + } + + static double log10SumLog10(const double a, const double b) { + return a > b ? a + std::log10(1 + std::pow(10.0, b - a)) : b + std::log10(1 + std::pow(10.0, a - b)); + } + + /** + * Do the log-sum trick for three double values. + * @param a + * @param b + * @param c + * @return the sum... perhaps NaN or infinity if it applies. + */ + static double log10SumLog10(const double a, const double b, const double c) { + if (a >= b && a >= c) { + return a + std::log10(1 + std::pow(10.0, b - a) + std::pow(10.0, c - a)); + } else if (b >= c) { + return b + std::log10(1 + std::pow(10.0, a - b) + std::pow(10.0, c - b)); + } else { + return c + std::log10(1 + std::pow(10.0, a - c) + std::pow(10.0, b - c)); + } + } +}; diff --git a/src/util/math/normal_dist.h b/src/util/math/normal_dist.h new file mode 100644 index 0000000..0d585d0 --- /dev/null +++ b/src/util/math/normal_dist.h @@ -0,0 +1,48 @@ +/* + Description: 正态分布相关函数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/26 +*/ +#pragma once + +#include + +struct NormalDistribution { + /** + * Default inverse cumulative probability accuracy. + * @since 2.1 + */ + static constexpr double DEFAULT_INVERSE_ABSOLUTE_ACCURACY = 1e-9; + /** √(2) */ + static constexpr double SQRT2 = 1.414213562373095; + + static constexpr double PI = 105414357.0 / 33554432.0 + 1.984187159361080883e-9; + + /** Mean of this distribution. */ + double mean; + /** Standard deviation of this distribution. */ + + double standardDeviation; + /** The value of {@code log(sd) + 0.5*log(2*pi)} stored for faster computation. */ + + double logStandardDeviationPlusHalfLog2Pi; + /** Inverse cumulative probability accuracy. */ + + double solverAbsoluteAccuracy; + + NormalDistribution(double mean, double sd) { + this->mean = mean; + standardDeviation = sd; + logStandardDeviationPlusHalfLog2Pi = std::log(sd) + 0.5 * std::log(2 * PI); + solverAbsoluteAccuracy = DEFAULT_INVERSE_ABSOLUTE_ACCURACY; + } + + double logDensity(double x) const { + const double x0 = x - mean; + const double x1 = x0 / standardDeviation; + return -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi; + } +}; \ No newline at end of file diff --git a/src/util/math/precision.cpp b/src/util/math/precision.cpp new file mode 100644 index 0000000..8ccd5e1 --- /dev/null +++ b/src/util/math/precision.cpp @@ -0,0 +1,13 @@ +#include "precision.h" + +const double Precision::EPSILON = MathFunc::longBitsToDouble((EXPONENT_OFFSET - 53l) << 52); + +const double Precision::SAFE_MIN = MathFunc::longBitsToDouble((EXPONENT_OFFSET - 1022l) << 52); + +const int64_t Precision::POSITIVE_ZERO_DOUBLE_BITS = MathFunc::doubleToRawLongBits(+0.0); + +const int64_t Precision::NEGATIVE_ZERO_DOUBLE_BITS = MathFunc::doubleToRawLongBits(-0.0); + +const int Precision::POSITIVE_ZERO_FLOAT_BITS = MathFunc::floatToRawIntBits(+0.0f); + +const int Precision::NEGATIVE_ZERO_FLOAT_BITS = MathFunc::floatToRawIntBits(-0.0f); \ No newline at end of file diff --git a/src/util/math/precision.h b/src/util/math/precision.h new file mode 100644 index 0000000..e1f3893 --- /dev/null +++ b/src/util/math/precision.h @@ -0,0 +1,122 @@ +/* + Description: 高精度相关的函数,如比较两个高精度浮点是否相等 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/26 +*/ +#pragma once + +#include +#include + + +#include "math_func.h" + +struct Precision { + /** + *

+ * Largest double-precision floating-point number such that + * {@code 1 + EPSILON} is numerically equal to 1. This value is an upper + * bound on the relative error due to rounding real numbers to double + * precision floating-point numbers. + *

+ *

+ * In IEEE 754 arithmetic, this is 2-53. + *

+ * + * @see Machine epsilon + */ + static const double EPSILON; + + /** + * Safe minimum, such that {@code 1 / SAFE_MIN} does not overflow. + *
+ * In IEEE 754 arithmetic, this is also the smallest normalized + * number 2-1022. + */ + static const double SAFE_MIN; + + /** Exponent offset in IEEE754 representation. */ + static constexpr int64_t EXPONENT_OFFSET = 1023l; + + /** Offset to order signed double numbers lexicographically. */ + static constexpr int64_t SGN_MASK = 0x8000000000000000L; + /** Offset to order signed double numbers lexicographically. */ + static constexpr int SGN_MASK_FLOAT = 0x80000000; + /** Positive zero. */ + static constexpr double POSITIVE_ZERO = 0.0; + /** Positive zero bits. */ + static const int64_t POSITIVE_ZERO_DOUBLE_BITS; + /** Negative zero bits. */ + static const int64_t NEGATIVE_ZERO_DOUBLE_BITS; + /** Positive zero bits. */ + static const int POSITIVE_ZERO_FLOAT_BITS; + /** Negative zero bits. */ + static const int NEGATIVE_ZERO_FLOAT_BITS; + + /** + * Returns {@code true} if there is no double value strictly between the + * arguments or the difference between them is within the range of allowed + * error (inclusive). + * + * @param x First value. + * @param y Second value. + * @param eps Amount of allowed absolute error. + * @return {@code true} if the values are two adjacent floating point + * numbers or they are within range of each other. + */ + static bool equals(double x, double y, double eps) { return equals(x, y, 1) || std::abs(y - x) <= eps; } + /** + * Returns true if both arguments are equal or within the range of allowed + * error (inclusive). + *

+ * Two float numbers are considered equal if there are {@code (maxUlps - 1)} + * (or fewer) floating point numbers between them, i.e. two adjacent + * floating point numbers are considered equal. + *

+ *

+ * Adapted from + * Bruce Dawson + *

+ * + * @param x first value + * @param y second value + * @param maxUlps {@code (maxUlps - 1)} is the number of floating point + * values between {@code x} and {@code y}. + * @return {@code true} if there are fewer than {@code maxUlps} floating + * point values between {@code x} and {@code y}. + */ + static bool equals(const double x, const double y, const int maxUlps) { + const int64_t xInt = MathFunc::doubleToRawLongBits(x); + const int64_t yInt = MathFunc::doubleToRawLongBits(y); + + bool isEqual; + + if (((xInt ^ yInt) & SGN_MASK) == 0l) { + // number have same sign, there is no risk of overflow + isEqual = std::abs(xInt - yInt) <= maxUlps; + } else { + // number have opposite signs, take care of overflow + int64_t deltaPlus; + int64_t deltaMinus; + if (xInt < yInt) { + deltaPlus = yInt - POSITIVE_ZERO_DOUBLE_BITS; + deltaMinus = xInt - NEGATIVE_ZERO_DOUBLE_BITS; + } else { + deltaPlus = xInt - POSITIVE_ZERO_DOUBLE_BITS; + deltaMinus = yInt - NEGATIVE_ZERO_DOUBLE_BITS; + } + + if (deltaPlus > maxUlps) { + isEqual = false; + } else { + isEqual = deltaMinus <= (maxUlps - deltaPlus); + } + } + + return isEqual && !std::isnan(x) && !std::isnan(y); + } +}; \ No newline at end of file diff --git a/src/util/math/saddle_pe.cpp b/src/util/math/saddle_pe.cpp new file mode 100644 index 0000000..0f8dad1 --- /dev/null +++ b/src/util/math/saddle_pe.cpp @@ -0,0 +1,37 @@ +#include "saddle_pe.h" + +const double SaddlePointExpansion::HALF_LOG_2_PI = 0.5 * std::log(TWO_PI); + +const double SaddlePointExpansion::EXACT_STIRLING_ERRORS[31] = { + 0.0, /* 0.0 */ + 0.1534264097200273452913848, /* 0.5 */ + 0.0810614667953272582196702, /* 1.0 */ + 0.0548141210519176538961390, /* 1.5 */ + 0.0413406959554092940938221, /* 2.0 */ + 0.03316287351993628748511048, /* 2.5 */ + 0.02767792568499833914878929, /* 3.0 */ + 0.02374616365629749597132920, /* 3.5 */ + 0.02079067210376509311152277, /* 4.0 */ + 0.01848845053267318523077934, /* 4.5 */ + 0.01664469118982119216319487, /* 5.0 */ + 0.01513497322191737887351255, /* 5.5 */ + 0.01387612882307074799874573, /* 6.0 */ + 0.01281046524292022692424986, /* 6.5 */ + 0.01189670994589177009505572, /* 7.0 */ + 0.01110455975820691732662991, /* 7.5 */ + 0.010411265261972096497478567, /* 8.0 */ + 0.009799416126158803298389475, /* 8.5 */ + 0.009255462182712732917728637, /* 9.0 */ + 0.008768700134139385462952823, /* 9.5 */ + 0.008330563433362871256469318, /* 10.0 */ + 0.007934114564314020547248100, /* 10.5 */ + 0.007573675487951840794972024, /* 11.0 */ + 0.007244554301320383179543912, /* 11.5 */ + 0.006942840107209529865664152, /* 12.0 */ + 0.006665247032707682442354394, /* 12.5 */ + 0.006408994188004207068439631, /* 13.0 */ + 0.006171712263039457647532867, /* 13.5 */ + 0.005951370112758847735624416, /* 14.0 */ + 0.005746216513010115682023589, /* 14.5 */ + 0.005554733551962801371038690 /* 15.0 */ +}; \ No newline at end of file diff --git a/src/util/math/saddle_pe.h b/src/util/math/saddle_pe.h new file mode 100644 index 0000000..0d859b6 --- /dev/null +++ b/src/util/math/saddle_pe.h @@ -0,0 +1,128 @@ +/* + Description: saddle point expansion 鞍点展开 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/26 +*/ +#pragma once + +#include +#include +#include "gamma.h" + +struct SaddlePointExpansion { + static constexpr double TWO_PI = 2 * (105414357.0 / 33554432.0 + 1.984187159361080883e-9); + + /** 1/2 * log(2 π). */ + static const double HALF_LOG_2_PI; + + /** exact Stirling expansion error for certain values. */ + static const double EXACT_STIRLING_ERRORS[31]; + /** + * A part of the deviance portion of the saddle point approximation. + *

+ * References: + *

    + *
  1. Catherine Loader (2000). "Fast and Accurate Computation of Binomial + * Probabilities.". + * http://www.herine.net/stat/papers/dbinom.pdf
  2. + *
+ *

+ * + * @param x the x value. + * @param mu the average. + * @return a part of the deviance. + */ + static double getDeviancePart(double x, double mu) { + double ret; + if (std::abs(x - mu) < 0.1 * (x + mu)) { + double d = x - mu; + double v = d / (x + mu); + double s1 = v * d; + double s = std::numeric_limits::quiet_NaN(); + double ej = 2.0 * x * v; + v *= v; + int j = 1; + while (s1 != s) { + s = s1; + ej *= v; + s1 = s + ej / ((j * 2) + 1); + ++j; + } + ret = s1; + } else { + ret = x * std::log(x / mu) + mu - x; + } + return ret; + } + + /** + * Compute the error of Stirling's series at the given value. + *

+ * References: + *

    + *
  1. Eric W. Weisstein. "Stirling's Series." From MathWorld--A Wolfram Web + * Resource. + * http://mathworld.wolfram.com/StirlingsSeries.html
  2. + *
+ *

+ * + * @param z the value. + * @return the Striling's series error. + */ + static double getStirlingError(double z) { + double ret; + if (z < 15.0) { + double z2 = 2.0 * z; + if (std::floor(z2) == z2) { + ret = EXACT_STIRLING_ERRORS[(int)z2]; + } else { + ret = Gamma::logGamma(z + 1.0) - (z + 0.5) * std::log(z) + z - HALF_LOG_2_PI; + } + } else { + double z2 = z * z; + ret = (0.083333333333333333333 - + (0.00277777777777777777778 - + (0.00079365079365079365079365 - (0.000595238095238095238095238 - 0.0008417508417508417508417508 / z2) / z2) / z2) / + z2) / + z; + } + return ret; + } + + /** + * Compute the logarithm of the PMF for a binomial distribution + * using the saddle point expansion. + * + * @param x the value at which the probability is evaluated. + * @param n the number of trials. + * @param p the probability of success. + * @param q the probability of failure (1 - p). + * @return log(p(x)). + */ + static double logBinomialProbability(int x, int n, double p, double q) { + double ret; + if (x == 0) { + if (p < 0.1) { + ret = -getDeviancePart(n, n * q) - n * p; + } else { + ret = n * std::log(q); + } + } else if (x == n) { + if (q < 0.1) { + ret = -getDeviancePart(n, n * p) - n * q; + } else { + ret = n * std::log(p); + } + } else { + ret = getStirlingError(n) - getStirlingError(x) - getStirlingError(n - x) - getDeviancePart(x, n * p) - getDeviancePart(n - x, n * q); + double f = (TWO_PI * x * (n - x)) / n; + ret = -0.5 * std::log(f) + ret; + } + return ret; + } +}; \ No newline at end of file diff --git a/src/util/profiling.cpp b/src/util/profiling.cpp index 00ede23..55706ba 100644 --- a/src/util/profiling.cpp +++ b/src/util/profiling.cpp @@ -57,6 +57,7 @@ int DisplayProfiling(int nthread) { PRINT_GP(read_ref); PRINT_GP(read_vcf); PRINT_GP(covariate); + PRINT_GP(update_info); //PRINT_GP(markdup); //PRINT_GP(intersect); // PRINT_GP(merge_result); diff --git a/src/util/profiling.h b/src/util/profiling.h index 713ea87..3e3e9e4 100644 --- a/src/util/profiling.h +++ b/src/util/profiling.h @@ -43,6 +43,7 @@ enum { GP_covariate, GP_read_ref, GP_read_vcf, + GP_update_info, GP_gen_wait, GP_sort_wait, GP_markdup_wait, diff --git a/src/util/report_table.cpp b/src/util/report_table.cpp new file mode 100644 index 0000000..443adae --- /dev/null +++ b/src/util/report_table.cpp @@ -0,0 +1,8 @@ +#include "report_table.h" + +const string ReportTable::COULD_NOT_READ_HEADER = "Could not read the header of this file -- "; +const string ReportTable::COULD_NOT_READ_COLUMN_NAMES = "Could not read the column names of this file -- "; +const string ReportTable::COULD_NOT_READ_DATA_LINE = "Could not read a data line of this table -- "; +const string ReportTable::COULD_NOT_READ_EMPTY_LINE = "Could not read the last empty line of this table -- "; +const string ReportTable::OLD_GATK_TABLE_VERSION = "We no longer support older versions of the GATK Tables"; +const char* ReportTable::TABLE_PREFIX = "GATKTable"; \ No newline at end of file diff --git a/src/util/report_table.h b/src/util/report_table.h new file mode 100644 index 0000000..23ae6c0 --- /dev/null +++ b/src/util/report_table.h @@ -0,0 +1,253 @@ +/* + Description: 生成数据报表 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/27 +*/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +using std::string; +using std::stringstream; +using std::vector; + +#define REPORT_HEADER_VERSION "#:GATKReport.v1.1:5" + +struct ReportUtil { + static string ToString(const bool val) { return val ? "true" : "false"; } + static string ToString(const char val) { + string s = ""; + s += val; + // spdlog::info("char: {}, str: {}", val, s); + return s; + } + static string ToString(const string& val) { return val == "" ? "null" : val; } + static string ToString(const double val, int precise) { + stringstream ss; + ss << std::fixed << std::setprecision(precise) << val; + return ss.str(); + } + template + static string ToString(const T val) { return std::to_string(val); } +}; + +struct ReportTable { + // 静态数据 + + static const string COULD_NOT_READ_HEADER; + static const string COULD_NOT_READ_COLUMN_NAMES; + static const string COULD_NOT_READ_DATA_LINE; + static const string COULD_NOT_READ_EMPTY_LINE; + static const string OLD_GATK_TABLE_VERSION; + static const char* TABLE_PREFIX; + + // 数据表排序方式 + // 好像目前都是按照SORT_BY_COLUMN排序的,这个排序是以此按照每一个cloumn内的所有行数据进行排序,从左到右,目的还是排列行的数据 + // SORT_BY_ROW应该就是按照row id排序的 + enum struct Sorting { SORT_BY_ROW, SORT_BY_COLUMN, DO_NOT_SORT }; + // 数据类型 + enum struct DataType { + /** + * The null type should not be used. + */ + NullType, + /** + * The default value when a format string is not present + */ + Unknown, + /** + * Used for boolean values. Will display as true or false in the table. + */ + Boolean, + /** + * Used for char values. Will display as a char so use printable values! + */ + Character, + /** + * Used for float and double values. Will output a decimal with format %.8f unless otherwise specified. + */ + Decimal, + /** + * Used for int, byte, short, and long values. Will display the full number by default. + */ + Integer, + /** + * Used for string values. Displays the string itself. + */ + String + }; + // 列类型 + struct Column { + // 只有column内容全是数字时(或者nan,inf,null这些),可以右对齐,剩下的字符或者字符串只能左对齐 + enum struct Alignment {LEFT, RIGHT}; + string columnName; + string format; + DataType dataType; + Alignment alignment = Alignment::RIGHT; + int maxWidth = 0; + + Column() {} + Column(const string& name) : Column(name, "%s") {} + Column(const string& name, const string& _format) : Column(name, _format, DataType::String) {} + Column(const string& name, const string& _format, const DataType _dataType) : Column(name, _format, _dataType, Alignment::RIGHT) {} + Column(const string& name, const string& _format, const DataType _dataType, const Alignment align) { init(name, _format, _dataType, align); } + + void init(const string& name, const string &_format, const DataType _dataType, const Alignment align) { + columnName = name; + format = _format; + dataType = _dataType; + alignment = align; + maxWidth = name.size(); + updateFormat(*this); // format和datatype要一致 + } + + void updateMaxWidth(const string &value) { + if (!value.empty()) { + maxWidth = std::max(maxWidth, (int)value.size()); + } + } + + // 根据col的format更新一下对齐等信息 + static void updateFormat(Column &col) { + if (col.format == "") { + col.dataType = DataType::Unknown; + col.format = "%s"; + } else if (col.format.find('s') != string::npos) { + col.dataType = DataType::String; + } else if (col.format.find('d') != string::npos) { + col.dataType = DataType::Integer; + } else if (col.format.find('f') != string::npos) { + col.dataType = DataType::Decimal; + } + // spdlog::info("type: {}, align {}", col.dataType == DataType::Integer, col.alignment == Alignment::LEFT); + // 只有数字可以右对齐 + if (col.dataType != DataType::Decimal && col.dataType != DataType::Integer) + col.alignment = Column::Alignment::LEFT; + } + + string getNameFormat() { return "%-" + std::to_string(maxWidth) + "s";} + + string getValueFormat() { + string align = alignment == Alignment::LEFT ? "-" : ""; + return "%" + align + std::to_string(maxWidth) + "s"; // 因为所有值都转换成string类型了 + } + }; + + using RowData = vector; // 每一行的数据 + + string tableName; // 数据表名称 + string tableDescription; // 数据表描述 + Sorting sortingWay; // 排序方式 + vector columnInfo; // 包含数据表的所有列信息 + vector underlyingData; // 保存数据表所有数据 + vector idxArr; // 保存data的索引,用来排序 + + ReportTable() {} + ReportTable(const string& _tableName) : ReportTable(_tableName, "") {} + ReportTable(const string& _tableName, const string& _tableDescription) : ReportTable(_tableName, _tableDescription, Sorting::SORT_BY_COLUMN) {} + ReportTable(const string& _tableName, const string& _tableDescription, const Sorting sorting) { init(_tableName, _tableDescription, sorting); } + + // 初始化 + void init(const string& _tableName, const string& _tableDescription, const Sorting sorting) { + tableName = _tableName; + tableDescription = _tableDescription; + sortingWay = sorting; + } + + // 添加column + void addColumn(Column col) { + // spdlog::info("col align: {}", col.alignment == Column::Alignment::RIGHT); + Column::updateFormat(col); + columnInfo.push_back(col); + } + + void addRowData(const RowData &dat) { + assert(dat.size() == columnInfo.size()); + idxArr.push_back(underlyingData.size()); + underlyingData.push_back(dat); + for (int i = 0; i < dat.size(); ++i) { + columnInfo[i].updateMaxWidth(dat[i]); + } + } + + void sort() { + std::sort(idxArr.begin(), idxArr.end(), [&](size_t a, size_t b) { + auto& r1 = underlyingData[a]; + auto& r2 = underlyingData[b]; + for (int i = 0; i < r1.size(); ++i) { + if (r1[i] != r2[i]) { + if (columnInfo[i].format.find('d') != string::npos) // 或者用datatype来判断 + return std::stoll(r1[i]) < std::stoll(r2[i]); + else if (columnInfo[i].format.find('f') != string::npos) { + return std::stod(r1[i]) < std::stod(r2[i]); + } else { + return r1[i] < r2[i]; + } + } + } + return a < b; + }); + } + + // 输出 + void write(FILE *fpout) { + // 表头,列数:行数:列格式 + fprintf(fpout, "#:%s:%d:%d:", TABLE_PREFIX, (int)columnInfo.size(), (int)underlyingData.size()); + for (auto& col : columnInfo) fprintf(fpout, "%s:", col.format.c_str()); + fprintf(fpout, ";\n"); + // 表的说明 + fprintf(fpout, "#:%s:%s:%s\n", TABLE_PREFIX, tableName.c_str(), tableDescription.c_str()); + // 列名称 + bool needPadding = false; + for (auto& col : columnInfo) { + if (needPadding) + fprintf(fpout, " "); + needPadding = true; + fprintf(fpout, col.getNameFormat().c_str(), col.columnName.c_str()); + } + fprintf(fpout, "\n"); + // 表的所有行数据 + // 先排个序 + sort(); + for (auto idx : idxArr) { + auto &row = underlyingData[idx]; + bool needPadding = false; + for (int i = 0; i < row.size(); ++i) { + if (needPadding) + fprintf(fpout, " "); + needPadding = true; + fprintf(fpout, columnInfo[i].getValueFormat().c_str(), row[i].c_str()); + } + fprintf(fpout, "\n"); + } + fprintf(fpout, "\n"); + } +}; + +/* +// 输出参数相关的数据 +struct ArgReportTable : ReportTable { + using ReportTable::ReportTable; +}; + +// 输出quantized quality相关的数据 +struct QuantReportTable : ReportTable { + +}; + +// 输出协变量相关的数据 +struct CovariateReportTable : ReportTable { + +}; +*/ \ No newline at end of file diff --git a/src/util/utils.h b/src/util/utils.h index 844eac1..4bd592d 100644 --- a/src/util/utils.h +++ b/src/util/utils.h @@ -1,4 +1,11 @@ #pragma once +#include + +using std::string; + // Report an error and exit -1 -void error(const char* format, ...); \ No newline at end of file +void error(const char* format, ...); + +struct Utils { +}; \ No newline at end of file diff --git a/test/stats.table b/test/stats.table index e69de29..02ae384 100644 --- a/test/stats.table +++ b/test/stats.table @@ -0,0 +1,1090 @@ +#:GATKReport.v1.1:5 +#:GATKTable:2:17:%s:%s:; +#:GATKTable:Arguments:Recalibration argument collection values used in this run +Argument Value +binary_tag_name null +covariate ReadGroupCovariate,QualityScoreCovariate,ContextCovariate,CycleCovariate +default_platform null +deletions_default_quality 45 +force_platform null +indels_context_size 3 +insertions_default_quality 45 +low_quality_tail 2 +maximum_cycle_value 500 +mismatches_context_size 2 +mismatches_default_quality -1 +no_standard_covs false +quantizing_levels 16 +recalibration_report null +run_without_dbsnp false +solid_nocall_strategy THROW_EXCEPTION +solid_recal_mode SET_Q_ZERO + +#:GATKTable:3:94:%d:%d:%d:; +#:GATKTable:Quantized:Quality quantization map +QualityScore Count QuantizedScore + 0 0 1 + 1 0 1 + 2 0 1 + 3 0 1 + 4 0 1 + 5 0 1 + 6 0 1 + 7 0 1 + 8 0 1 + 9 0 1 + 10 80007 10 + 11 0 1 + 12 0 1 + 13 0 1 + 14 0 1 + 15 0 1 + 16 0 1 + 17 0 1 + 18 0 1 + 19 0 1 + 20 0 1 + 21 0 1 + 22 0 1 + 23 0 1 + 24 132529 24 + 25 0 1 + 26 0 1 + 27 0 1 + 28 2647838 28 + 29 0 1 + 30 0 1 + 31 0 1 + 32 0 1 + 33 0 1 + 34 0 1 + 35 0 1 + 36 0 1 + 37 0 1 + 38 0 1 + 39 0 1 + 40 0 1 + 41 0 1 + 42 0 1 + 43 0 1 + 44 0 1 + 45 0 1 + 46 0 1 + 47 0 1 + 48 0 1 + 49 0 1 + 50 0 1 + 51 0 1 + 52 0 1 + 53 0 1 + 54 0 1 + 55 0 1 + 56 0 1 + 57 0 1 + 58 0 1 + 59 0 1 + 60 0 1 + 61 0 1 + 62 0 1 + 63 0 1 + 64 0 1 + 65 0 1 + 66 0 1 + 67 0 1 + 68 0 1 + 69 0 1 + 70 0 1 + 71 0 1 + 72 0 1 + 73 0 1 + 74 0 1 + 75 0 1 + 76 0 1 + 77 0 1 + 78 0 1 + 79 0 1 + 80 0 1 + 81 0 1 + 82 0 1 + 83 0 1 + 84 0 1 + 85 0 85 + 86 0 86 + 87 0 87 + 88 0 88 + 89 0 89 + 90 0 90 + 91 0 91 + 92 0 92 + 93 0 93 + +#:GATKTable:6:1:%s:%s:%.4f:%.4f:%d:%.2f:; +#:GATKTable:RecalTable0: +ReadGroup EventType EmpiricalQuality EstimatedQReported Observations Errors +normal M 23.0000 25.9295 2860374 12992.00 + +#:GATKTable:6:3:%s:%d:%s:%.4f:%d:%.2f:; +#:GATKTable:RecalTable1: +ReadGroup QualityScore EventType EmpiricalQuality Observations Errors +normal 11 M 10.0000 80007 7922.00 +normal 25 M 24.0000 132529 588.00 +normal 37 M 28.0000 2647838 4482.00 + +#:GATKTable:8:954:%s:%d:%s:%s:%s:%.4f:%d:%.2f:; +#:GATKTable:RecalTable2: +ReadGroup QualityScore CovariateValue CovariateName EventType EmpiricalQuality Observations Errors +normal 11 -1 Cycle M 11.0000 244 14.00 +normal 11 -10 Cycle M 11.0000 229 17.00 +normal 11 -100 Cycle M 10.0000 314 37.00 +normal 11 -101 Cycle M 11.0000 316 28.00 +normal 11 -102 Cycle M 10.0000 318 36.00 +normal 11 -103 Cycle M 10.0000 287 35.00 +normal 11 -104 Cycle M 11.0000 289 32.00 +normal 11 -105 Cycle M 11.0000 339 36.00 +normal 11 -106 Cycle M 10.0000 304 36.00 +normal 11 -107 Cycle M 11.0000 317 29.00 +normal 11 -108 Cycle M 11.0000 306 31.00 +normal 11 -109 Cycle M 10.0000 293 36.00 +normal 11 -11 Cycle M 11.0000 181 18.00 +normal 11 -110 Cycle M 10.0000 307 38.00 +normal 11 -111 Cycle M 11.0000 311 32.00 +normal 11 -112 Cycle M 11.0000 305 27.00 +normal 11 -113 Cycle M 11.0000 299 29.00 +normal 11 -114 Cycle M 11.0000 310 32.00 +normal 11 -115 Cycle M 10.0000 347 44.00 +normal 11 -116 Cycle M 10.0000 304 35.00 +normal 11 -117 Cycle M 10.0000 313 42.00 +normal 11 -118 Cycle M 11.0000 308 28.00 +normal 11 -119 Cycle M 10.0000 332 51.00 +normal 11 -12 Cycle M 11.0000 211 16.00 +normal 11 -120 Cycle M 10.0000 297 34.00 +normal 11 -121 Cycle M 11.0000 319 25.00 +normal 11 -122 Cycle M 11.0000 327 33.00 +normal 11 -123 Cycle M 10.0000 315 37.00 +normal 11 -124 Cycle M 10.0000 329 37.00 +normal 11 -125 Cycle M 11.0000 326 32.00 +normal 11 -126 Cycle M 11.0000 318 27.00 +normal 11 -127 Cycle M 11.0000 276 27.00 +normal 11 -128 Cycle M 11.0000 329 31.00 +normal 11 -129 Cycle M 10.0000 301 36.00 +normal 11 -13 Cycle M 11.0000 182 15.00 +normal 11 -130 Cycle M 11.0000 348 32.00 +normal 11 -131 Cycle M 10.0000 330 40.00 +normal 11 -132 Cycle M 11.0000 294 32.00 +normal 11 -133 Cycle M 11.0000 295 24.00 +normal 11 -134 Cycle M 10.0000 298 34.00 +normal 11 -135 Cycle M 10.0000 288 33.00 +normal 11 -136 Cycle M 11.0000 320 31.00 +normal 11 -137 Cycle M 10.0000 349 39.00 +normal 11 -138 Cycle M 11.0000 327 36.00 +normal 11 -139 Cycle M 10.0000 302 41.00 +normal 11 -14 Cycle M 11.0000 198 21.00 +normal 11 -140 Cycle M 11.0000 370 28.00 +normal 11 -141 Cycle M 10.0000 343 43.00 +normal 11 -142 Cycle M 11.0000 325 36.00 +normal 11 -143 Cycle M 11.0000 328 27.00 +normal 11 -144 Cycle M 10.0000 345 45.00 +normal 11 -145 Cycle M 10.0000 341 47.00 +normal 11 -146 Cycle M 11.0000 338 29.00 +normal 11 -147 Cycle M 10.0000 314 37.00 +normal 11 -148 Cycle M 11.0000 264 24.00 +normal 11 -149 Cycle M 11.0000 303 24.00 +normal 11 -15 Cycle M 11.0000 192 14.00 +normal 11 -150 Cycle M 11.0000 272 19.00 +normal 11 -151 Cycle M 11.0000 229 19.00 +normal 11 -16 Cycle M 11.0000 200 21.00 +normal 11 -17 Cycle M 11.0000 196 12.00 +normal 11 -18 Cycle M 11.0000 193 18.00 +normal 11 -19 Cycle M 11.0000 180 15.00 +normal 11 -2 Cycle M 11.0000 196 11.00 +normal 11 -20 Cycle M 11.0000 210 21.00 +normal 11 -21 Cycle M 11.0000 220 22.00 +normal 11 -22 Cycle M 11.0000 206 14.00 +normal 11 -23 Cycle M 11.0000 216 18.00 +normal 11 -24 Cycle M 11.0000 202 22.00 +normal 11 -25 Cycle M 11.0000 220 22.00 +normal 11 -26 Cycle M 11.0000 223 23.00 +normal 11 -27 Cycle M 11.0000 208 20.00 +normal 11 -28 Cycle M 11.0000 215 13.00 +normal 11 -29 Cycle M 10.0000 210 26.00 +normal 11 -3 Cycle M 11.0000 182 16.00 +normal 11 -30 Cycle M 11.0000 222 23.00 +normal 11 -31 Cycle M 11.0000 233 17.00 +normal 11 -32 Cycle M 11.0000 233 24.00 +normal 11 -33 Cycle M 11.0000 224 20.00 +normal 11 -34 Cycle M 11.0000 208 17.00 +normal 11 -35 Cycle M 11.0000 251 28.00 +normal 11 -36 Cycle M 11.0000 225 16.00 +normal 11 -37 Cycle M 11.0000 232 19.00 +normal 11 -38 Cycle M 10.0000 232 28.00 +normal 11 -39 Cycle M 11.0000 231 23.00 +normal 11 -4 Cycle M 11.0000 185 15.00 +normal 11 -40 Cycle M 10.0000 232 32.00 +normal 11 -41 Cycle M 10.0000 230 28.00 +normal 11 -42 Cycle M 11.0000 245 23.00 +normal 11 -43 Cycle M 11.0000 247 22.00 +normal 11 -44 Cycle M 11.0000 239 24.00 +normal 11 -45 Cycle M 11.0000 268 22.00 +normal 11 -46 Cycle M 11.0000 259 25.00 +normal 11 -47 Cycle M 11.0000 258 23.00 +normal 11 -48 Cycle M 11.0000 255 24.00 +normal 11 -49 Cycle M 11.0000 244 25.00 +normal 11 -5 Cycle M 11.0000 212 17.00 +normal 11 -50 Cycle M 11.0000 249 23.00 +normal 11 -51 Cycle M 10.0000 242 32.00 +normal 11 -52 Cycle M 11.0000 258 14.00 +normal 11 -53 Cycle M 11.0000 257 26.00 +normal 11 -54 Cycle M 10.0000 227 29.00 +normal 11 -55 Cycle M 11.0000 287 21.00 +normal 11 -56 Cycle M 11.0000 301 27.00 +normal 11 -57 Cycle M 11.0000 236 24.00 +normal 11 -58 Cycle M 10.0000 271 33.00 +normal 11 -59 Cycle M 11.0000 277 23.00 +normal 11 -6 Cycle M 11.0000 204 13.00 +normal 11 -60 Cycle M 11.0000 253 27.00 +normal 11 -61 Cycle M 11.0000 275 30.00 +normal 11 -62 Cycle M 11.0000 262 28.00 +normal 11 -63 Cycle M 11.0000 247 28.00 +normal 11 -64 Cycle M 11.0000 281 21.00 +normal 11 -65 Cycle M 10.0000 336 43.00 +normal 11 -66 Cycle M 11.0000 279 29.00 +normal 11 -67 Cycle M 10.0000 269 35.00 +normal 11 -68 Cycle M 10.0000 293 42.00 +normal 11 -69 Cycle M 11.0000 258 22.00 +normal 11 -7 Cycle M 11.0000 197 11.00 +normal 11 -70 Cycle M 10.0000 288 39.00 +normal 11 -71 Cycle M 11.0000 281 30.00 +normal 11 -72 Cycle M 10.0000 277 32.00 +normal 11 -73 Cycle M 11.0000 274 24.00 +normal 11 -74 Cycle M 11.0000 296 28.00 +normal 11 -75 Cycle M 10.0000 256 30.00 +normal 11 -76 Cycle M 11.0000 242 17.00 +normal 11 -77 Cycle M 10.0000 251 36.00 +normal 11 -78 Cycle M 10.0000 281 34.00 +normal 11 -79 Cycle M 11.0000 263 30.00 +normal 11 -8 Cycle M 11.0000 183 13.00 +normal 11 -80 Cycle M 11.0000 298 32.00 +normal 11 -81 Cycle M 10.0000 284 35.00 +normal 11 -82 Cycle M 11.0000 298 24.00 +normal 11 -83 Cycle M 10.0000 283 33.00 +normal 11 -84 Cycle M 11.0000 301 32.00 +normal 11 -85 Cycle M 11.0000 253 26.00 +normal 11 -86 Cycle M 11.0000 315 29.00 +normal 11 -87 Cycle M 11.0000 299 32.00 +normal 11 -88 Cycle M 11.0000 288 26.00 +normal 11 -89 Cycle M 11.0000 290 31.00 +normal 11 -9 Cycle M 11.0000 182 10.00 +normal 11 -90 Cycle M 10.0000 338 47.00 +normal 11 -91 Cycle M 11.0000 304 28.00 +normal 11 -92 Cycle M 11.0000 267 23.00 +normal 11 -93 Cycle M 10.0000 300 37.00 +normal 11 -94 Cycle M 10.0000 318 44.00 +normal 11 -95 Cycle M 11.0000 305 29.00 +normal 11 -96 Cycle M 10.0000 255 38.00 +normal 11 -97 Cycle M 11.0000 291 33.00 +normal 11 -98 Cycle M 11.0000 292 28.00 +normal 11 -99 Cycle M 11.0000 281 30.00 +normal 11 1 Cycle M 11.0000 225 8.00 +normal 11 10 Cycle M 11.0000 157 6.00 +normal 11 100 Cycle M 11.0000 334 32.00 +normal 11 101 Cycle M 11.0000 324 36.00 +normal 11 102 Cycle M 10.0000 319 39.00 +normal 11 103 Cycle M 10.0000 316 36.00 +normal 11 104 Cycle M 11.0000 322 32.00 +normal 11 105 Cycle M 10.0000 356 45.00 +normal 11 106 Cycle M 10.0000 373 44.00 +normal 11 107 Cycle M 11.0000 313 30.00 +normal 11 108 Cycle M 11.0000 329 35.00 +normal 11 109 Cycle M 11.0000 324 35.00 +normal 11 11 Cycle M 11.0000 143 5.00 +normal 11 110 Cycle M 10.0000 323 41.00 +normal 11 111 Cycle M 11.0000 320 34.00 +normal 11 112 Cycle M 10.0000 285 33.00 +normal 11 113 Cycle M 11.0000 317 34.00 +normal 11 114 Cycle M 11.0000 351 36.00 +normal 11 115 Cycle M 10.0000 335 39.00 +normal 11 116 Cycle M 11.0000 316 35.00 +normal 11 117 Cycle M 10.0000 303 37.00 +normal 11 118 Cycle M 11.0000 360 29.00 +normal 11 119 Cycle M 11.0000 335 29.00 +normal 11 12 Cycle M 12.0000 160 2.00 +normal 11 120 Cycle M 10.0000 303 41.00 +normal 11 121 Cycle M 10.0000 329 39.00 +normal 11 122 Cycle M 11.0000 352 33.00 +normal 11 123 Cycle M 10.0000 305 36.00 +normal 11 124 Cycle M 10.0000 312 48.00 +normal 11 125 Cycle M 11.0000 315 32.00 +normal 11 126 Cycle M 10.0000 347 42.00 +normal 11 127 Cycle M 10.0000 349 39.00 +normal 11 128 Cycle M 11.0000 302 19.00 +normal 11 129 Cycle M 10.0000 308 39.00 +normal 11 13 Cycle M 11.0000 139 7.00 +normal 11 130 Cycle M 11.0000 346 36.00 +normal 11 131 Cycle M 10.0000 331 37.00 +normal 11 132 Cycle M 10.0000 302 42.00 +normal 11 133 Cycle M 10.0000 339 45.00 +normal 11 134 Cycle M 10.0000 316 39.00 +normal 11 135 Cycle M 11.0000 356 38.00 +normal 11 136 Cycle M 11.0000 345 31.00 +normal 11 137 Cycle M 11.0000 303 28.00 +normal 11 138 Cycle M 11.0000 306 34.00 +normal 11 139 Cycle M 11.0000 286 31.00 +normal 11 14 Cycle M 11.0000 152 6.00 +normal 11 140 Cycle M 11.0000 319 33.00 +normal 11 141 Cycle M 10.0000 300 35.00 +normal 11 142 Cycle M 10.0000 315 36.00 +normal 11 143 Cycle M 11.0000 313 30.00 +normal 11 144 Cycle M 10.0000 298 37.00 +normal 11 145 Cycle M 10.0000 338 43.00 +normal 11 146 Cycle M 11.0000 299 26.00 +normal 11 147 Cycle M 11.0000 310 31.00 +normal 11 148 Cycle M 11.0000 294 33.00 +normal 11 149 Cycle M 11.0000 275 31.00 +normal 11 15 Cycle M 11.0000 141 5.00 +normal 11 150 Cycle M 11.0000 273 30.00 +normal 11 151 Cycle M 10.0000 257 33.00 +normal 11 16 Cycle M 11.0000 149 7.00 +normal 11 17 Cycle M 11.0000 151 9.00 +normal 11 18 Cycle M 11.0000 143 5.00 +normal 11 19 Cycle M 11.0000 138 6.00 +normal 11 2 Cycle M 11.0000 225 13.00 +normal 11 20 Cycle M 11.0000 142 3.00 +normal 11 21 Cycle M 11.0000 151 4.00 +normal 11 22 Cycle M 11.0000 144 4.00 +normal 11 23 Cycle M 11.0000 155 5.00 +normal 11 24 Cycle M 11.0000 169 5.00 +normal 11 25 Cycle M 11.0000 180 11.00 +normal 11 26 Cycle M 11.0000 199 14.00 +normal 11 27 Cycle M 11.0000 176 8.00 +normal 11 28 Cycle M 11.0000 181 16.00 +normal 11 29 Cycle M 11.0000 176 15.00 +normal 11 3 Cycle M 11.0000 183 17.00 +normal 11 30 Cycle M 11.0000 211 14.00 +normal 11 31 Cycle M 11.0000 191 13.00 +normal 11 32 Cycle M 11.0000 193 19.00 +normal 11 33 Cycle M 11.0000 200 20.00 +normal 11 34 Cycle M 11.0000 213 18.00 +normal 11 35 Cycle M 11.0000 212 12.00 +normal 11 36 Cycle M 11.0000 204 14.00 +normal 11 37 Cycle M 11.0000 191 15.00 +normal 11 38 Cycle M 11.0000 208 15.00 +normal 11 39 Cycle M 11.0000 214 17.00 +normal 11 4 Cycle M 11.0000 171 10.00 +normal 11 40 Cycle M 11.0000 198 19.00 +normal 11 41 Cycle M 11.0000 197 8.00 +normal 11 42 Cycle M 11.0000 225 24.00 +normal 11 43 Cycle M 11.0000 217 14.00 +normal 11 44 Cycle M 11.0000 206 12.00 +normal 11 45 Cycle M 11.0000 229 15.00 +normal 11 46 Cycle M 11.0000 208 16.00 +normal 11 47 Cycle M 11.0000 241 25.00 +normal 11 48 Cycle M 11.0000 228 21.00 +normal 11 49 Cycle M 11.0000 195 18.00 +normal 11 5 Cycle M 11.0000 150 5.00 +normal 11 50 Cycle M 11.0000 239 23.00 +normal 11 51 Cycle M 11.0000 236 27.00 +normal 11 52 Cycle M 11.0000 254 27.00 +normal 11 53 Cycle M 11.0000 245 28.00 +normal 11 54 Cycle M 10.0000 278 35.00 +normal 11 55 Cycle M 11.0000 238 20.00 +normal 11 56 Cycle M 11.0000 260 20.00 +normal 11 57 Cycle M 11.0000 230 26.00 +normal 11 58 Cycle M 11.0000 270 29.00 +normal 11 59 Cycle M 11.0000 246 28.00 +normal 11 6 Cycle M 11.0000 134 4.00 +normal 11 60 Cycle M 11.0000 271 19.00 +normal 11 61 Cycle M 11.0000 236 23.00 +normal 11 62 Cycle M 11.0000 244 23.00 +normal 11 63 Cycle M 11.0000 282 24.00 +normal 11 64 Cycle M 11.0000 229 22.00 +normal 11 65 Cycle M 11.0000 283 26.00 +normal 11 66 Cycle M 11.0000 264 27.00 +normal 11 67 Cycle M 11.0000 261 23.00 +normal 11 68 Cycle M 11.0000 293 28.00 +normal 11 69 Cycle M 11.0000 260 28.00 +normal 11 7 Cycle M 11.0000 165 5.00 +normal 11 70 Cycle M 11.0000 268 28.00 +normal 11 71 Cycle M 11.0000 253 29.00 +normal 11 72 Cycle M 11.0000 261 25.00 +normal 11 73 Cycle M 11.0000 255 27.00 +normal 11 74 Cycle M 11.0000 234 22.00 +normal 11 75 Cycle M 11.0000 249 21.00 +normal 11 76 Cycle M 11.0000 236 18.00 +normal 11 77 Cycle M 11.0000 255 21.00 +normal 11 78 Cycle M 11.0000 305 25.00 +normal 11 79 Cycle M 11.0000 278 31.00 +normal 11 8 Cycle M 11.0000 135 10.00 +normal 11 80 Cycle M 11.0000 304 34.00 +normal 11 81 Cycle M 11.0000 263 19.00 +normal 11 82 Cycle M 11.0000 263 27.00 +normal 11 83 Cycle M 11.0000 307 32.00 +normal 11 84 Cycle M 11.0000 314 28.00 +normal 11 85 Cycle M 10.0000 282 33.00 +normal 11 86 Cycle M 11.0000 298 26.00 +normal 11 87 Cycle M 11.0000 272 31.00 +normal 11 88 Cycle M 11.0000 301 32.00 +normal 11 89 Cycle M 11.0000 269 17.00 +normal 11 9 Cycle M 11.0000 174 8.00 +normal 11 90 Cycle M 11.0000 305 30.00 +normal 11 91 Cycle M 11.0000 323 29.00 +normal 11 92 Cycle M 10.0000 317 37.00 +normal 11 93 Cycle M 10.0000 315 44.00 +normal 11 94 Cycle M 11.0000 326 33.00 +normal 11 95 Cycle M 11.0000 275 22.00 +normal 11 96 Cycle M 11.0000 270 22.00 +normal 11 97 Cycle M 11.0000 317 34.00 +normal 11 98 Cycle M 11.0000 327 35.00 +normal 11 99 Cycle M 10.0000 308 36.00 +normal 11 AA Context M 8.0000 4395 691.00 +normal 11 AC Context M 10.0000 3117 346.00 +normal 11 AG Context M 14.0000 6166 213.00 +normal 11 AT Context M 11.0000 4828 421.00 +normal 11 CA Context M 7.0000 6906 1495.00 +normal 11 CC Context M 9.0000 5788 752.00 +normal 11 CG Context M 8.0000 3866 630.00 +normal 11 CT Context M 15.0000 8566 195.00 +normal 11 GA Context M 8.0000 4455 691.00 +normal 11 GC Context M 12.0000 4424 245.00 +normal 11 GG Context M 12.0000 6581 369.00 +normal 11 GT Context M 10.0000 5110 521.00 +normal 11 TA Context M 7.0000 3037 644.00 +normal 11 TC Context M 12.0000 3516 201.00 +normal 11 TG Context M 13.0000 4602 189.00 +normal 11 TT Context M 11.0000 4181 297.00 +normal 25 -1 Cycle M 25.0000 443 3.00 +normal 25 -10 Cycle M 25.0000 386 1.00 +normal 25 -100 Cycle M 25.0000 465 3.00 +normal 25 -101 Cycle M 25.0000 455 4.00 +normal 25 -102 Cycle M 25.0000 450 2.00 +normal 25 -103 Cycle M 25.0000 481 0.00 +normal 25 -104 Cycle M 25.0000 481 4.00 +normal 25 -105 Cycle M 25.0000 468 3.00 +normal 25 -106 Cycle M 25.0000 491 2.00 +normal 25 -107 Cycle M 25.0000 486 0.00 +normal 25 -108 Cycle M 25.0000 492 5.00 +normal 25 -109 Cycle M 25.0000 530 1.00 +normal 25 -11 Cycle M 25.0000 308 1.00 +normal 25 -110 Cycle M 25.0000 473 4.00 +normal 25 -111 Cycle M 25.0000 518 3.00 +normal 25 -112 Cycle M 25.0000 457 4.00 +normal 25 -113 Cycle M 25.0000 476 4.00 +normal 25 -114 Cycle M 25.0000 520 3.00 +normal 25 -115 Cycle M 25.0000 556 4.00 +normal 25 -116 Cycle M 25.0000 517 6.00 +normal 25 -117 Cycle M 25.0000 465 1.00 +normal 25 -118 Cycle M 25.0000 502 6.00 +normal 25 -119 Cycle M 25.0000 546 1.00 +normal 25 -12 Cycle M 25.0000 375 1.00 +normal 25 -120 Cycle M 25.0000 489 3.00 +normal 25 -121 Cycle M 25.0000 497 3.00 +normal 25 -122 Cycle M 25.0000 518 1.00 +normal 25 -123 Cycle M 25.0000 506 3.00 +normal 25 -124 Cycle M 25.0000 517 1.00 +normal 25 -125 Cycle M 25.0000 469 3.00 +normal 25 -126 Cycle M 25.0000 472 1.00 +normal 25 -127 Cycle M 25.0000 501 1.00 +normal 25 -128 Cycle M 25.0000 453 2.00 +normal 25 -129 Cycle M 25.0000 514 4.00 +normal 25 -13 Cycle M 25.0000 341 1.00 +normal 25 -130 Cycle M 25.0000 536 5.00 +normal 25 -131 Cycle M 25.0000 527 3.00 +normal 25 -132 Cycle M 25.0000 497 1.00 +normal 25 -133 Cycle M 25.0000 490 3.00 +normal 25 -134 Cycle M 25.0000 507 2.00 +normal 25 -135 Cycle M 25.0000 476 1.00 +normal 25 -136 Cycle M 25.0000 467 2.00 +normal 25 -137 Cycle M 25.0000 476 2.00 +normal 25 -138 Cycle M 25.0000 496 1.00 +normal 25 -139 Cycle M 25.0000 516 2.00 +normal 25 -14 Cycle M 25.0000 383 1.00 +normal 25 -140 Cycle M 25.0000 524 5.00 +normal 25 -141 Cycle M 25.0000 511 1.00 +normal 25 -142 Cycle M 25.0000 533 0.00 +normal 25 -143 Cycle M 25.0000 526 2.00 +normal 25 -144 Cycle M 25.0000 531 1.00 +normal 25 -145 Cycle M 25.0000 539 2.00 +normal 25 -146 Cycle M 25.0000 500 2.00 +normal 25 -147 Cycle M 25.0000 483 2.00 +normal 25 -148 Cycle M 25.0000 491 3.00 +normal 25 -149 Cycle M 25.0000 470 1.00 +normal 25 -15 Cycle M 25.0000 364 1.00 +normal 25 -150 Cycle M 25.0000 431 4.00 +normal 25 -151 Cycle M 25.0000 422 0.00 +normal 25 -16 Cycle M 25.0000 378 1.00 +normal 25 -17 Cycle M 25.0000 388 4.00 +normal 25 -18 Cycle M 25.0000 382 0.00 +normal 25 -19 Cycle M 25.0000 375 0.00 +normal 25 -2 Cycle M 25.0000 375 0.00 +normal 25 -20 Cycle M 25.0000 356 2.00 +normal 25 -21 Cycle M 25.0000 379 2.00 +normal 25 -22 Cycle M 25.0000 405 3.00 +normal 25 -23 Cycle M 25.0000 371 2.00 +normal 25 -24 Cycle M 25.0000 376 3.00 +normal 25 -25 Cycle M 25.0000 397 1.00 +normal 25 -26 Cycle M 25.0000 358 2.00 +normal 25 -27 Cycle M 25.0000 386 3.00 +normal 25 -28 Cycle M 25.0000 409 1.00 +normal 25 -29 Cycle M 25.0000 327 3.00 +normal 25 -3 Cycle M 25.0000 389 2.00 +normal 25 -30 Cycle M 25.0000 408 1.00 +normal 25 -31 Cycle M 25.0000 380 0.00 +normal 25 -32 Cycle M 25.0000 402 1.00 +normal 25 -33 Cycle M 25.0000 420 4.00 +normal 25 -34 Cycle M 25.0000 430 1.00 +normal 25 -35 Cycle M 25.0000 389 1.00 +normal 25 -36 Cycle M 25.0000 398 0.00 +normal 25 -37 Cycle M 25.0000 390 1.00 +normal 25 -38 Cycle M 25.0000 429 2.00 +normal 25 -39 Cycle M 25.0000 404 2.00 +normal 25 -4 Cycle M 25.0000 359 0.00 +normal 25 -40 Cycle M 25.0000 455 1.00 +normal 25 -41 Cycle M 25.0000 412 0.00 +normal 25 -42 Cycle M 25.0000 398 2.00 +normal 25 -43 Cycle M 25.0000 431 0.00 +normal 25 -44 Cycle M 25.0000 439 4.00 +normal 25 -45 Cycle M 25.0000 423 2.00 +normal 25 -46 Cycle M 25.0000 415 3.00 +normal 25 -47 Cycle M 25.0000 473 1.00 +normal 25 -48 Cycle M 25.0000 412 0.00 +normal 25 -49 Cycle M 25.0000 431 1.00 +normal 25 -5 Cycle M 25.0000 403 1.00 +normal 25 -50 Cycle M 25.0000 405 0.00 +normal 25 -51 Cycle M 25.0000 397 3.00 +normal 25 -52 Cycle M 25.0000 414 1.00 +normal 25 -53 Cycle M 25.0000 411 1.00 +normal 25 -54 Cycle M 25.0000 428 3.00 +normal 25 -55 Cycle M 25.0000 424 2.00 +normal 25 -56 Cycle M 25.0000 425 1.00 +normal 25 -57 Cycle M 25.0000 501 2.00 +normal 25 -58 Cycle M 25.0000 473 2.00 +normal 25 -59 Cycle M 25.0000 433 3.00 +normal 25 -6 Cycle M 25.0000 316 2.00 +normal 25 -60 Cycle M 25.0000 434 4.00 +normal 25 -61 Cycle M 25.0000 456 1.00 +normal 25 -62 Cycle M 25.0000 481 5.00 +normal 25 -63 Cycle M 25.0000 474 2.00 +normal 25 -64 Cycle M 25.0000 482 1.00 +normal 25 -65 Cycle M 25.0000 483 2.00 +normal 25 -66 Cycle M 25.0000 469 2.00 +normal 25 -67 Cycle M 25.0000 507 0.00 +normal 25 -68 Cycle M 25.0000 516 2.00 +normal 25 -69 Cycle M 25.0000 476 1.00 +normal 25 -7 Cycle M 25.0000 348 4.00 +normal 25 -70 Cycle M 25.0000 436 4.00 +normal 25 -71 Cycle M 25.0000 478 5.00 +normal 25 -72 Cycle M 25.0000 442 2.00 +normal 25 -73 Cycle M 25.0000 395 0.00 +normal 25 -74 Cycle M 25.0000 486 1.00 +normal 25 -75 Cycle M 25.0000 488 3.00 +normal 25 -76 Cycle M 25.0000 446 2.00 +normal 25 -77 Cycle M 25.0000 449 3.00 +normal 25 -78 Cycle M 25.0000 478 4.00 +normal 25 -79 Cycle M 25.0000 516 3.00 +normal 25 -8 Cycle M 25.0000 367 1.00 +normal 25 -80 Cycle M 25.0000 471 5.00 +normal 25 -81 Cycle M 25.0000 456 5.00 +normal 25 -82 Cycle M 25.0000 492 2.00 +normal 25 -83 Cycle M 25.0000 473 5.00 +normal 25 -84 Cycle M 25.0000 498 3.00 +normal 25 -85 Cycle M 25.0000 538 2.00 +normal 25 -86 Cycle M 25.0000 523 3.00 +normal 25 -87 Cycle M 25.0000 476 3.00 +normal 25 -88 Cycle M 25.0000 455 3.00 +normal 25 -89 Cycle M 25.0000 509 1.00 +normal 25 -9 Cycle M 25.0000 319 1.00 +normal 25 -90 Cycle M 25.0000 537 2.00 +normal 25 -91 Cycle M 25.0000 499 2.00 +normal 25 -92 Cycle M 25.0000 503 1.00 +normal 25 -93 Cycle M 25.0000 452 3.00 +normal 25 -94 Cycle M 25.0000 495 5.00 +normal 25 -95 Cycle M 25.0000 473 0.00 +normal 25 -96 Cycle M 25.0000 471 1.00 +normal 25 -97 Cycle M 25.0000 467 1.00 +normal 25 -98 Cycle M 25.0000 465 4.00 +normal 25 -99 Cycle M 25.0000 466 0.00 +normal 25 1 Cycle M 25.0000 365 0.00 +normal 25 10 Cycle M 25.0000 355 0.00 +normal 25 100 Cycle M 25.0000 444 2.00 +normal 25 101 Cycle M 25.0000 515 4.00 +normal 25 102 Cycle M 25.0000 442 2.00 +normal 25 103 Cycle M 25.0000 490 1.00 +normal 25 104 Cycle M 25.0000 482 1.00 +normal 25 105 Cycle M 25.0000 556 5.00 +normal 25 106 Cycle M 25.0000 491 3.00 +normal 25 107 Cycle M 25.0000 484 3.00 +normal 25 108 Cycle M 25.0000 502 1.00 +normal 25 109 Cycle M 25.0000 490 2.00 +normal 25 11 Cycle M 25.0000 293 1.00 +normal 25 110 Cycle M 25.0000 448 2.00 +normal 25 111 Cycle M 25.0000 469 4.00 +normal 25 112 Cycle M 25.0000 534 4.00 +normal 25 113 Cycle M 25.0000 491 4.00 +normal 25 114 Cycle M 25.0000 454 2.00 +normal 25 115 Cycle M 25.0000 512 4.00 +normal 25 116 Cycle M 25.0000 495 5.00 +normal 25 117 Cycle M 25.0000 445 1.00 +normal 25 118 Cycle M 25.0000 477 3.00 +normal 25 119 Cycle M 25.0000 486 3.00 +normal 25 12 Cycle M 25.0000 322 0.00 +normal 25 120 Cycle M 25.0000 506 2.00 +normal 25 121 Cycle M 25.0000 451 0.00 +normal 25 122 Cycle M 25.0000 508 3.00 +normal 25 123 Cycle M 25.0000 461 5.00 +normal 25 124 Cycle M 25.0000 492 3.00 +normal 25 125 Cycle M 25.0000 474 0.00 +normal 25 126 Cycle M 25.0000 478 2.00 +normal 25 127 Cycle M 25.0000 509 4.00 +normal 25 128 Cycle M 25.0000 440 1.00 +normal 25 129 Cycle M 25.0000 481 2.00 +normal 25 13 Cycle M 25.0000 305 1.00 +normal 25 130 Cycle M 25.0000 475 1.00 +normal 25 131 Cycle M 25.0000 467 3.00 +normal 25 132 Cycle M 25.0000 466 4.00 +normal 25 133 Cycle M 25.0000 514 1.00 +normal 25 134 Cycle M 25.0000 523 1.00 +normal 25 135 Cycle M 25.0000 509 8.00 +normal 25 136 Cycle M 25.0000 451 1.00 +normal 25 137 Cycle M 25.0000 477 2.00 +normal 25 138 Cycle M 25.0000 474 1.00 +normal 25 139 Cycle M 25.0000 483 3.00 +normal 25 14 Cycle M 25.0000 328 1.00 +normal 25 140 Cycle M 25.0000 501 6.00 +normal 25 141 Cycle M 25.0000 478 4.00 +normal 25 142 Cycle M 25.0000 477 0.00 +normal 25 143 Cycle M 25.0000 490 1.00 +normal 25 144 Cycle M 25.0000 457 3.00 +normal 25 145 Cycle M 25.0000 468 2.00 +normal 25 146 Cycle M 25.0000 455 0.00 +normal 25 147 Cycle M 25.0000 460 6.00 +normal 25 148 Cycle M 25.0000 466 3.00 +normal 25 149 Cycle M 25.0000 480 3.00 +normal 25 15 Cycle M 25.0000 306 1.00 +normal 25 150 Cycle M 25.0000 428 0.00 +normal 25 151 Cycle M 25.0000 389 2.00 +normal 25 16 Cycle M 25.0000 336 0.00 +normal 25 17 Cycle M 25.0000 340 0.00 +normal 25 18 Cycle M 25.0000 321 0.00 +normal 25 19 Cycle M 25.0000 312 1.00 +normal 25 2 Cycle M 25.0000 329 2.00 +normal 25 20 Cycle M 25.0000 345 1.00 +normal 25 21 Cycle M 25.0000 362 0.00 +normal 25 22 Cycle M 25.0000 324 1.00 +normal 25 23 Cycle M 25.0000 326 1.00 +normal 25 24 Cycle M 25.0000 328 0.00 +normal 25 25 Cycle M 25.0000 340 2.00 +normal 25 26 Cycle M 25.0000 313 0.00 +normal 25 27 Cycle M 25.0000 339 0.00 +normal 25 28 Cycle M 25.0000 352 2.00 +normal 25 29 Cycle M 25.0000 341 5.00 +normal 25 3 Cycle M 25.0000 360 0.00 +normal 25 30 Cycle M 25.0000 373 1.00 +normal 25 31 Cycle M 25.0000 352 1.00 +normal 25 32 Cycle M 25.0000 358 2.00 +normal 25 33 Cycle M 25.0000 354 2.00 +normal 25 34 Cycle M 25.0000 375 1.00 +normal 25 35 Cycle M 25.0000 402 1.00 +normal 25 36 Cycle M 25.0000 357 0.00 +normal 25 37 Cycle M 25.0000 373 1.00 +normal 25 38 Cycle M 25.0000 390 2.00 +normal 25 39 Cycle M 25.0000 350 2.00 +normal 25 4 Cycle M 25.0000 323 0.00 +normal 25 40 Cycle M 25.0000 365 0.00 +normal 25 41 Cycle M 25.0000 380 1.00 +normal 25 42 Cycle M 25.0000 399 0.00 +normal 25 43 Cycle M 25.0000 375 2.00 +normal 25 44 Cycle M 25.0000 385 2.00 +normal 25 45 Cycle M 25.0000 380 0.00 +normal 25 46 Cycle M 25.0000 362 4.00 +normal 25 47 Cycle M 25.0000 403 2.00 +normal 25 48 Cycle M 25.0000 394 1.00 +normal 25 49 Cycle M 25.0000 420 1.00 +normal 25 5 Cycle M 25.0000 323 2.00 +normal 25 50 Cycle M 25.0000 393 0.00 +normal 25 51 Cycle M 25.0000 368 1.00 +normal 25 52 Cycle M 25.0000 409 2.00 +normal 25 53 Cycle M 25.0000 371 1.00 +normal 25 54 Cycle M 25.0000 419 4.00 +normal 25 55 Cycle M 25.0000 432 2.00 +normal 25 56 Cycle M 25.0000 428 1.00 +normal 25 57 Cycle M 25.0000 424 1.00 +normal 25 58 Cycle M 25.0000 418 0.00 +normal 25 59 Cycle M 25.0000 476 1.00 +normal 25 6 Cycle M 25.0000 317 1.00 +normal 25 60 Cycle M 25.0000 435 2.00 +normal 25 61 Cycle M 25.0000 411 1.00 +normal 25 62 Cycle M 25.0000 415 3.00 +normal 25 63 Cycle M 25.0000 416 3.00 +normal 25 64 Cycle M 25.0000 409 1.00 +normal 25 65 Cycle M 25.0000 410 4.00 +normal 25 66 Cycle M 25.0000 439 1.00 +normal 25 67 Cycle M 25.0000 431 2.00 +normal 25 68 Cycle M 25.0000 463 2.00 +normal 25 69 Cycle M 25.0000 442 1.00 +normal 25 7 Cycle M 25.0000 318 1.00 +normal 25 70 Cycle M 25.0000 446 2.00 +normal 25 71 Cycle M 25.0000 456 2.00 +normal 25 72 Cycle M 25.0000 418 4.00 +normal 25 73 Cycle M 25.0000 459 2.00 +normal 25 74 Cycle M 25.0000 452 2.00 +normal 25 75 Cycle M 25.0000 444 1.00 +normal 25 76 Cycle M 25.0000 463 1.00 +normal 25 77 Cycle M 25.0000 467 2.00 +normal 25 78 Cycle M 25.0000 459 3.00 +normal 25 79 Cycle M 25.0000 473 1.00 +normal 25 8 Cycle M 25.0000 313 3.00 +normal 25 80 Cycle M 25.0000 494 2.00 +normal 25 81 Cycle M 25.0000 435 2.00 +normal 25 82 Cycle M 25.0000 422 2.00 +normal 25 83 Cycle M 25.0000 471 3.00 +normal 25 84 Cycle M 25.0000 463 0.00 +normal 25 85 Cycle M 25.0000 493 3.00 +normal 25 86 Cycle M 25.0000 458 0.00 +normal 25 87 Cycle M 25.0000 443 0.00 +normal 25 88 Cycle M 25.0000 442 2.00 +normal 25 89 Cycle M 25.0000 498 0.00 +normal 25 9 Cycle M 25.0000 308 2.00 +normal 25 90 Cycle M 25.0000 466 1.00 +normal 25 91 Cycle M 25.0000 498 0.00 +normal 25 92 Cycle M 25.0000 469 0.00 +normal 25 93 Cycle M 25.0000 475 4.00 +normal 25 94 Cycle M 25.0000 460 0.00 +normal 25 95 Cycle M 25.0000 479 0.00 +normal 25 96 Cycle M 25.0000 451 3.00 +normal 25 97 Cycle M 25.0000 459 3.00 +normal 25 98 Cycle M 25.0000 484 3.00 +normal 25 99 Cycle M 25.0000 499 3.00 +normal 25 AA Context M 23.0000 8191 67.00 +normal 25 AC Context M 24.0000 4921 29.00 +normal 25 AG Context M 25.0000 9122 26.00 +normal 25 AT Context M 25.0000 7783 25.00 +normal 25 CA Context M 22.0000 11018 110.00 +normal 25 CC Context M 23.0000 10079 76.00 +normal 25 CG Context M 24.0000 5097 36.00 +normal 25 CT Context M 26.0000 12765 26.00 +normal 25 GA Context M 24.0000 8689 45.00 +normal 25 GC Context M 26.0000 8589 12.00 +normal 25 GG Context M 26.0000 11109 16.00 +normal 25 GT Context M 25.0000 8168 25.00 +normal 25 TA Context M 24.0000 5772 33.00 +normal 25 TC Context M 25.0000 5671 13.00 +normal 25 TG Context M 25.0000 7675 27.00 +normal 25 TT Context M 25.0000 7071 19.00 +normal 37 -1 Cycle M 35.0000 9315 42.00 +normal 37 -10 Cycle M 35.0000 9402 37.00 +normal 37 -100 Cycle M 37.0000 8691 7.00 +normal 37 -101 Cycle M 37.0000 8686 7.00 +normal 37 -102 Cycle M 36.0000 8673 19.00 +normal 37 -103 Cycle M 36.0000 8645 11.00 +normal 37 -104 Cycle M 36.0000 8617 15.00 +normal 37 -105 Cycle M 37.0000 8544 5.00 +normal 37 -106 Cycle M 37.0000 8520 2.00 +normal 37 -107 Cycle M 37.0000 8481 9.00 +normal 37 -108 Cycle M 36.0000 8455 15.00 +normal 37 -109 Cycle M 37.0000 8394 8.00 +normal 37 -11 Cycle M 35.0000 9528 33.00 +normal 37 -110 Cycle M 36.0000 8415 10.00 +normal 37 -111 Cycle M 36.0000 8335 10.00 +normal 37 -112 Cycle M 36.0000 8382 17.00 +normal 37 -113 Cycle M 36.0000 8345 13.00 +normal 37 -114 Cycle M 36.0000 8260 10.00 +normal 37 -115 Cycle M 36.0000 8160 11.00 +normal 37 -116 Cycle M 36.0000 8208 16.00 +normal 37 -117 Cycle M 37.0000 8214 9.00 +normal 37 -118 Cycle M 37.0000 8151 4.00 +normal 37 -119 Cycle M 37.0000 8060 9.00 +normal 37 -12 Cycle M 36.0000 9431 23.00 +normal 37 -120 Cycle M 37.0000 8128 9.00 +normal 37 -121 Cycle M 37.0000 8073 9.00 +normal 37 -122 Cycle M 36.0000 8014 16.00 +normal 37 -123 Cycle M 37.0000 8001 3.00 +normal 37 -124 Cycle M 37.0000 7947 5.00 +normal 37 -125 Cycle M 36.0000 7970 15.00 +normal 37 -126 Cycle M 37.0000 7941 2.00 +normal 37 -127 Cycle M 36.0000 7901 15.00 +normal 37 -128 Cycle M 37.0000 7854 1.00 +normal 37 -129 Cycle M 36.0000 7783 10.00 +normal 37 -13 Cycle M 36.0000 9494 21.00 +normal 37 -130 Cycle M 36.0000 7687 11.00 +normal 37 -131 Cycle M 36.0000 7680 16.00 +normal 37 -132 Cycle M 36.0000 7711 12.00 +normal 37 -133 Cycle M 36.0000 7689 11.00 +normal 37 -134 Cycle M 37.0000 7634 8.00 +normal 37 -135 Cycle M 36.0000 7636 12.00 +normal 37 -136 Cycle M 36.0000 7566 10.00 +normal 37 -137 Cycle M 37.0000 7477 7.00 +normal 37 -138 Cycle M 36.0000 7430 11.00 +normal 37 -139 Cycle M 37.0000 7388 6.00 +normal 37 -14 Cycle M 36.0000 9436 23.00 +normal 37 -140 Cycle M 36.0000 7261 10.00 +normal 37 -141 Cycle M 36.0000 7266 16.00 +normal 37 -142 Cycle M 37.0000 7207 4.00 +normal 37 -143 Cycle M 37.0000 7166 6.00 +normal 37 -144 Cycle M 36.0000 7099 14.00 +normal 37 -145 Cycle M 36.0000 7046 11.00 +normal 37 -146 Cycle M 35.0000 7039 32.00 +normal 37 -147 Cycle M 37.0000 7023 7.00 +normal 37 -148 Cycle M 36.0000 6979 17.00 +normal 37 -149 Cycle M 37.0000 6701 7.00 +normal 37 -15 Cycle M 36.0000 9460 15.00 +normal 37 -150 Cycle M 36.0000 6124 11.00 +normal 37 -151 Cycle M 36.0000 3973 14.00 +normal 37 -16 Cycle M 36.0000 9439 11.00 +normal 37 -17 Cycle M 35.0000 9433 31.00 +normal 37 -18 Cycle M 36.0000 9442 14.00 +normal 37 -19 Cycle M 36.0000 9462 22.00 +normal 37 -2 Cycle M 35.0000 9446 37.00 +normal 37 -20 Cycle M 37.0000 9449 6.00 +normal 37 -21 Cycle M 36.0000 9416 17.00 +normal 37 -22 Cycle M 36.0000 9404 15.00 +normal 37 -23 Cycle M 36.0000 9427 17.00 +normal 37 -24 Cycle M 37.0000 9437 8.00 +normal 37 -25 Cycle M 36.0000 9398 22.00 +normal 37 -26 Cycle M 37.0000 9433 6.00 +normal 37 -27 Cycle M 36.0000 9418 12.00 +normal 37 -28 Cycle M 36.0000 9388 13.00 +normal 37 -29 Cycle M 36.0000 9474 17.00 +normal 37 -3 Cycle M 36.0000 9446 19.00 +normal 37 -30 Cycle M 36.0000 9380 17.00 +normal 37 -31 Cycle M 36.0000 9397 18.00 +normal 37 -32 Cycle M 36.0000 9374 16.00 +normal 37 -33 Cycle M 36.0000 9365 13.00 +normal 37 -34 Cycle M 36.0000 9369 19.00 +normal 37 -35 Cycle M 35.0000 9367 31.00 +normal 37 -36 Cycle M 36.0000 9383 19.00 +normal 37 -37 Cycle M 35.0000 9384 37.00 +normal 37 -38 Cycle M 37.0000 9344 5.00 +normal 37 -39 Cycle M 36.0000 9370 12.00 +normal 37 -4 Cycle M 36.0000 9473 23.00 +normal 37 -40 Cycle M 36.0000 9315 11.00 +normal 37 -41 Cycle M 36.0000 9360 15.00 +normal 37 -42 Cycle M 36.0000 9359 25.00 +normal 37 -43 Cycle M 37.0000 9323 8.00 +normal 37 -44 Cycle M 37.0000 9321 7.00 +normal 37 -45 Cycle M 36.0000 9308 15.00 +normal 37 -46 Cycle M 37.0000 9324 8.00 +normal 37 -47 Cycle M 36.0000 9266 13.00 +normal 37 -48 Cycle M 35.0000 9327 29.00 +normal 37 -49 Cycle M 36.0000 9320 26.00 +normal 37 -5 Cycle M 35.0000 9402 45.00 +normal 37 -50 Cycle M 36.0000 9340 24.00 +normal 37 -51 Cycle M 37.0000 9352 8.00 +normal 37 -52 Cycle M 36.0000 9315 17.00 +normal 37 -53 Cycle M 36.0000 9315 10.00 +normal 37 -54 Cycle M 36.0000 9326 13.00 +normal 37 -55 Cycle M 36.0000 9268 12.00 +normal 37 -56 Cycle M 36.0000 9251 14.00 +normal 37 -57 Cycle M 36.0000 9236 22.00 +normal 37 -58 Cycle M 37.0000 9225 7.00 +normal 37 -59 Cycle M 36.0000 9257 11.00 +normal 37 -6 Cycle M 34.0000 9497 51.00 +normal 37 -60 Cycle M 36.0000 9279 12.00 +normal 37 -61 Cycle M 36.0000 9235 13.00 +normal 37 -62 Cycle M 36.0000 9220 26.00 +normal 37 -63 Cycle M 36.0000 9238 13.00 +normal 37 -64 Cycle M 36.0000 9186 13.00 +normal 37 -65 Cycle M 36.0000 9122 21.00 +normal 37 -66 Cycle M 36.0000 9185 17.00 +normal 37 -67 Cycle M 36.0000 9153 15.00 +normal 37 -68 Cycle M 37.0000 9114 6.00 +normal 37 -69 Cycle M 36.0000 9183 11.00 +normal 37 -7 Cycle M 36.0000 9472 19.00 +normal 37 -70 Cycle M 36.0000 9192 15.00 +normal 37 -71 Cycle M 37.0000 9153 6.00 +normal 37 -72 Cycle M 36.0000 9187 16.00 +normal 37 -73 Cycle M 36.0000 9224 10.00 +normal 37 -74 Cycle M 37.0000 9099 7.00 +normal 37 -75 Cycle M 37.0000 9126 7.00 +normal 37 -76 Cycle M 37.0000 9175 9.00 +normal 37 -77 Cycle M 36.0000 9151 15.00 +normal 37 -78 Cycle M 36.0000 9079 16.00 +normal 37 -79 Cycle M 36.0000 9052 13.00 +normal 37 -8 Cycle M 34.0000 9467 54.00 +normal 37 -80 Cycle M 37.0000 9054 8.00 +normal 37 -81 Cycle M 36.0000 9076 18.00 +normal 37 -82 Cycle M 36.0000 9020 13.00 +normal 37 -83 Cycle M 36.0000 9046 10.00 +normal 37 -84 Cycle M 36.0000 8977 20.00 +normal 37 -85 Cycle M 37.0000 8968 6.00 +normal 37 -86 Cycle M 36.0000 8898 13.00 +normal 37 -87 Cycle M 37.0000 8941 9.00 +normal 37 -88 Cycle M 36.0000 8959 11.00 +normal 37 -89 Cycle M 36.0000 8896 14.00 +normal 37 -9 Cycle M 36.0000 9516 18.00 +normal 37 -90 Cycle M 37.0000 8804 2.00 +normal 37 -91 Cycle M 36.0000 8862 13.00 +normal 37 -92 Cycle M 36.0000 8879 11.00 +normal 37 -93 Cycle M 36.0000 8885 10.00 +normal 37 -94 Cycle M 36.0000 8803 16.00 +normal 37 -95 Cycle M 37.0000 8817 9.00 +normal 37 -96 Cycle M 37.0000 8845 7.00 +normal 37 -97 Cycle M 36.0000 8777 11.00 +normal 37 -98 Cycle M 36.0000 8757 12.00 +normal 37 -99 Cycle M 36.0000 8739 27.00 +normal 37 1 Cycle M 34.0000 9431 49.00 +normal 37 10 Cycle M 35.0000 9510 29.00 +normal 37 100 Cycle M 36.0000 8709 12.00 +normal 37 101 Cycle M 37.0000 8634 7.00 +normal 37 102 Cycle M 36.0000 8694 17.00 +normal 37 103 Cycle M 36.0000 8620 10.00 +normal 37 104 Cycle M 36.0000 8597 18.00 +normal 37 105 Cycle M 37.0000 8453 5.00 +normal 37 106 Cycle M 37.0000 8465 9.00 +normal 37 107 Cycle M 36.0000 8504 20.00 +normal 37 108 Cycle M 36.0000 8440 12.00 +normal 37 109 Cycle M 37.0000 8420 6.00 +normal 37 11 Cycle M 36.0000 9586 27.00 +normal 37 110 Cycle M 36.0000 8443 11.00 +normal 37 111 Cycle M 37.0000 8393 9.00 +normal 37 112 Cycle M 36.0000 8346 14.00 +normal 37 113 Cycle M 36.0000 8334 11.00 +normal 37 114 Cycle M 36.0000 8307 12.00 +normal 37 115 Cycle M 36.0000 8237 10.00 +normal 37 116 Cycle M 36.0000 8237 14.00 +normal 37 117 Cycle M 37.0000 8266 8.00 +normal 37 118 Cycle M 36.0000 8142 13.00 +normal 37 119 Cycle M 36.0000 8137 13.00 +normal 37 12 Cycle M 35.0000 9540 29.00 +normal 37 120 Cycle M 36.0000 8127 15.00 +normal 37 121 Cycle M 36.0000 8133 14.00 +normal 37 122 Cycle M 36.0000 8022 10.00 +normal 37 123 Cycle M 37.0000 8080 5.00 +normal 37 124 Cycle M 36.0000 8013 10.00 +normal 37 125 Cycle M 36.0000 7999 16.00 +normal 37 126 Cycle M 37.0000 7932 8.00 +normal 37 127 Cycle M 36.0000 7849 10.00 +normal 37 128 Cycle M 37.0000 7926 4.00 +normal 37 129 Cycle M 36.0000 7840 10.00 +normal 37 13 Cycle M 36.0000 9578 18.00 +normal 37 130 Cycle M 37.0000 7780 7.00 +normal 37 131 Cycle M 36.0000 7763 15.00 +normal 37 132 Cycle M 37.0000 7759 9.00 +normal 37 133 Cycle M 36.0000 7649 11.00 +normal 37 134 Cycle M 37.0000 7627 5.00 +normal 37 135 Cycle M 36.0000 7563 10.00 +normal 37 136 Cycle M 36.0000 7582 12.00 +normal 37 137 Cycle M 37.0000 7543 9.00 +normal 37 138 Cycle M 37.0000 7494 2.00 +normal 37 139 Cycle M 37.0000 7457 5.00 +normal 37 14 Cycle M 36.0000 9542 23.00 +normal 37 140 Cycle M 36.0000 7355 10.00 +normal 37 141 Cycle M 36.0000 7361 22.00 +normal 37 142 Cycle M 37.0000 7297 7.00 +normal 37 143 Cycle M 36.0000 7248 10.00 +normal 37 144 Cycle M 36.0000 7253 16.00 +normal 37 145 Cycle M 36.0000 7152 18.00 +normal 37 146 Cycle M 36.0000 7149 23.00 +normal 37 147 Cycle M 37.0000 7086 5.00 +normal 37 148 Cycle M 36.0000 7016 17.00 +normal 37 149 Cycle M 37.0000 6806 8.00 +normal 37 15 Cycle M 36.0000 9575 26.00 +normal 37 150 Cycle M 36.0000 6226 13.00 +normal 37 151 Cycle M 36.0000 4100 10.00 +normal 37 16 Cycle M 36.0000 9537 19.00 +normal 37 17 Cycle M 36.0000 9531 23.00 +normal 37 18 Cycle M 36.0000 9558 10.00 +normal 37 19 Cycle M 36.0000 9572 21.00 +normal 37 2 Cycle M 35.0000 9468 29.00 +normal 37 20 Cycle M 36.0000 9535 13.00 +normal 37 21 Cycle M 36.0000 9509 19.00 +normal 37 22 Cycle M 36.0000 9554 17.00 +normal 37 23 Cycle M 36.0000 9541 15.00 +normal 37 24 Cycle M 36.0000 9525 10.00 +normal 37 25 Cycle M 36.0000 9502 16.00 +normal 37 26 Cycle M 36.0000 9509 15.00 +normal 37 27 Cycle M 36.0000 9506 16.00 +normal 37 28 Cycle M 36.0000 9486 12.00 +normal 37 29 Cycle M 36.0000 9502 23.00 +normal 37 3 Cycle M 36.0000 9479 15.00 +normal 37 30 Cycle M 36.0000 9435 21.00 +normal 37 31 Cycle M 36.0000 9476 14.00 +normal 37 32 Cycle M 36.0000 9468 19.00 +normal 37 33 Cycle M 37.0000 9465 9.00 +normal 37 34 Cycle M 36.0000 9431 25.00 +normal 37 35 Cycle M 36.0000 9405 26.00 +normal 37 36 Cycle M 36.0000 9457 16.00 +normal 37 37 Cycle M 36.0000 9454 22.00 +normal 37 38 Cycle M 37.0000 9419 9.00 +normal 37 39 Cycle M 36.0000 9453 14.00 +normal 37 4 Cycle M 36.0000 9528 23.00 +normal 37 40 Cycle M 36.0000 9453 18.00 +normal 37 41 Cycle M 36.0000 9439 16.00 +normal 37 42 Cycle M 36.0000 9391 24.00 +normal 37 43 Cycle M 36.0000 9423 20.00 +normal 37 44 Cycle M 37.0000 9422 7.00 +normal 37 45 Cycle M 36.0000 9404 18.00 +normal 37 46 Cycle M 37.0000 9442 9.00 +normal 37 47 Cycle M 35.0000 9366 30.00 +normal 37 48 Cycle M 36.0000 9387 23.00 +normal 37 49 Cycle M 36.0000 9394 19.00 +normal 37 5 Cycle M 35.0000 9549 45.00 +normal 37 50 Cycle M 35.0000 9376 31.00 +normal 37 51 Cycle M 37.0000 9403 4.00 +normal 37 52 Cycle M 36.0000 9343 15.00 +normal 37 53 Cycle M 36.0000 9388 11.00 +normal 37 54 Cycle M 36.0000 9305 14.00 +normal 37 55 Cycle M 36.0000 9331 25.00 +normal 37 56 Cycle M 36.0000 9311 14.00 +normal 37 57 Cycle M 36.0000 9341 13.00 +normal 37 58 Cycle M 36.0000 9303 14.00 +normal 37 59 Cycle M 36.0000 9268 16.00 +normal 37 6 Cycle M 35.0000 9571 41.00 +normal 37 60 Cycle M 36.0000 9282 10.00 +normal 37 61 Cycle M 36.0000 9341 19.00 +normal 37 62 Cycle M 35.0000 9327 35.00 +normal 37 63 Cycle M 37.0000 9283 9.00 +normal 37 64 Cycle M 36.0000 9333 11.00 +normal 37 65 Cycle M 36.0000 9269 11.00 +normal 37 66 Cycle M 36.0000 9249 17.00 +normal 37 67 Cycle M 36.0000 9256 25.00 +normal 37 68 Cycle M 36.0000 9188 11.00 +normal 37 69 Cycle M 36.0000 9237 15.00 +normal 37 7 Cycle M 36.0000 9539 22.00 +normal 37 70 Cycle M 36.0000 9223 10.00 +normal 37 71 Cycle M 36.0000 9222 11.00 +normal 37 72 Cycle M 36.0000 9246 16.00 +normal 37 73 Cycle M 36.0000 9200 11.00 +normal 37 74 Cycle M 36.0000 9218 14.00 +normal 37 75 Cycle M 37.0000 9200 8.00 +normal 37 76 Cycle M 36.0000 9187 12.00 +normal 37 77 Cycle M 36.0000 9151 14.00 +normal 37 78 Cycle M 36.0000 9096 10.00 +normal 37 79 Cycle M 37.0000 9103 7.00 +normal 37 8 Cycle M 34.0000 9574 52.00 +normal 37 80 Cycle M 37.0000 9050 7.00 +normal 37 81 Cycle M 36.0000 9141 12.00 +normal 37 82 Cycle M 37.0000 9147 9.00 +normal 37 83 Cycle M 36.0000 9045 12.00 +normal 37 84 Cycle M 36.0000 9018 11.00 +normal 37 85 Cycle M 36.0000 9003 11.00 +normal 37 86 Cycle M 36.0000 9001 12.00 +normal 37 87 Cycle M 37.0000 9023 5.00 +normal 37 88 Cycle M 36.0000 8982 11.00 +normal 37 89 Cycle M 36.0000 8949 12.00 +normal 37 9 Cycle M 36.0000 9540 19.00 +normal 37 90 Cycle M 36.0000 8931 12.00 +normal 37 91 Cycle M 37.0000 8864 8.00 +normal 37 92 Cycle M 36.0000 8883 16.00 +normal 37 93 Cycle M 36.0000 8869 10.00 +normal 37 94 Cycle M 36.0000 8854 18.00 +normal 37 95 Cycle M 36.0000 8865 15.00 +normal 37 96 Cycle M 37.0000 8871 9.00 +normal 37 97 Cycle M 36.0000 8783 16.00 +normal 37 98 Cycle M 36.0000 8723 12.00 +normal 37 99 Cycle M 36.0000 8698 20.00 +normal 37 AA Context M 30.0000 118778 229.00 +normal 37 AC Context M 32.0000 131768 182.00 +normal 37 AG Context M 29.0000 195076 376.00 +normal 37 AT Context M 32.0000 110579 145.00 +normal 37 CA Context M 30.0000 180677 289.00 +normal 37 CC Context M 28.0000 274062 556.00 +normal 37 CG Context M 28.0000 103541 333.00 +normal 37 CT Context M 30.0000 195552 296.00 +normal 37 GA Context M 31.0000 148632 227.00 +normal 37 GC Context M 35.0000 209944 101.00 +normal 37 GG Context M 33.0000 270736 212.00 +normal 37 GT Context M 31.0000 123976 220.00 +normal 37 TA Context M 33.0000 106100 113.00 +normal 37 TC Context M 30.0000 154473 259.00 +normal 37 TG Context M 25.0000 184288 764.00 +normal 37 TT Context M 34.0000 120882 89.00 +