/* Description: 量化质量分数 Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2025/12/26 */ #pragma once #include #include #include #include "qual_utils.h" using std::set; using std::vector; /** * A general algorithm for quantizing quality score distributions to use a specific number of levels * * Takes a histogram of quality scores and a desired number of levels and produces a * map from original quality scores -> quantized quality scores. * * Note that this data structure is fairly heavy-weight, holding lots of debugging and * calculation information. If you want to use it efficiently at scale with lots of * read groups the right way to do this: * * Map> map * for each read group rg: * hist = getQualHist(rg) * QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual) * map.set(rg, qq.getOriginalToQuantizedMap()) * * This map would then be used to look up the appropriate original -> quantized * quals for each read as it comes in. */ struct QualQuantizer { /** * Represents a contiguous interval of quality scores. * * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2 */ struct QualInterval { int qStart, qEnd, fixedQual, level; int64_t nObservations, nErrors; set subIntervals; /** for debugging / visualization. When was this interval created? */ int mergeOrder; void init(const int _qStart, const int _qEnd, const int64_t _nObservations, const int64_t _nErrors, const int _level, const int _fixedQual) { qStart = _qStart; qEnd = _qEnd; nObservations = _nObservations; nErrors = _nErrors; level = _level; fixedQual = _fixedQual; } QualInterval() { qStart = -1; qEnd = -1; nObservations = -1; nErrors = -1; fixedQual = -1; level = -1; mergeOrder = 0; } QualInterval(const int qStart, const int qEnd, const int64_t nObservations, const int64_t nErrors, const int level) { init(qStart, qEnd, nObservations, nErrors, level, -1); } QualInterval(const int qStart, const int qEnd, const int64_t nObservations, const int64_t nErrors, const int level, const set& _subIntervals) { init(qStart, qEnd, nObservations, nErrors, level, -1); subIntervals = _subIntervals; } QualInterval(const int qStart, const int qEnd, const int64_t nObservations, const int64_t nErrors, const int level, const int fixedQual) { init(qStart, qEnd, nObservations, nErrors, level, fixedQual); } QualInterval(const int qStart, const int qEnd, const int64_t nObservations, const int64_t nErrors, const int level, const int fixedQual, const set& _subIntervals) { init(qStart, qEnd, nObservations, nErrors, level, fixedQual); subIntervals = _subIntervals; } /** * @return Human readable name of this interval: e.g., 10-12 */ string getName() const { return std::to_string(qStart) + "-" + std::to_string(qEnd); } string toString() const { return "QQ:" + getName(); } /** * @return true if this bin is using a fixed qual */ bool hasFixedQual() const { return fixedQual != -1; } /** * @return the error rate (in real space) of this interval, or 0 if there are no observations */ double getErrorRate() const { if (hasFixedQual()) return QualityUtils::qualToErrorProb((uint8_t)fixedQual); else if (nObservations == 0) return 0.0; else return (nErrors + 1) / (1.0 * (nObservations + 1)); } /** * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. */ uint8_t getQual() const { if (!hasFixedQual()) return QualityUtils::errorProbToQual(getErrorRate()); else return (uint8_t)fixedQual; } int compareTo(const QualInterval& qi) const { return qStart < qi.qStart ? -1 : (qStart == qi.qStart ? 0 : 1); } /** * Create a interval representing the merge of this interval and toMerge * * Errors and observations are combined * Subintervals updated in order of left to right (determined by qStart) * Level is 1 + highest level of this and toMerge * Order must be updated elsewhere * * @param toMerge * @return newly created merged QualInterval */ QualInterval merge(const QualInterval& toMerge) const { const QualInterval &left = this->compareTo(toMerge) < 0 ? *this : toMerge; const QualInterval &right = this->compareTo(toMerge) < 0 ? toMerge : *this; if (left.qEnd + 1 != right.qStart) { // throw new GATKException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); std::cerr << "Attempting to merge non-contiguous intervals: left = " + left.toString() + " right = " + right.toString() << std::endl; exit(1); } const int64_t nCombinedObs = left.nObservations + right.nObservations; const int64_t nCombinedErr = left.nErrors + right.nErrors; const int level = std::max(left.level, right.level) + 1; set subIntervals; subIntervals.insert(left); subIntervals.insert(right); QualInterval merged(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals); return merged; } double getPenalty(const int minInterestingQual) const { return calcPenalty(getErrorRate(), minInterestingQual); } /** * Calculate the penalty of this interval, given the overall error rate for the interval * * If the globalErrorRate is e, this value is: * * sum_i |log10(e_i) - log10(e)| * nObservations_i * * each the index i applies to all leaves of the tree accessible from this interval * (found recursively from subIntervals as necessary) * * @param globalErrorRate overall error rate in real space against which we calculate the penalty * @return the cost of approximating the bins in this interval with the globalErrorRate */ double calcPenalty(const double globalErrorRate, const int minInterestingQual) const { if (globalErrorRate == 0.0) // there were no observations, so there's no penalty return 0.0; if (subIntervals.empty()) { // this is leave node if (this->qEnd <= minInterestingQual) // It's free to merge up quality scores below the smallest interesting one return 0; else { return (std::abs(std::log10(getErrorRate()) - std::log10(globalErrorRate))) * nObservations; } } else { double sum = 0; for (const QualInterval interval : subIntervals) sum += interval.calcPenalty(globalErrorRate, minInterestingQual); return sum; } } bool operator<(const QualInterval& o) const { return qStart < o.qStart; } QualInterval& operator=(const QualInterval& o) { if (this == &o) return *this; init(o.qStart, o.qEnd, o.nObservations, o.nErrors, o.level, o.fixedQual); mergeOrder = o.mergeOrder; subIntervals.clear(); for (auto& val : o.subIntervals) subIntervals.insert(val); return *this; } }; /** * Inputs to the QualQuantizer */ const int nLevels, minInterestingQual; vector& nObservationsPerQual; QualQuantizer(vector& _nObservationsPerQual, const int _nLevels, const int _minInterestingQual) : nObservationsPerQual(_nObservationsPerQual), nLevels(_nLevels), minInterestingQual(_minInterestingQual) { quantize(); } /** Sorted set of qual intervals. * * After quantize() this data structure contains only the top-level qual intervals */ set quantizedIntervals; /** * Represents a contiguous interval of quality scores. * * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2 */ void getOriginalToQuantizedMap(vector& quantMap) { quantMap.resize(getNQualsInHistogram(), UINT8_MAX); for (auto& interval : quantizedIntervals) { for (int q = interval.qStart; q <= interval.qEnd; q++) { quantMap[q] = interval.getQual(); } } // if (Collections.min(map) == Byte.MIN_VALUE) throw new GATKException("quantized quality score map contains an un-initialized value"); } int getNQualsInHistogram() { return nObservationsPerQual.size(); } /** * Main method for computing the quantization intervals. * * Invoked in the constructor after all input variables are initialized. Walks * over the inputs and builds the min. penalty forest of intervals with exactly nLevel * root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed * to find the optimal combination. * * TODO: develop a smarter algorithm * * @return the forest of intervals with size == nLevels */ void quantize() { // create intervals for each qual individually auto& intervals = quantizedIntervals; for (int qStart = 0; qStart < getNQualsInHistogram(); qStart++) { const int64_t nObs = nObservationsPerQual.at(qStart); const double errorRate = QualityUtils::qualToErrorProb((uint8_t)qStart); const double nErrors = nObs * errorRate; const QualInterval qi(qStart, qStart, nObs, (int)std::floor(nErrors), 0, (uint8_t)qStart); intervals.insert(qi); } // greedy algorithm: // while ( n intervals >= nLevels ): // find intervals to merge with least penalty // merge it while (intervals.size() > nLevels) { mergeLowestPenaltyIntervals(intervals); } } /** * Helper function that finds and merges together the lowest penalty pair of intervals * @param intervals */ void mergeLowestPenaltyIntervals(set& intervals) { // setup the iterators auto it1 = intervals.begin(); auto it1p = intervals.begin(); ++it1p; // skip one // walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty QualInterval minMerge; // if (logger.isDebugEnabled()) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size()); int lastMergeOrder = 0; while (it1p != intervals.end()) { const QualInterval& left = *it1; const QualInterval& right = *it1p; const QualInterval merged = left.merge(right); lastMergeOrder = std::max(std::max(lastMergeOrder, left.mergeOrder), right.mergeOrder); if (minMerge.qStart == -1 || (merged.getPenalty(minInterestingQual) < minMerge.getPenalty(minInterestingQual))) { // if (logger.isDebugEnabled()) logger.debug(" Updating merge " + minMerge); minMerge = merged; // merge two bins that when merged incur the lowest "penalty" } ++it1; ++it1p; } // now actually go ahead and merge the minMerge pair // if (logger.isDebugEnabled()) logger.debug(" => const min merge " + minMerge); minMerge.mergeOrder = lastMergeOrder + 1; // intervals.removeAll(minMerge.subIntervals); for (auto &itr : minMerge.subIntervals) { intervals.erase(itr); } intervals.insert(minMerge); // if (logger.isDebugEnabled()) logger.debug("updated intervals: " + intervals); } };