/* Description: bqsr计算过程中需要记录的一些数据 Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2025/12/24 */ #pragma once #include #include #include "qual_utils.h" #include "util/math/normal_dist.h" #include "util/math/math_utils.h" /** * The container for the 4-tuple * * ( reported quality, empirical quality, num observations, num mismatches/errors ) * * for a given set of covariates. * */ struct RecalDatum { static constexpr uint8_t MAX_RECALIBRATED_Q_SCORE = 93; // SAMUtils.MAX_PHRED_SCORE; static constexpr int UNINITIALIZED_EMPIRICAL_QUALITY = -1; static constexpr double MULTIPLIER = 100000.0; // See discussion in numMismatches about what the multiplier is. /** * used when calculating empirical qualities to avoid division by zero */ static constexpr int SMOOTHING_CONSTANT = 1; static constexpr uint64_t MAX_NUMBER_OF_OBSERVATIONS = INT_MAX - 1; /** * Quals above this value should be capped down to this value (because they are too high) * in the base quality score recalibrator */ static constexpr uint8_t MAX_GATK_USABLE_Q_SCORE = 40; static double logPriorCache[MAX_GATK_USABLE_Q_SCORE + 1]; static void StaticInit() { // normal distribution describing P(empiricalQuality - reportedQuality). Its mean is zero because a priori we expect // no systematic bias in the reported quality score const double mean = 0.0; const double sigma = 0.5; // with these parameters, deltas can shift at most ~20 Q points const NormalDistribution gaussian(mean, sigma); for (int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++) { logPriorCache[i] = gaussian.logDensity(i); } } /** * Estimated reported quality score based on combined data's individual q-reporteds and number of observations. * The estimating occurs when collapsing counts across different reported qualities. */ // 测序仪给出的原始质量分数 double reportedQuality = 0.0; /** * The empirical quality for datums that have been collapsed together (by read group and reported quality, for example). * * This variable was historically a double, but {@link #bayesianEstimateOfEmpiricalQuality} has always returned an integer qual score. * Thus the type has been changed to integer in February 2025 to highlight this implementation detail. It does not change the output. */ // 计算出来的真实质量分数 int empiricalQuality = 0; /** * Number of bases seen in total */ // 这个字段也用来判断当前datum的有效性,只有到numObservations > 0时,这个datum才有效,因为如果为0,说明这个datum都没有出现过 uint64_t numObservations = 0; /** * Number of bases seen that didn't match the reference * (actually sum of the error weights - so not necessarily a whole number) * Stored with an internal multiplier to keep it closer to the floating-point sweet spot and avoid numerical error * (see https://github.com/broadinstitute/gatk/wiki/Numerical-errors ). * However, the value of the multiplier influences the results. * For example, you get different results for 1000.0 and 10000.0 * See MathUtilsUnitTest.testAddDoubles for a demonstration. * The value of the MULTIPLIER that we found to give consistent results insensitive to sorting is 10000.0; */ double numMismatches = 0.0; RecalDatum() {} RecalDatum(const uint64_t _numObservations, const double _numMismatches, const uint8_t _reportedQuality) { numObservations = _numObservations; numMismatches = _numMismatches * MULTIPLIER; reportedQuality = _reportedQuality; empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY; } inline void increment(const uint64_t incObservations, const double incMismatches) { numObservations += incObservations; numMismatches += (incMismatches * MULTIPLIER); // the multiplier used to avoid underflow, or something like that. empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY; } inline void increment(const uint64_t incObservations, const double incMismatches, int baseQuality) { numObservations += incObservations; numMismatches += (incMismatches * MULTIPLIER); // the multiplier used to avoid underflow, or something like that. reportedQuality = baseQuality; empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY; } inline void increment(const RecalDatum& other) { numObservations += other.numObservations; numMismatches += other.numMismatches; reportedQuality = other.reportedQuality; empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY; } /** * Add in all of the data from other into this object, updating the reported quality from the expected * error rate implied by the two reported qualities. * * For example (the only example?), this method is called when collapsing the counts across reported quality scores within * the same read group. * * @param other RecalDatum to combine */ void combine(const RecalDatum& other) { // this is the *expected* (or theoretical) number of errors given the reported qualities and the number of observations. double expectedNumErrors = this->calcExpectedErrors() + other.calcExpectedErrors(); // increment the counts increment(other.getNumObservations(), other.getNumMismatches()); // we use the theoretical count above to compute the "estimated" reported quality // after combining two datums with different reported qualities. reportedQuality = -10 * log10(expectedNumErrors / getNumObservations()); empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY; } /** * calculate the expected number of errors given the estimated Q reported and the number of observations * in this datum. * * @return a positive (potentially fractional) estimate of the number of errors */ inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); } inline double getNumMismatches() const { return numMismatches / MULTIPLIER; } inline uint64_t getNumObservations() const { return numObservations; } inline double getReportedQuality() const { return reportedQuality; } /** * Computes the empirical quality of the datum, using the reported quality as the prior. * @see #getEmpiricalQuality(double) below. */ double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); } /** * Computes the empirical base quality (roughly (num errors)/(num observations)) from the counts stored in this datum. */ double getEmpiricalQuality(const double priorQualityScore) { if (empiricalQuality == UNINITIALIZED_EMPIRICAL_QUALITY) { calcEmpiricalQuality(priorQualityScore); } return empiricalQuality; } /** * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) */ void calcEmpiricalQuality(const double priorQualityScore) { // smoothing is one error and one non-error observation const uint64_t mismatches = (uint64_t)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; // TODO: why add 0.5? const uint64_t observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; const int empiricalQual = bayesianEstimateOfEmpiricalQuality(observations, mismatches, priorQualityScore); empiricalQuality = std::min(empiricalQual, (int)MAX_RECALIBRATED_Q_SCORE); } /** * Compute the maximum a posteriori (MAP) estimate of the probability of sequencing error under the following model. * * Let * X = number of sequencing errors, * n = number of observations, * theta = probability of sequencing error as a quality score, * theta_rep = probability of sequencing error reported by the sequencing machine as a quality score. * * The prior and the likelihood are: * * P(theta|theta_rep) ~ Gaussian(theta - theta_rep| 0, 0.5) (Note this is done in log space) * P(X|n, theta) ~ Binom(X|n,theta) * * Note the prior is equivalent to * * P(theta|theta_rep) ~ Gaussian(theta | theta_rep, 0.5) * * TODO: use beta prior to do away with the search. * * @param nObservations n in the model above. * @param nErrors the observed number of sequencing errors. * @param priorMeanQualityScore the prior quality score, often the reported quality score. * * @return phredScale quality score that maximizes the posterior probability. */ static int bayesianEstimateOfEmpiricalQuality(const uint64_t nObservations, const uint64_t nErrors, const double priorMeanQualityScore) { const int numQualityScoreBins = (QualityUtils::MAX_REASONABLE_Q_SCORE + 1); double logPosteriors[numQualityScoreBins]; for (int i = 0; i < numQualityScoreBins; ++i) { logPosteriors[i] = getLogPrior(i, priorMeanQualityScore) + getLogBinomialLikelihood(i, nObservations, nErrors); } return MathUtils::maxElementIndex(logPosteriors, 0, numQualityScoreBins); } static double getLogPrior(const double qualityScore, const double priorQualityScore) { const int difference = std::min(std::abs((int)(qualityScore - priorQualityScore)), (int)MAX_GATK_USABLE_Q_SCORE); return logPriorCache[difference]; } /** * Given: * - n, the number of observations, * - k, the number of sequencing errors, * - p, the probability of error, encoded as the quality score. * * Return the binomial probability Bin(k|n,p). * * The method handles the case when the counts of type long are higher than the maximum allowed integer value, * Integer.MAX_VALUE = (2^31)-1 ~= 2*10^9, since the library we use for binomial probability expects integer input. * */ static double getLogBinomialLikelihood(const double qualityScore, uint64_t nObservations, uint64_t nErrors) { if (nObservations == 0) return 0.0; // the binomial code requires ints as input (because it does caching). This should theoretically be fine because // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow // before casting down to an int. if (nObservations > MAX_NUMBER_OF_OBSERVATIONS) { // we need to decrease nErrors by the same fraction that we are decreasing nObservations const double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations; nErrors = std::round((double)nErrors * fraction); nObservations = MAX_NUMBER_OF_OBSERVATIONS; } // this is just a straight binomial PDF const double logLikelihood = MathUtils::logBinomialProbability((int)nObservations, (int)nErrors, QualityUtils::qualToErrorProb(qualityScore)); return (std::isinf(logLikelihood) || std::isnan(logLikelihood)) ? -DBL_MAX : logLikelihood; } };