FastBQSR/src/bqsr/recal_datum.h

253 lines
11 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*
Description: Per-covariate data recorded during BQSR (Base Quality Score
             Recalibration) computation.
Copyright : All rights reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/24
*/
#pragma once
#include <float.h>   // DBL_MAX
#include <limits.h>  // INT_MAX (used by MAX_NUMBER_OF_OBSERVATIONS)
#include <stdint.h>

#include <algorithm>  // std::min
#include <cmath>      // log10, std::round, std::isinf, std::isnan
#include <cstdlib>    // std::abs(int)

#include "qual_utils.h"
#include "util/math/math_utils.h"
#include "util/math/normal_dist.h"
/**
* The container for the 4-tuple
*
* ( reported quality, empirical quality, num observations, num mismatches/errors )
*
* for a given set of covariates.
*
*/
/**
 * The container for the 4-tuple
 *
 *   ( reported quality, empirical quality, num observations, num mismatches/errors )
 *
 * for a given set of covariates.
 *
 * C++ port of GATK's RecalDatum. Call StaticInit() once before any call that
 * reads logPriorCache (i.e. before getEmpiricalQuality / calcEmpiricalQuality).
 */
struct RecalDatum {
    // Hard ceiling on any recalibrated quality score we emit (SAMUtils.MAX_PHRED_SCORE).
    static constexpr uint8_t MAX_RECALIBRATED_Q_SCORE = 93; // SAMUtils.MAX_PHRED_SCORE;

    // Sentinel stored in empiricalQuality meaning "cache is stale, recompute lazily".
    static constexpr int UNINITIALIZED_EMPIRICAL_QUALITY = -1;

    // Internal scale factor applied to numMismatches; see the field comment below.
    // NOTE(review): the field comment says the validated value is 10000.0, but the
    // constant here is 100000.0 — confirm which one matches the reference results.
    static constexpr double MULTIPLIER = 100000.0; // See discussion in numMismatches about what the multiplier is.

    /**
     * Used when calculating empirical qualities to avoid division by zero
     * (one pseudo-error and one pseudo-non-error observation are added).
     */
    static constexpr int SMOOTHING_CONSTANT = 1;

    // Largest observation count we can pass to the (int-based) binomial code;
    // larger counts are scaled down in getLogBinomialLikelihood.
    static constexpr uint64_t MAX_NUMBER_OF_OBSERVATIONS = INT_MAX - 1;

    /**
     * Quals above this value should be capped down to this value (because they are too high)
     * in the base quality score recalibrator.
     */
    static constexpr uint8_t MAX_GATK_USABLE_Q_SCORE = 40;

    // Precomputed log prior P(empirical - reported = delta) for delta in [0, 40],
    // filled in by StaticInit().
    // NOTE(review): non-inline static data member declared in a header — it needs
    // exactly one out-of-line definition in a .cpp; verify that definition exists.
    static double logPriorCache[MAX_GATK_USABLE_Q_SCORE + 1];

    /**
     * One-time initialization of logPriorCache. Must run before the first
     * empirical-quality computation; not thread-safe against concurrent readers.
     */
    static void StaticInit() {
        // Normal distribution describing P(empiricalQuality - reportedQuality). Its mean is
        // zero because a priori we expect no systematic bias in the reported quality score.
        const double mean = 0.0;
        const double sigma = 0.5; // with these parameters, deltas can shift at most ~20 Q points
        const NormalDistribution gaussian(mean, sigma);
        for (int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++) {
            logPriorCache[i] = gaussian.logDensity(i);
        }
    }

    /**
     * Estimated reported quality score based on combined data's individual q-reporteds and
     * number of observations. The estimating occurs when collapsing counts across different
     * reported qualities (see combine()).
     */
    // The raw quality score reported by the sequencing machine.
    double reportedQuality = 0.0;

    /**
     * The empirical quality for datums that have been collapsed together (by read group and
     * reported quality, for example).
     *
     * This variable was historically a double, but {@link #bayesianEstimateOfEmpiricalQuality}
     * has always returned an integer qual score. Thus the type has been changed to integer in
     * February 2025 to highlight this implementation detail. It does not change the output.
     */
    // The computed (empirical) quality score; UNINITIALIZED_EMPIRICAL_QUALITY when stale.
    int empiricalQuality = 0;

    /**
     * Number of bases seen in total.
     */
    // Also doubles as a validity flag: a datum is meaningful only when
    // numObservations > 0 — zero means this covariate combination never occurred.
    uint64_t numObservations = 0;

    /**
     * Number of bases seen that didn't match the reference
     * (actually sum of the error weights - so not necessarily a whole number).
     * Stored with an internal multiplier to keep it closer to the floating-point sweet spot
     * and avoid numerical error (see https://github.com/broadinstitute/gatk/wiki/Numerical-errors ).
     * However, the value of the multiplier influences the results.
     * For example, you get different results for 1000.0 and 10000.0.
     * See MathUtilsUnitTest.testAddDoubles for a demonstration.
     * The value of the MULTIPLIER that we found to give consistent results insensitive to
     * sorting is 10000.0 (but see the review note on the MULTIPLIER constant above).
     */
    double numMismatches = 0.0;

    // Default-constructed datum is "empty/invalid" (numObservations == 0).
    RecalDatum() {}

    /**
     * Construct a datum with initial counts.
     * @param _numObservations total bases observed
     * @param _numMismatches   (unscaled) error weight; stored internally scaled by MULTIPLIER
     * @param _reportedQuality machine-reported Phred quality
     */
    RecalDatum(const uint64_t _numObservations, const double _numMismatches, const uint8_t _reportedQuality) {
        numObservations = _numObservations;
        numMismatches = _numMismatches * MULTIPLIER;
        reportedQuality = _reportedQuality;
        empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
    }

    /** Add raw counts; invalidates the cached empirical quality. */
    inline void increment(const uint64_t incObservations, const double incMismatches) {
        numObservations += incObservations;
        numMismatches += (incMismatches * MULTIPLIER); // the multiplier used to avoid underflow, or something like that.
        empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
    }

    /** Add raw counts and overwrite the reported quality; invalidates the cache. */
    inline void increment(const uint64_t incObservations, const double incMismatches, int baseQuality) {
        numObservations += incObservations;
        numMismatches += (incMismatches * MULTIPLIER); // the multiplier used to avoid underflow, or something like that.
        reportedQuality = baseQuality;
        empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
    }

    /**
     * Merge another datum's counts into this one, taking the other's reported quality.
     * Note: other.numMismatches is already scaled by MULTIPLIER, so it is added directly
     * (unlike the raw-count overloads above).
     */
    inline void increment(const RecalDatum& other) {
        numObservations += other.numObservations;
        numMismatches += other.numMismatches;
        reportedQuality = other.reportedQuality;
        empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
    }

    /**
     * Add in all of the data from other into this object, updating the reported quality from
     * the expected error rate implied by the two reported qualities.
     *
     * For example (the only example?), this method is called when collapsing the counts across
     * reported quality scores within the same read group.
     *
     * NOTE(review): if both datums are empty, getNumObservations() is 0 here and the
     * division / log10 below is ill-defined — confirm callers never combine two empty datums.
     *
     * @param other RecalDatum to combine
     */
    void combine(const RecalDatum& other) {
        // This is the *expected* (or theoretical) number of errors given the reported
        // qualities and the number of observations.
        double expectedNumErrors = this->calcExpectedErrors() + other.calcExpectedErrors();
        // Increment the counts (via the unscaled overload, so other's mismatches are
        // un-scaled first by getNumMismatches()).
        increment(other.getNumObservations(), other.getNumMismatches());
        // We use the theoretical count above to compute the "estimated" reported quality
        // after combining two datums with different reported qualities.
        reportedQuality = -10 * log10(expectedNumErrors / getNumObservations());
        empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
    }

    /**
     * Calculate the expected number of errors given the estimated Q reported and the number
     * of observations in this datum.
     *
     * @return a positive (potentially fractional) estimate of the number of errors
     */
    inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); }

    // Accessor that undoes the internal MULTIPLIER scaling.
    inline double getNumMismatches() const { return numMismatches / MULTIPLIER; }
    inline uint64_t getNumObservations() const { return numObservations; }
    inline double getReportedQuality() const { return reportedQuality; }

    /**
     * Computes the empirical quality of the datum, using the reported quality as the prior.
     * @see #getEmpiricalQuality(double) below.
     */
    double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); }

    /**
     * Computes the empirical base quality (roughly (num errors)/(num observations)) from the
     * counts stored in this datum. Lazily computed and cached; the cache is invalidated by
     * every mutator above.
     */
    double getEmpiricalQuality(const double priorQualityScore) {
        if (empiricalQuality == UNINITIALIZED_EMPIRICAL_QUALITY) {
            calcEmpiricalQuality(priorQualityScore);
        }
        return empiricalQuality;
    }

    /**
     * Calculate and cache the empirical quality score from mismatches and observations
     * (expensive operation).
     */
    void calcEmpiricalQuality(const double priorQualityScore) {
        // Smoothing is one error and one non-error observation.
        // The +0.5 before truncation rounds the (non-negative) fractional mismatch count to
        // the nearest integer, mirroring Java's Math.round.
        const uint64_t mismatches = (uint64_t)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; // TODO: why add 0.5?
        const uint64_t observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT;
        const int empiricalQual = bayesianEstimateOfEmpiricalQuality(observations, mismatches, priorQualityScore);
        // Cap at the maximum representable Phred score.
        empiricalQuality = std::min(empiricalQual, (int)MAX_RECALIBRATED_Q_SCORE);
    }

    /**
     * Compute the maximum a posteriori (MAP) estimate of the probability of sequencing error
     * under the following model.
     *
     * Let
     *   X = number of sequencing errors,
     *   n = number of observations,
     *   theta = probability of sequencing error as a quality score,
     *   theta_rep = probability of sequencing error reported by the sequencing machine as a
     *               quality score.
     *
     * The prior and the likelihood are:
     *
     *   P(theta|theta_rep) ~ Gaussian(theta - theta_rep| 0, 0.5) (Note this is done in log space)
     *   P(X|n, theta) ~ Binom(X|n,theta)
     *
     * Note the prior is equivalent to
     *
     *   P(theta|theta_rep) ~ Gaussian(theta | theta_rep, 0.5)
     *
     * TODO: use beta prior to do away with the search.
     *
     * @param nObservations n in the model above.
     * @param nErrors the observed number of sequencing errors.
     * @param priorMeanQualityScore the prior quality score, often the reported quality score.
     *
     * @return phredScale quality score that maximizes the posterior probability.
     */
    static int bayesianEstimateOfEmpiricalQuality(const uint64_t nObservations, const uint64_t nErrors, const double priorMeanQualityScore) {
        // Exhaustive grid search over every integer quality score bin; the array size is a
        // compile-time constant so this is a plain stack array.
        const int numQualityScoreBins = (QualityUtils::MAX_REASONABLE_Q_SCORE + 1);
        double logPosteriors[numQualityScoreBins];
        for (int i = 0; i < numQualityScoreBins; ++i) {
            logPosteriors[i] = getLogPrior(i, priorMeanQualityScore) + getLogBinomialLikelihood(i, nObservations, nErrors);
        }
        // The argmax bin index IS the MAP quality score.
        return MathUtils::maxElementIndex(logPosteriors, 0, numQualityScoreBins);
    }

    /**
     * Log prior of a candidate quality score given the prior (reported) quality: a zero-mean
     * Gaussian over their difference, looked up from the precomputed cache. The difference is
     * truncated toward zero by the int cast and clamped to MAX_GATK_USABLE_Q_SCORE so the
     * cache index stays in range.
     */
    static double getLogPrior(const double qualityScore, const double priorQualityScore) {
        const int difference = std::min(std::abs((int)(qualityScore - priorQualityScore)), (int)MAX_GATK_USABLE_Q_SCORE);
        return logPriorCache[difference];
    }

    /**
     * Given:
     *   - n, the number of observations,
     *   - k, the number of sequencing errors,
     *   - p, the probability of error, encoded as the quality score.
     *
     * Return the binomial probability Bin(k|n,p).
     *
     * The method handles the case when the counts of type long are higher than the maximum
     * allowed integer value, Integer.MAX_VALUE = (2^31)-1 ~= 2*10^9, since the library we use
     * for binomial probability expects integer input.
     */
    static double getLogBinomialLikelihood(const double qualityScore, uint64_t nObservations, uint64_t nErrors) {
        // No data -> no evidence: log-likelihood 0 (probability 1) for every bin.
        if (nObservations == 0)
            return 0.0;
        // The binomial code requires ints as input (because it does caching). This should
        // theoretically be fine because there is plenty of precision in 2^31 observations,
        // but we need to make sure that we don't have overflow before casting down to an int.
        if (nObservations > MAX_NUMBER_OF_OBSERVATIONS) {
            // We need to decrease nErrors by the same fraction that we are decreasing
            // nObservations, preserving the observed error rate.
            const double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations;
            nErrors = std::round((double)nErrors * fraction);
            nObservations = MAX_NUMBER_OF_OBSERVATIONS;
        }
        // This is just a straight binomial PDF.
        const double logLikelihood = MathUtils::logBinomialProbability((int)nObservations, (int)nErrors, QualityUtils::qualToErrorProb(qualityScore));
        // Guard against -inf/NaN from degenerate p (e.g. p == 0 with k > 0) so the argmax
        // search above never picks an invalid bin.
        return (std::isinf(logLikelihood) || std::isnan(logLikelihood)) ? -DBL_MAX : logLikelihood;
    }
};