// 2025-12-28 14:33:45 +08:00

/*
Description: Data that needs to be recorded during the BQSR (Base Quality
             Score Recalibration) computation.

Copyright : All right reserved by ICT

Author : Zhang Zhonghai

Date : 2025/12/24
*/
#pragma once

#include <stdint.h>
#include <float.h>
#include <limits.h>

#include <algorithm>
#include <cmath>

#include "qual_utils.h"
#include "util/math/normal_dist.h"
#include "util/math/math_utils.h"

/**
|
|
|
|
|
|
* The container for the 4-tuple
|
|
|
|
|
|
*
|
|
|
|
|
|
* ( reported quality, empirical quality, num observations, num mismatches/errors )
|
|
|
|
|
|
*
|
|
|
|
|
|
* for a given set of covariates.
|
|
|
|
|
|
*
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
struct RecalDatum {
|
|
|
|
|
|
static constexpr uint8_t MAX_RECALIBRATED_Q_SCORE = 93; // SAMUtils.MAX_PHRED_SCORE;
|
|
|
|
|
|
static constexpr int UNINITIALIZED_EMPIRICAL_QUALITY = -1;
|
|
|
|
|
|
static constexpr double MULTIPLIER = 100000.0; // See discussion in numMismatches about what the multiplier is.
|
|
|
|
|
|
/**
|
|
|
|
|
|
* used when calculating empirical qualities to avoid division by zero
|
|
|
|
|
|
*/
|
|
|
|
|
|
static constexpr int SMOOTHING_CONSTANT = 1;
|
|
|
|
|
|
|
|
|
|
|
|
static constexpr uint64_t MAX_NUMBER_OF_OBSERVATIONS = INT_MAX - 1;
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Quals above this value should be capped down to this value (because they are too high)
|
|
|
|
|
|
* in the base quality score recalibrator
|
|
|
|
|
|
*/
|
|
|
|
|
|
static constexpr uint8_t MAX_GATK_USABLE_Q_SCORE = 40;
|
|
|
|
|
|
|
|
|
|
|
|
static double logPriorCache[MAX_GATK_USABLE_Q_SCORE + 1];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void StaticInit() {
|
|
|
|
|
|
// normal distribution describing P(empiricalQuality - reportedQuality). Its mean is zero because a priori we expect
|
|
|
|
|
|
// no systematic bias in the reported quality score
|
|
|
|
|
|
const double mean = 0.0;
|
|
|
|
|
|
const double sigma = 0.5; // with these parameters, deltas can shift at most ~20 Q points
|
|
|
|
|
|
const NormalDistribution gaussian(mean, sigma);
|
|
|
|
|
|
for (int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++) {
|
|
|
|
|
|
logPriorCache[i] = gaussian.logDensity(i);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Estimated reported quality score based on combined data's individual q-reporteds and number of observations.
|
|
|
|
|
|
* The estimating occurs when collapsing counts across different reported qualities.
|
|
|
|
|
|
*/
|
|
|
|
|
|
// 测序仪给出的原始质量分数
|
|
|
|
|
|
double reportedQuality = 0.0;
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* The empirical quality for datums that have been collapsed together (by read group and reported quality, for example).
|
|
|
|
|
|
*
|
|
|
|
|
|
* This variable was historically a double, but {@link #bayesianEstimateOfEmpiricalQuality} has always returned an integer qual score.
|
|
|
|
|
|
* Thus the type has been changed to integer in February 2025 to highlight this implementation detail. It does not change the output.
|
|
|
|
|
|
*/
|
|
|
|
|
|
// 计算出来的真实质量分数
|
|
|
|
|
|
int empiricalQuality = 0;
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Number of bases seen in total
|
|
|
|
|
|
*/
|
|
|
|
|
|
// 这个字段也用来判断当前datum的有效性,只有到numObservations > 0时,这个datum才有效,因为如果为0,说明这个datum都没有出现过
|
|
|
|
|
|
uint64_t numObservations = 0;
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Number of bases seen that didn't match the reference
|
|
|
|
|
|
* (actually sum of the error weights - so not necessarily a whole number)
|
|
|
|
|
|
* Stored with an internal multiplier to keep it closer to the floating-point sweet spot and avoid numerical error
|
|
|
|
|
|
* (see https://github.com/broadinstitute/gatk/wiki/Numerical-errors ).
|
|
|
|
|
|
* However, the value of the multiplier influences the results.
|
|
|
|
|
|
* For example, you get different results for 1000.0 and 10000.0
|
|
|
|
|
|
* See MathUtilsUnitTest.testAddDoubles for a demonstration.
|
|
|
|
|
|
* The value of the MULTIPLIER that we found to give consistent results insensitive to sorting is 10000.0;
|
|
|
|
|
|
*/
|
|
|
|
|
|
double numMismatches = 0.0;
|
|
|
|
|
|
|
|
|
|
|
|
RecalDatum() {}
|
|
|
|
|
|
|
|
|
|
|
|
RecalDatum(const uint64_t _numObservations, const double _numMismatches, const uint8_t _reportedQuality) {
|
|
|
|
|
|
numObservations = _numObservations;
|
|
|
|
|
|
numMismatches = _numMismatches * MULTIPLIER;
|
|
|
|
|
|
reportedQuality = _reportedQuality;
|
|
|
|
|
|
empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-30 01:21:13 +08:00
|
|
|
|
inline void increment(const uint64_t incObservations, const double incMismatches) {
|
2025-12-28 14:33:45 +08:00
|
|
|
|
numObservations += incObservations;
|
|
|
|
|
|
numMismatches += (incMismatches * MULTIPLIER); // the multiplier used to avoid underflow, or something like that.
|
|
|
|
|
|
empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-30 01:21:13 +08:00
|
|
|
|
inline void increment(const uint64_t incObservations, const double incMismatches, int baseQuality) {
|
2025-12-28 14:33:45 +08:00
|
|
|
|
numObservations += incObservations;
|
|
|
|
|
|
numMismatches += (incMismatches * MULTIPLIER); // the multiplier used to avoid underflow, or something like that.
|
|
|
|
|
|
reportedQuality = baseQuality;
|
|
|
|
|
|
empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-30 12:48:59 +08:00
|
|
|
|
inline void increment(const RecalDatum& other) {
|
|
|
|
|
|
numObservations += other.numObservations;
|
|
|
|
|
|
numMismatches += other.numMismatches;
|
|
|
|
|
|
reportedQuality = other.reportedQuality;
|
|
|
|
|
|
empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-28 14:33:45 +08:00
|
|
|
|
/**
|
|
|
|
|
|
* Add in all of the data from other into this object, updating the reported quality from the expected
|
|
|
|
|
|
* error rate implied by the two reported qualities.
|
|
|
|
|
|
*
|
|
|
|
|
|
* For example (the only example?), this method is called when collapsing the counts across reported quality scores within
|
|
|
|
|
|
* the same read group.
|
|
|
|
|
|
*
|
|
|
|
|
|
* @param other RecalDatum to combine
|
|
|
|
|
|
*/
|
|
|
|
|
|
void combine(const RecalDatum& other) {
|
|
|
|
|
|
// this is the *expected* (or theoretical) number of errors given the reported qualities and the number of observations.
|
|
|
|
|
|
double expectedNumErrors = this->calcExpectedErrors() + other.calcExpectedErrors();
|
|
|
|
|
|
|
|
|
|
|
|
// increment the counts
|
|
|
|
|
|
increment(other.getNumObservations(), other.getNumMismatches());
|
|
|
|
|
|
|
|
|
|
|
|
// we use the theoretical count above to compute the "estimated" reported quality
|
|
|
|
|
|
// after combining two datums with different reported qualities.
|
|
|
|
|
|
reportedQuality = -10 * log10(expectedNumErrors / getNumObservations());
|
|
|
|
|
|
empiricalQuality = UNINITIALIZED_EMPIRICAL_QUALITY;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* calculate the expected number of errors given the estimated Q reported and the number of observations
|
|
|
|
|
|
* in this datum.
|
|
|
|
|
|
*
|
|
|
|
|
|
* @return a positive (potentially fractional) estimate of the number of errors
|
|
|
|
|
|
*/
|
|
|
|
|
|
inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); }
|
|
|
|
|
|
inline double getNumMismatches() const { return numMismatches / MULTIPLIER; }
|
|
|
|
|
|
inline uint64_t getNumObservations() const { return numObservations; }
|
|
|
|
|
|
inline double getReportedQuality() const { return reportedQuality; }
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Computes the empirical quality of the datum, using the reported quality as the prior.
|
|
|
|
|
|
* @see #getEmpiricalQuality(double) below.
|
|
|
|
|
|
*/
|
|
|
|
|
|
double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); }
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Computes the empirical base quality (roughly (num errors)/(num observations)) from the counts stored in this datum.
|
|
|
|
|
|
*/
|
|
|
|
|
|
double getEmpiricalQuality(const double priorQualityScore) {
|
|
|
|
|
|
if (empiricalQuality == UNINITIALIZED_EMPIRICAL_QUALITY) {
|
|
|
|
|
|
calcEmpiricalQuality(priorQualityScore);
|
|
|
|
|
|
}
|
|
|
|
|
|
return empiricalQuality;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Calculate and cache the empirical quality score from mismatches and observations (expensive operation)
|
|
|
|
|
|
*/
|
|
|
|
|
|
void calcEmpiricalQuality(const double priorQualityScore) {
|
|
|
|
|
|
// smoothing is one error and one non-error observation
|
|
|
|
|
|
const uint64_t mismatches = (uint64_t)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; // TODO: why add 0.5?
|
|
|
|
|
|
const uint64_t observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT;
|
|
|
|
|
|
|
|
|
|
|
|
const int empiricalQual = bayesianEstimateOfEmpiricalQuality(observations, mismatches, priorQualityScore);
|
|
|
|
|
|
|
|
|
|
|
|
empiricalQuality = std::min(empiricalQual, (int)MAX_RECALIBRATED_Q_SCORE);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Compute the maximum a posteriori (MAP) estimate of the probability of sequencing error under the following model.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Let
|
|
|
|
|
|
* X = number of sequencing errors,
|
|
|
|
|
|
* n = number of observations,
|
|
|
|
|
|
* theta = probability of sequencing error as a quality score,
|
|
|
|
|
|
* theta_rep = probability of sequencing error reported by the sequencing machine as a quality score.
|
|
|
|
|
|
*
|
|
|
|
|
|
* The prior and the likelihood are:
|
|
|
|
|
|
*
|
|
|
|
|
|
* P(theta|theta_rep) ~ Gaussian(theta - theta_rep| 0, 0.5) (Note this is done in log space)
|
|
|
|
|
|
* P(X|n, theta) ~ Binom(X|n,theta)
|
|
|
|
|
|
*
|
|
|
|
|
|
* Note the prior is equivalent to
|
|
|
|
|
|
*
|
|
|
|
|
|
* P(theta|theta_rep) ~ Gaussian(theta | theta_rep, 0.5)
|
|
|
|
|
|
*
|
|
|
|
|
|
* TODO: use beta prior to do away with the search.
|
|
|
|
|
|
*
|
|
|
|
|
|
* @param nObservations n in the model above.
|
|
|
|
|
|
* @param nErrors the observed number of sequencing errors.
|
|
|
|
|
|
* @param priorMeanQualityScore the prior quality score, often the reported quality score.
|
|
|
|
|
|
*
|
|
|
|
|
|
* @return phredScale quality score that maximizes the posterior probability.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static int bayesianEstimateOfEmpiricalQuality(const uint64_t nObservations, const uint64_t nErrors, const double priorMeanQualityScore) {
|
|
|
|
|
|
const int numQualityScoreBins = (QualityUtils::MAX_REASONABLE_Q_SCORE + 1);
|
|
|
|
|
|
|
|
|
|
|
|
double logPosteriors[numQualityScoreBins];
|
|
|
|
|
|
for (int i = 0; i < numQualityScoreBins; ++i) {
|
|
|
|
|
|
logPosteriors[i] = getLogPrior(i, priorMeanQualityScore) + getLogBinomialLikelihood(i, nObservations, nErrors);
|
|
|
|
|
|
}
|
|
|
|
|
|
return MathUtils::maxElementIndex(logPosteriors, 0, numQualityScoreBins);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static double getLogPrior(const double qualityScore, const double priorQualityScore) {
|
|
|
|
|
|
const int difference = std::min(std::abs((int)(qualityScore - priorQualityScore)), (int)MAX_GATK_USABLE_Q_SCORE);
|
|
|
|
|
|
return logPriorCache[difference];
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Given:
|
|
|
|
|
|
* - n, the number of observations,
|
|
|
|
|
|
* - k, the number of sequencing errors,
|
|
|
|
|
|
* - p, the probability of error, encoded as the quality score.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Return the binomial probability Bin(k|n,p).
|
|
|
|
|
|
*
|
|
|
|
|
|
* The method handles the case when the counts of type long are higher than the maximum allowed integer value,
|
|
|
|
|
|
* Integer.MAX_VALUE = (2^31)-1 ~= 2*10^9, since the library we use for binomial probability expects integer input.
|
|
|
|
|
|
*
|
|
|
|
|
|
*/
|
|
|
|
|
|
static double getLogBinomialLikelihood(const double qualityScore, uint64_t nObservations, uint64_t nErrors) {
|
|
|
|
|
|
if (nObservations == 0)
|
|
|
|
|
|
return 0.0;
|
|
|
|
|
|
|
|
|
|
|
|
// the binomial code requires ints as input (because it does caching). This should theoretically be fine because
|
|
|
|
|
|
// there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow
|
|
|
|
|
|
// before casting down to an int.
|
|
|
|
|
|
if (nObservations > MAX_NUMBER_OF_OBSERVATIONS) {
|
|
|
|
|
|
// we need to decrease nErrors by the same fraction that we are decreasing nObservations
|
|
|
|
|
|
const double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations;
|
|
|
|
|
|
nErrors = std::round((double)nErrors * fraction);
|
|
|
|
|
|
nObservations = MAX_NUMBER_OF_OBSERVATIONS;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// this is just a straight binomial PDF
|
|
|
|
|
|
const double logLikelihood = MathUtils::logBinomialProbability((int)nObservations, (int)nErrors, QualityUtils::qualToErrorProb(qualityScore));
|
|
|
|
|
|
return (std::isinf(logLikelihood) || std::isnan(logLikelihood)) ? -DBL_MAX : logLikelihood;
|
|
|
|
|
|
}
|
|
|
|
|
|
};