314 lines
11 KiB
C
314 lines
11 KiB
C
/* The MIT License
|
|
|
|
Copyright (C) 2010, 2013-2014, 2020 Genome Research Ltd.
|
|
Copyright (C) 2011 Attractive Chaos <attractor@live.co.uk>
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining
|
|
a copy of this software and associated documentation files (the
|
|
"Software"), to deal in the Software without restriction, including
|
|
without limitation the rights to use, copy, modify, merge, publish,
|
|
distribute, sublicense, and/or sell copies of the Software, and to
|
|
permit persons to whom the Software is furnished to do so, subject to
|
|
the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be
|
|
included in all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
SOFTWARE.
|
|
*/
|
|
|
|
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
|
|
#include <config.h>
|
|
|
|
#include <math.h>
|
|
#include <stdlib.h>
|
|
#include <stdint.h>
|
|
#include "htslib/kfunc.h"
|
|
|
|
/* Log gamma function
|
|
* \log{\Gamma(z)}
|
|
* AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
|
|
*/
|
|
double kf_lgamma(double z)
|
|
{
|
|
double x = 0;
|
|
x += 0.1659470187408462e-06 / (z+7);
|
|
x += 0.9934937113930748e-05 / (z+6);
|
|
x -= 0.1385710331296526 / (z+5);
|
|
x += 12.50734324009056 / (z+4);
|
|
x -= 176.6150291498386 / (z+3);
|
|
x += 771.3234287757674 / (z+2);
|
|
x -= 1259.139216722289 / (z+1);
|
|
x += 676.5203681218835 / z;
|
|
x += 0.9999999999995183;
|
|
return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
|
|
}
|
|
|
|
/* complementary error function
|
|
* \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
|
|
* AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
|
|
*/
|
|
double kf_erfc(double x)
|
|
{
|
|
const double p0 = 220.2068679123761;
|
|
const double p1 = 221.2135961699311;
|
|
const double p2 = 112.0792914978709;
|
|
const double p3 = 33.912866078383;
|
|
const double p4 = 6.37396220353165;
|
|
const double p5 = .7003830644436881;
|
|
const double p6 = .03526249659989109;
|
|
const double q0 = 440.4137358247522;
|
|
const double q1 = 793.8265125199484;
|
|
const double q2 = 637.3336333788311;
|
|
const double q3 = 296.5642487796737;
|
|
const double q4 = 86.78073220294608;
|
|
const double q5 = 16.06417757920695;
|
|
const double q6 = 1.755667163182642;
|
|
const double q7 = .08838834764831844;
|
|
double expntl, z, p;
|
|
z = fabs(x) * M_SQRT2;
|
|
if (z > 37.) return x > 0.? 0. : 2.;
|
|
expntl = exp(z * z * - .5);
|
|
if (z < 10. / M_SQRT2) // for small z
|
|
p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
|
|
/ (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
|
|
else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
|
|
return x > 0.? 2. * p : 2. * (1. - p);
|
|
}
|
|
|
|
/* The following computes regularized incomplete gamma functions.
|
|
* Formulas are taken from Wiki, with additional input from Numerical
|
|
* Recipes in C (for modified Lentz's algorithm) and AS245
|
|
* (http://lib.stat.cmu.edu/apstat/245).
|
|
*
|
|
* A good online calculator is available at:
|
|
*
|
|
* http://www.danielsoper.com/statcalc/calc23.aspx
|
|
*
|
|
* It calculates upper incomplete gamma function, which equals
|
|
* kf_gammaq(s,z)*tgamma(s).
|
|
*/
|
|
|
|
#define KF_GAMMA_EPS 1e-14
|
|
#define KF_TINY 1e-290
|
|
|
|
// regularized lower incomplete gamma function, by series expansion
|
|
static double _kf_gammap(double s, double z)
|
|
{
|
|
double sum, x;
|
|
int k;
|
|
for (k = 1, sum = x = 1.; k < 100; ++k) {
|
|
sum += (x *= z / (s + k));
|
|
if (x / sum < KF_GAMMA_EPS) break;
|
|
}
|
|
return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum));
|
|
}
|
|
// regularized upper incomplete gamma function, by continued fraction
|
|
static double _kf_gammaq(double s, double z)
|
|
{
|
|
int j;
|
|
double C, D, f;
|
|
f = 1. + z - s; C = f; D = 0.;
|
|
// Modified Lentz's algorithm for computing continued fraction
|
|
// See Numerical Recipes in C, 2nd edition, section 5.2
|
|
for (j = 1; j < 100; ++j) {
|
|
double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
|
|
D = b + a * D;
|
|
if (D < KF_TINY) D = KF_TINY;
|
|
C = b + a / C;
|
|
if (C < KF_TINY) C = KF_TINY;
|
|
D = 1. / D;
|
|
d = C * D;
|
|
f *= d;
|
|
if (fabs(d - 1.) < KF_GAMMA_EPS) break;
|
|
}
|
|
return exp(s * log(z) - z - kf_lgamma(s) - log(f));
|
|
}
|
|
|
|
double kf_gammap(double s, double z)
|
|
{
|
|
return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);
|
|
}
|
|
|
|
double kf_gammaq(double s, double z)
|
|
{
|
|
return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z);
|
|
}
|
|
|
|
/* Regularized incomplete beta function. The method is taken from
|
|
* Numerical Recipe in C, 2nd edition, section 6.4. The following web
|
|
* page calculates the incomplete beta function, which equals
|
|
* kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
|
|
*
|
|
* http://www.danielsoper.com/statcalc/calc36.aspx
|
|
*/
|
|
static double kf_betai_aux(double a, double b, double x)
|
|
{
|
|
double C, D, f;
|
|
int j;
|
|
if (x == 0.) return 0.;
|
|
if (x == 1.) return 1.;
|
|
f = 1.; C = f; D = 0.;
|
|
// Modified Lentz's algorithm for computing continued fraction
|
|
for (j = 1; j < 200; ++j) {
|
|
double aa, d;
|
|
int m = j>>1;
|
|
aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
|
|
: m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
|
|
D = 1. + aa * D;
|
|
if (D < KF_TINY) D = KF_TINY;
|
|
C = 1. + aa / C;
|
|
if (C < KF_TINY) C = KF_TINY;
|
|
D = 1. / D;
|
|
d = C * D;
|
|
f *= d;
|
|
if (fabs(d - 1.) < KF_GAMMA_EPS) break;
|
|
}
|
|
return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
|
|
}
|
|
double kf_betai(double a, double b, double x)
|
|
{
|
|
return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x);
|
|
}
|
|
|
|
#ifdef KF_MAIN
|
|
#include <stdio.h>
|
|
int main(int argc, char *argv[])
|
|
{
|
|
double x = 5.5, y = 3;
|
|
double a, b;
|
|
printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
|
|
printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));
|
|
a = 2; b = 2; x = 0.5;
|
|
printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
|
|
// log\binom{n}{k}
|
|
static double lbinom(int n, int k)
|
|
{
|
|
if (k == 0 || n == k) return 0;
|
|
return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
|
|
}
|
|
|
|
// n11 n12 | n1_
|
|
// n21 n22 | n2_
|
|
//-----------+----
|
|
// n_1 n_2 | n
|
|
|
|
// hypergeometric distribution
|
|
static double hypergeo(int n11, int n1_, int n_1, int n)
|
|
{
|
|
return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1));
|
|
}
|
|
|
|
typedef struct {
|
|
int n11, n1_, n_1, n;
|
|
double p;
|
|
} hgacc_t;
|
|
|
|
// incremental version of hypergenometric distribution
|
|
static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux)
|
|
{
|
|
if (n1_ || n_1 || n) {
|
|
aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n;
|
|
} else { // then only n11 changed; the rest fixed
|
|
if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) {
|
|
if (n11 == aux->n11 + 1) { // incremental
|
|
aux->p *= (double)(aux->n1_ - aux->n11) / n11
|
|
* (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1);
|
|
aux->n11 = n11;
|
|
return aux->p;
|
|
}
|
|
if (n11 == aux->n11 - 1) { // incremental
|
|
aux->p *= (double)aux->n11 / (aux->n1_ - n11)
|
|
* (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11);
|
|
aux->n11 = n11;
|
|
return aux->p;
|
|
}
|
|
}
|
|
aux->n11 = n11;
|
|
}
|
|
aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n);
|
|
return aux->p;
|
|
}
|
|
|
|
double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two)
|
|
{
|
|
int i, j, max, min;
|
|
double p, q, left, right;
|
|
hgacc_t aux;
|
|
int n1_, n_1, n;
|
|
|
|
n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n
|
|
max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail
|
|
min = n1_ + n_1 - n; // not sure why n11-n22 is used instead of min(n_1,n1_)
|
|
if (min < 0) min = 0; // min n11, for left tail
|
|
*two = *_left = *_right = 1.;
|
|
if (min == max) return 1.; // no need to do test
|
|
q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table
|
|
|
|
if (q == 0.0) {
|
|
/*
|
|
If here, the calculated probablility is so small it can't be stored
|
|
in a double, which is possible when the table contains fairly large
|
|
numbers. If this happens, most of the calculation can be skipped
|
|
as 'left', 'right' and '*two' will be (to a good approximation) 0.0.
|
|
The returned values '*_left' and '*_right' depend on which side
|
|
of the hypergeometric PDF 'n11' sits. This can be found by
|
|
comparing with the mode of the distribution, the formula for which
|
|
can be found at:
|
|
https://en.wikipedia.org/wiki/Hypergeometric_distribution
|
|
Note that in the comparison we multiply through by the denominator
|
|
of the mode (n + 2) to avoid a division.
|
|
*/
|
|
if ((int64_t) n11 * ((int64_t) n + 2) < ((int64_t) n_1 + 1) * ((int64_t) n1_ + 1)) {
|
|
// Peak to right of n11, so probability will be lower for all
|
|
// of the region from min to n11 and higher for at least some
|
|
// of the region from n11 to max; hence abs(i-n11) will be 0,
|
|
// abs(j-n11) will be > 0 and:
|
|
*_left = 0.0; *_right = 1.0; *two = 0.0;
|
|
return 0.0;
|
|
} else {
|
|
// Peak to left of n11, so probability will be lower for all
|
|
// of the region from n11 to max and higher for at least some
|
|
// of the region from min to n11; hence abs(i-n11) will be > 0,
|
|
// abs(j-n11) will be 0 and:
|
|
*_left = 1.0; *_right = 0.0; *two = 0.0;
|
|
return 0.0;
|
|
}
|
|
}
|
|
|
|
// left tail
|
|
p = hypergeo_acc(min, 0, 0, 0, &aux);
|
|
for (left = 0., i = min + 1; p < 0.99999999 * q && i<=max; ++i) // loop until underflow
|
|
left += p, p = hypergeo_acc(i, 0, 0, 0, &aux);
|
|
--i;
|
|
if (p < 1.00000001 * q) left += p;
|
|
else --i;
|
|
// right tail
|
|
p = hypergeo_acc(max, 0, 0, 0, &aux);
|
|
for (right = 0., j = max - 1; p < 0.99999999 * q && j>=0; --j) // loop until underflow
|
|
right += p, p = hypergeo_acc(j, 0, 0, 0, &aux);
|
|
++j;
|
|
if (p < 1.00000001 * q) right += p;
|
|
else ++j;
|
|
// two-tail
|
|
*two = left + right;
|
|
if (*two > 1.) *two = 1.;
|
|
// adjust left and right
|
|
if (abs(i - n11) < abs(j - n11)) right = 1. - left + q;
|
|
else left = 1.0 - right + q;
|
|
*_left = left; *_right = right;
|
|
return q;
|
|
}
|