3
"""Basic statistics utility functions.
5
The implementation of Student's t distribution inverse CDF was ported to Python
6
from JSci. The parameters are set to only be accurate to approximately 5
9
The JSci port comes frist. "New" code is near the bottom.
12
http://jsci.sourceforge.net/
13
Original Author: Mark Hale
14
Original Licence: LGPL
19
# Relative machine precision.
21
# The smallest positive floating-point number such that 1/xminin is machine representable.
23
# Square root of 2 * pi
24
SQRT2PI = 2.5066282746310005024157652848110452530069867406099
25
LOGSQRT2PI = math.log(SQRT2PI);
26
# Rough estimate of the fourth root of logGamma_xBig
29
# lower value = higher precision
31
def betaFraction(x, p, q):
32
"""Evaluates of continued fraction part of incomplete beta function.
34
Based on an idea from Numerical Recipes (W.H. Press et al, 1992)."""
39
h = 1.0-sum_pq*x/p_plus;
48
while m <= MAX_ITERATIONS and abs(delta-1.0) > PRECISION:
52
d=m*(q-m)*x/((p_minus+m2)*(p+m2))
54
if abs(h) < XMININ: h=XMININ
57
if abs(c) < XMININ: c=XMININ
61
d = -(p+m)*(sum_pq+m)*x/((p+m2)*(p_plus+m2))
63
if abs(h) < XMININ: h=XMININ;
66
if abs(c) < XMININ: c = XMININ
73
# The largest argument for which <code>logGamma(x)</code> is representable in the machine.
74
LOG_GAMMA_X_MAX_VALUE = 2.55e305
75
# Log Gamma related constants
76
lg_d1 = -0.5772156649015328605195174;
77
lg_d2 = 0.4227843350984671393993777;
78
lg_d4 = 1.791759469228055000094023;
79
lg_p1 = [ 4.945235359296727046734888,
80
201.8112620856775083915565, 2290.838373831346393026739,
81
11319.67205903380828685045, 28557.24635671635335736389,
82
38484.96228443793359990269, 26377.48787624195437963534,
83
7225.813979700288197698961 ]
84
lg_q1 = [ 67.48212550303777196073036,
85
1113.332393857199323513008, 7738.757056935398733233834,
86
27639.87074403340708898585, 54993.10206226157329794414,
87
61611.22180066002127833352, 36351.27591501940507276287,
88
8785.536302431013170870835 ]
89
lg_p2 = [ 4.974607845568932035012064,
90
542.4138599891070494101986, 15506.93864978364947665077,
91
184793.2904445632425417223, 1088204.76946882876749847,
92
3338152.967987029735917223, 5106661.678927352456275255,
93
3074109.054850539556250927 ]
94
lg_q2 = [ 183.0328399370592604055942,
95
7765.049321445005871323047, 133190.3827966074194402448,
96
1136705.821321969608938755, 5267964.117437946917577538,
97
13467014.54311101692290052, 17827365.30353274213975932,
98
9533095.591844353613395747 ]
99
lg_p4 = [ 14745.02166059939948905062,
100
2426813.369486704502836312, 121475557.4045093227939592,
101
2663432449.630976949898078, 29403789566.34553899906876,
102
170266573776.5398868392998, 492612579337.743088758812,
103
560625185622.3951465078242 ]
104
lg_q4 = [ 2690.530175870899333379843,
105
639388.5654300092398984238, 41355999.30241388052042842,
106
1120872109.61614794137657, 14886137286.78813811542398,
107
101680358627.2438228077304, 341747634550.7377132798597,
108
446315818741.9713286462081 ]
109
lg_c = [ -0.001910444077728,8.4171387781295e-4,
110
-5.952379913043012e-4, 7.93650793500350248e-4,
111
-0.002777777777777681622553, 0.08333333333333333331554247,
114
"""The natural logarithm of the gamma function.
115
Based on public domain NETLIB (Fortran) code by W. J. Cody and L. Stoltz<BR>
116
Applied Mathematics Division<BR>
117
Argonne National Laboratory<BR>
118
Argonne, IL 60439<BR>
122
<LI>W. J. Cody and K. E. Hillstrom, 'Chebyshev Approximations for the Natural Logarithm of the Gamma Function,' Math. Comp. 21, 1967, pp. 198-203.
123
<LI>K. E. Hillstrom, ANL/AMD Program ANLC366S, DGAMMA/DLGAMA, May, 1969.
124
<LI>Hart, Et. Al., Computer Approximations, Wiley and sons, New York, 1968.
126
From the original documentation:
128
This routine calculates the LOG(GAMMA) function for a positive real argument X.
129
Computation is based on an algorithm outlined in references 1 and 2.
130
The program uses rational functions that theoretically approximate LOG(GAMMA)
131
to at least 18 significant decimal digits. The approximation for X > 12 is from reference 3,
132
while approximations for X < 12.0 are similar to those in reference 1, but are unpublished.
133
The accuracy achieved depends on the arithmetic system, the compiler, the intrinsic functions,
134
and proper selection of the machine-dependent constants.
137
The program returns the value XINF for X .LE. 0.0 or when overflow would occur.
138
The computation is believed to be free of underflow and overflow."""
141
if y < 0.0 or y > LOG_GAMMA_X_MAX_VALUE:
156
if y <= 0.5 or y >= pnt68:
160
xnum = xnum * xm1 + lg_p1[i];
161
xden = xden * xm1 + lg_q1[i];
162
return corr + xm1 * (lg_d1 + xm1 * (xnum / xden));
168
xnum = xnum * xm2 + lg_p2[i];
169
xden = xden * xm2 + lg_q2[i];
170
return corr + xm2 * (lg_d2 + xm2 * (xnum / xden));
177
xnum = xnum * xm2 + lg_p2[i];
178
xden = xden * xm2 + lg_q2[i];
179
return xm2 * (lg_d2 + xm2 * (xnum / xden));
186
xnum = xnum * xm4 + lg_p4[i];
187
xden = xden * xm4 + lg_q4[i];
188
return lg_d4 + xm4 * (xnum / xden);
190
assert y <= lg_frtbig
194
res = res / ysq + lg_c[i];
197
res = res + LOGSQRT2PI - 0.5 * corr;
198
res += y * (corr - 1.0);
203
"""The natural logarithm of the beta function."""
206
if p <= 0 or q <= 0 or p + q > LOG_GAMMA_X_MAX_VALUE:
209
return logGamma(p)+logGamma(q)-logGamma(p+q)
212
def incompleteBeta(x, p, q):
213
"""Incomplete beta function.
215
The computation is based on formulas from Numerical Recipes, Chapter 6.4 (W.H. Press et al, 1992).
216
Ported from Java: http://jsci.sourceforge.net/"""
222
# Range checks to avoid numerical stability issues?
227
if p <= 0.0 or q <= 0.0 or (p+q) > LOG_GAMMA_X_MAX_VALUE:
230
beta_gam = math.exp(-logBeta(p,q) + p*math.log(x) + q*math.log(1.0-x))
231
if x < (p+1.0)/(p+q+2.0):
232
return beta_gam*betaFraction(x, p, q)/p
234
return 1.0-(beta_gam*betaFraction(1.0-x,q,p)/q)
238
MAX_ITERATIONS = 10000
239
def findRoot(value, x_low, x_high, function):
240
"""Use the bisection method to find root such that function(root) == value."""
242
guess = (x_high + x_low) / 2.0
244
difference = v - value
246
while abs(difference) > ACCURACY and i < MAX_ITERATIONS:
253
guess = (x_high + x_low) / 2.0
255
difference = v - value
260
def StudentTCDF(degree_of_freedom, X):
261
"""Student's T distribution CDF. Returns probability that a value x < X.
263
Ported from Java: http://jsci.sourceforge.net/"""
265
A = 0.5 * incompleteBeta(degree_of_freedom/(degree_of_freedom+X*X), 0.5*degree_of_freedom, 0.5)
271
def InverseStudentT(degree_of_freedom, probability):
272
"""Inverse of Student's T distribution CDF. Returns the value x such that CDF(x) = probability.
274
Ported from Java: http://jsci.sourceforge.net/
276
This is not the best algorithm in the world. SciPy has a Fortran version
277
(see special.stdtrit):
278
http://svn.scipy.org/svn/scipy/trunk/scipy/stats/distributions.py
279
http://svn.scipy.org/svn/scipy/trunk/scipy/special/cdflib/cdft.f
281
Very detailed information:
282
http://www.maths.ox.ac.uk/~shaww/finpapers/tdist.pdf
285
assert 0 <= probability <= 1
291
if probability == 0.5:
295
return StudentTCDF(degree_of_freedom, x)
297
return findRoot(probability, -10**4, 10**4, f)
300
def tinv(p, degree_of_freedom):
301
"""Similar to the TINV function in Excel
303
p: 1-confidence (eg. 0.05 = 95% confidence)"""
307
return InverseStudentT(degree_of_freedom, (1+confidence)/2.0)
310
def memoize(function):
313
if args not in cache:
314
cache[args] = function(*args)
318
# Cache tinv results, since we typically call it with the same args over and over
319
cached_tinv = memoize(tinv)
322
def stats(r, confidence_interval=0.05):
323
"""Returns statistics about a sequence of numbers.
325
By default it computes the 95% confidence interval.
327
Returns (average, median, standard deviation, min, max, confidence interval)"""
330
average = total/float(len(r))
331
sum_deviation_squared = sum([(i-average)**2 for i in r])
332
standard_deviation = math.sqrt(sum_deviation_squared/(len(r)-1 or 1))
338
# See: http://davidmlane.com/hyperstat/
339
# confidence_95 = 1.959963984540051 * standard_deviation / math.sqrt(len(r))
340
# We must estimate both using the t distribution:
341
# http://davidmlane.com/hyperstat/B7483.html
343
s_m = standard_deviation / math.sqrt(len(r))
344
# Degrees of freedom = n-1
345
# t = tinv(0.05, degrees_of_freedom)
346
# confidence = +/- t * s_m
347
confidence = cached_tinv(confidence_interval, len(r)-1) * s_m
348
return average, median, standard_deviation, minimum, maximum, confidence