~sahana-eden-short-projects/sahana-eden/staging

Viewing changes to modules/s3/pyvttbl/stats/jsci.py

Committer: Fran Boon
Date: 2011-10-11 09:51:52 UTC
mfrom: (185.15.273 vita)
Revision ID: fran@aidiq.com-20111011095152-zmzxneuci14yti1w

merge nursix: mandatory last_name as conditional validator instead of DB constraint, Add pyvttbl (contingency table toolkit), S3Cube method handler

files added:
modules/s3/pyvttbl

modules/s3/pyvttbl/LICENSE.txt

modules/s3/pyvttbl/README.txt

modules/s3/pyvttbl/SimpleHTML.py

modules/s3/pyvttbl/__init__.py

modules/s3/pyvttbl/anova.py

modules/s3/pyvttbl/data_restructurer.py

modules/s3/pyvttbl/dictset.py

modules/s3/pyvttbl/guided_data_restructurer.py

modules/s3/pyvttbl/pystaggrelite3.py

modules/s3/pyvttbl/pyvttbl.py

modules/s3/pyvttbl/qsturng.py

modules/s3/pyvttbl/stats

modules/s3/pyvttbl/stats/__init__.py

modules/s3/pyvttbl/stats/jsci.py

modules/s3/pyvttbl/stats/ols.py

modules/s3/pyvttbl/stats/pstat.py

modules/s3/pyvttbl/stats/stats.py

modules/s3/pyvttbl/stats/stats_test.py

modules/s3/pyvttbl/test

modules/s3/pyvttbl/test/__init__.py

modules/s3/pyvttbl/test/error~subjectXtimeofdayXcourseXmodel_MISSING.csv

modules/s3/pyvttbl/test/suppression~subjectXgroupXageXcycleXphase.csv

modules/s3/pyvttbl/test/test__dictset.py

modules/s3/pyvttbl/test/test__pystaggrelite3.py

modules/s3/pyvttbl/test/test__pyvttbl.py

modules/s3/pyvttbl/test/test__pyvttbl_performance.py

modules/s3/pyvttbl/test/words~ageXcondition.csv

modules/s3/pyvttbl/test_stats.py

modules/s3/pyvttbl/texttable.py

modules/s3/pyvttbl/texttable_rl.py

modules/s3/s3cube.py

files modified:
VERSION

models/00_utils.py

models/04_pr.py

modules/s3/__init__.py

Show diffs side-by-side

added added

removed removed

modules/s3/pyvttbl/stats/jsci.py

#!/usr/bin/python

"""Basic statistics utility functions.

The implementation of Student's t distribution inverse CDF was ported to Python

from JSci. The parameters are set to only be accurate to approximately 5

decimal places.

The JSci port comes frist. "New" code is near the bottom.

JSci information:

http://jsci.sourceforge.net/

Original Author: Mark Hale

Original Licence: LGPL

"""

import math

# Relative machine precision.

EPS = 2.22e-16

# The smallest positive floating-point number such that 1/xminin is machine representable.

XMININ = 2.23e-308

# Square root of 2 * pi

SQRT2PI = 2.5066282746310005024157652848110452530069867406099

LOGSQRT2PI = math.log(SQRT2PI);

# Rough estimate of the fourth root of logGamma_xBig

lg_frtbig = 2.25e76

pnt68 = 0.6796875

# lower value = higher precision

PRECISION = 4.0*EPS

def betaFraction(x, p, q):

"""Evaluates of continued fraction part of incomplete beta function.

Based on an idea from Numerical Recipes (W.H. Press et al, 1992)."""

sum_pq = p + q

p_plus = p + 1.0

p_minus = p - 1.0

h = 1.0-sum_pq*x/p_plus;

if abs(h) < XMININ:

h = XMININ

h = 1.0/h

frac = h

m = 1

delta = 0.0

c = 1.0

while m <= MAX_ITERATIONS and abs(delta-1.0) > PRECISION:

m2 = 2*m

# even index for d

d=m*(q-m)*x/((p_minus+m2)*(p+m2))

h=1.0+d*h

if abs(h) < XMININ: h=XMININ

h=1.0/h;

c=1.0+d/c;

if abs(c) < XMININ: c=XMININ

frac *= h*c;

# odd index for d

d = -(p+m)*(sum_pq+m)*x/((p+m2)*(p_plus+m2))

h=1.0+d*h

if abs(h) < XMININ: h=XMININ;

h=1.0/h

c=1.0+d/c

if abs(c) < XMININ: c = XMININ

delta=h*c

frac *= delta

m += 1

return frac

# The largest argument for which <code>logGamma(x)</code> is representable in the machine.

LOG_GAMMA_X_MAX_VALUE = 2.55e305

# Log Gamma related constants

lg_d1 = -0.5772156649015328605195174;

lg_d2 = 0.4227843350984671393993777;

lg_d4 = 1.791759469228055000094023;

lg_p1 = [ 4.945235359296727046734888,

201.8112620856775083915565, 2290.838373831346393026739,

11319.67205903380828685045, 28557.24635671635335736389,

38484.96228443793359990269, 26377.48787624195437963534,

7225.813979700288197698961 ]

lg_q1 = [ 67.48212550303777196073036,

1113.332393857199323513008, 7738.757056935398733233834,

27639.87074403340708898585, 54993.10206226157329794414,

61611.22180066002127833352, 36351.27591501940507276287,

8785.536302431013170870835 ]

lg_p2 = [ 4.974607845568932035012064,

542.4138599891070494101986, 15506.93864978364947665077,

184793.2904445632425417223, 1088204.76946882876749847,

3338152.967987029735917223, 5106661.678927352456275255,

3074109.054850539556250927 ]

lg_q2 = [ 183.0328399370592604055942,

7765.049321445005871323047, 133190.3827966074194402448,

1136705.821321969608938755, 5267964.117437946917577538,

13467014.54311101692290052, 17827365.30353274213975932,

9533095.591844353613395747 ]

lg_p4 = [ 14745.02166059939948905062,

100

2426813.369486704502836312, 121475557.4045093227939592,

101

2663432449.630976949898078, 29403789566.34553899906876,

102

170266573776.5398868392998, 492612579337.743088758812,

103

560625185622.3951465078242 ]

104

lg_q4 = [ 2690.530175870899333379843,

105

639388.5654300092398984238, 41355999.30241388052042842,

106

1120872109.61614794137657, 14886137286.78813811542398,

107

101680358627.2438228077304, 341747634550.7377132798597,

108

446315818741.9713286462081 ]

109

lg_c = [ -0.001910444077728,8.4171387781295e-4,

110

-5.952379913043012e-4, 7.93650793500350248e-4,

111

-0.002777777777777681622553, 0.08333333333333333331554247,

112

0.0057083835261 ]

113

def logGamma(x):

114

"""The natural logarithm of the gamma function.

115

Based on public domain NETLIB (Fortran) code by W. J. Cody and L. Stoltz

116

Applied Mathematics Division

117

Argonne National Laboratory

118

Argonne, IL 60439

119

120

References:

121

<OL>

122

<LI>W. J. Cody and K. E. Hillstrom, 'Chebyshev Approximations for the Natural Logarithm of the Gamma Function,' Math. Comp. 21, 1967, pp. 198-203.

123

<LI>K. E. Hillstrom, ANL/AMD Program ANLC366S, DGAMMA/DLGAMA, May, 1969.

124

<LI>Hart, Et. Al., Computer Approximations, Wiley and sons, New York, 1968.

125

</OL>

126

From the original documentation:

127

128

This routine calculates the LOG(GAMMA) function for a positive real argument X.

129

Computation is based on an algorithm outlined in references 1 and 2.

130

The program uses rational functions that theoretically approximate LOG(GAMMA)

131

to at least 18 significant decimal digits. The approximation for X > 12 is from reference 3,

132

while approximations for X < 12.0 are similar to those in reference 1, but are unpublished.

133

The accuracy achieved depends on the arithmetic system, the compiler, the intrinsic functions,

134

and proper selection of the machine-dependent constants.

135

136

Error returns:

137

The program returns the value XINF for X .LE. 0.0 or when overflow would occur.

138

The computation is believed to be free of underflow and overflow."""

139

140

y = x

141

if y < 0.0 or y > LOG_GAMMA_X_MAX_VALUE:

142

# Bad arguments

143

return float("inf")

144

145

if y <= EPS:

146

return -math.log(y)

147

148

if y <= 1.5:

149

if (y < pnt68):

150

corr = -math.log(y)

151

xm1 = y

152

else:

153

corr = 0.0;

154

xm1 = y - 1.0;

155

156

if y <= 0.5 or y >= pnt68:

157

xden = 1.0;

158

xnum = 0.0;

159

for i in xrange(8):

160

xnum = xnum * xm1 + lg_p1[i];

161

xden = xden * xm1 + lg_q1[i];

162

return corr + xm1 * (lg_d1 + xm1 * (xnum / xden));

163

else:

164

xm2 = y - 1.0;

165

xden = 1.0;

166

xnum = 0.0;

167

for i in xrange(8):

168

xnum = xnum * xm2 + lg_p2[i];

169

xden = xden * xm2 + lg_q2[i];

170

return corr + xm2 * (lg_d2 + xm2 * (xnum / xden));

171

172

if (y <= 4.0):

173

xm2 = y - 2.0;

174

xden = 1.0;

175

xnum = 0.0;

176

for i in xrange(8):

177

xnum = xnum * xm2 + lg_p2[i];

178

xden = xden * xm2 + lg_q2[i];

179

return xm2 * (lg_d2 + xm2 * (xnum / xden));

180

181

if y <= 12.0:

182

xm4 = y - 4.0;

183

xden = -1.0;

184

xnum = 0.0;

185

for i in xrange(8):

186

xnum = xnum * xm4 + lg_p4[i];

187

xden = xden * xm4 + lg_q4[i];

188

return lg_d4 + xm4 * (xnum / xden);

189

190

assert y <= lg_frtbig

191

res = lg_c[6];

192

ysq = y * y;

193

for i in xrange(6):

194

res = res / ysq + lg_c[i];

195

res /= y;

196

corr = math.log(y);

197

res = res + LOGSQRT2PI - 0.5 * corr;

198

res += y * (corr - 1.0);

199

return res

200

201

202

def logBeta(p, q):

203

"""The natural logarithm of the beta function."""

204

assert p > 0

205

assert q > 0

206

if p <= 0 or q <= 0 or p + q > LOG_GAMMA_X_MAX_VALUE:

207

return 0

208

209

return logGamma(p)+logGamma(q)-logGamma(p+q)

210

211

212

def incompleteBeta(x, p, q):

213

"""Incomplete beta function.

214

215

The computation is based on formulas from Numerical Recipes, Chapter 6.4 (W.H. Press et al, 1992).

216

Ported from Java: http://jsci.sourceforge.net/"""

217

218

assert 0 <= x <= 1

219

assert p > 0

220

assert q > 0

221

222

# Range checks to avoid numerical stability issues?

223

if x <= 0.0:

224

return 0.0

225

if x >= 1.0:

226

return 1.0

227

if p <= 0.0 or q <= 0.0 or (p+q) > LOG_GAMMA_X_MAX_VALUE:

228

return 0.0

229

230

beta_gam = math.exp(-logBeta(p,q) + p*math.log(x) + q*math.log(1.0-x))

231

if x < (p+1.0)/(p+q+2.0):

232

return beta_gam*betaFraction(x, p, q)/p

233

else:

234

return 1.0-(beta_gam*betaFraction(1.0-x,q,p)/q)

235

236

237

ACCURACY = 10**-7

238

MAX_ITERATIONS = 10000

239

def findRoot(value, x_low, x_high, function):

240

"""Use the bisection method to find root such that function(root) == value."""

241

242

guess = (x_high + x_low) / 2.0

243

v = function(guess)

244

difference = v - value

245

i = 0

246

while abs(difference) > ACCURACY and i < MAX_ITERATIONS:

247

i += 1

248

if difference > 0:

249

x_high = guess

250

else:

251

x_low = guess

252

253

guess = (x_high + x_low) / 2.0

254

v = function(guess)

255

difference = v - value

256

257

return guess

258

259

260

def StudentTCDF(degree_of_freedom, X):

261

"""Student's T distribution CDF. Returns probability that a value x < X.

262

263

Ported from Java: http://jsci.sourceforge.net/"""

264

265

A = 0.5 * incompleteBeta(degree_of_freedom/(degree_of_freedom+X*X), 0.5*degree_of_freedom, 0.5)

266

if X > 0:

267

return 1 - A

268

return A

269

270

271

def InverseStudentT(degree_of_freedom, probability):

272

"""Inverse of Student's T distribution CDF. Returns the value x such that CDF(x) = probability.

273

274

Ported from Java: http://jsci.sourceforge.net/

275

276

This is not the best algorithm in the world. SciPy has a Fortran version

277

(see special.stdtrit):

278

http://svn.scipy.org/svn/scipy/trunk/scipy/stats/distributions.py

279

http://svn.scipy.org/svn/scipy/trunk/scipy/special/cdflib/cdft.f

280

281

Very detailed information:

282

http://www.maths.ox.ac.uk/~shaww/finpapers/tdist.pdf

283

"""

284

285

assert 0 <= probability <= 1

286

287

if probability == 1:

288

return float("inf")

289

if probability == 0:

290

return float("-inf")

291

if probability == 0.5:

292

return 0.0

293

294

def f(x):

295

return StudentTCDF(degree_of_freedom, x)

296

297

return findRoot(probability, -10**4, 10**4, f)

298

299

300

def tinv(p, degree_of_freedom):

301

"""Similar to the TINV function in Excel

302

303

p: 1-confidence (eg. 0.05 = 95% confidence)"""

304

305

assert 0 <= p <= 1

306

confidence = 1 - p

307

return InverseStudentT(degree_of_freedom, (1+confidence)/2.0)

308

309

310

def memoize(function):

311

cache = {}

312

def closure(*args):

313

if args not in cache:

314

cache[args] = function(*args)

315

return cache[args]

316

return closure

317

318

# Cache tinv results, since we typically call it with the same args over and over

319

cached_tinv = memoize(tinv)

320

321

322

def stats(r, confidence_interval=0.05):

323

"""Returns statistics about a sequence of numbers.

324

325

By default it computes the 95% confidence interval.

326

327

Returns (average, median, standard deviation, min, max, confidence interval)"""

328

329

total = sum(r)

330

average = total/float(len(r))

331

sum_deviation_squared = sum([(i-average)**2 for i in r])

332

standard_deviation = math.sqrt(sum_deviation_squared/(len(r)-1 or 1))

333

s = list(r)

334

s.sort()

335

median = s[len(s)/2]

336

minimum = s[0]

337

maximum = s[-1]

338

# See: http://davidmlane.com/hyperstat/

339

# confidence_95 = 1.959963984540051 * standard_deviation / math.sqrt(len(r))

340

# We must estimate both using the t distribution:

341

# http://davidmlane.com/hyperstat/B7483.html

342

# s_m = s / sqrt(N)

343

s_m = standard_deviation / math.sqrt(len(r))

344

# Degrees of freedom = n-1

345

# t = tinv(0.05, degrees_of_freedom)

346

# confidence = +/- t * s_m

347

confidence = cached_tinv(confidence_interval, len(r)-1) * s_m

348

return average, median, standard_deviation, minimum, maximum, confidence

Older »