~vincent-vincentdavis/statsmodels/update_formatting

~vincent-vincentdavis/statsmodels/update_formatting_descstats

« back to all changes in this revision

Viewing changes to scikits/statsmodels/descstats.py

Committer: Vincent Davis
Date: 2010-04-18 02:27:53 UTC
Revision ID: vincent@vincentdavis.net-20100418022753-hyh9690iuqdwjwi3

added tutorial files, not complete

files added:
scikits/statsmodels/docs/source/statsmodelsXstata.py

scikits/statsmodels/docs/source/tutorial_basic.py

scikits/statsmodels/docs/source/tutorial_basic.rst

scikits/statsmodels/docs/testing.rst

files removed:
scikits/statsmodels/formatTable.py

files renamed:
scikits/statsmodels/sandbox/descstats.py => scikits/statsmodels/descstats.py

Show diffs side-by-side

added added

removed removed

scikits/statsmodels/descstats.py

def sign_test(samp,mu0=0):
    '''
    Sign test of whether the location of `samp` equals `mu0`.

    mu0=0 by default (though the median is often used in practice).

    Parameters
    ----------
    samp : array_like
        Sample observations. Converted with ``np.asarray`` so that plain
        sequences are accepted as well as arrays.
    mu0 : scalar, optional
        Value each observation is compared against. Default is 0.

    Returns
    -------
    M : float
        M=(N(+) - N(-))/2, where N(+) is the number of values above mu0
        and N(-) is the number of values below. Values equal to mu0
        are discarded.
    p : float
        Two-sided p-value for M, calculated using the binomial
        distribution with success probability 0.5. It can be interpreted
        the same as for a t-test.

    See Also
    --------
    scipy.stats.wilcoxon
    '''
    samp = np.asarray(samp)
    # Plain ints keep the binomial test happy regardless of numpy int types.
    pos = int(np.sum(samp > mu0))
    neg = int(np.sum(samp < mu0))
    M = (pos - neg) / 2.
    n_untied = pos + neg

    if n_untied == 0:
        # Every observation equals mu0 (or the sample is empty): there is
        # nothing to test, and binomtest rejects n=0, so report p=1.
        return M, 1.0

    # scipy.stats.binom_test was deprecated and later removed in favour of
    # binomtest; use the new function when present, the old one otherwise.
    binomtest = getattr(stats, 'binomtest', None)
    if binomtest is not None:
        p = binomtest(min(pos, neg), n_untied, .5).pvalue
    else:
        p = stats.binom_test(min(pos, neg), n_untied, .5)
    return M, p

def descstats(data, cols=None, axis=0):

'''

Parameters

------------

data: numpy array

`x` is the data

cols: list, optional

A list of the column number or field names (for a recarray) of variables.

Default is all columns.

A list of the column number or field names (for a recarray) of variables.

Default is all columns.

axis: 1 or 0

axis order of data. Default is 0 for column-ordered data.

Example

----------simple

x = np.array(data) # or rather, the data we're interested in

if cols is None:

# if isinstance(x, np.recarray):

# cols = np.array(len(x.dtype.names))

if isinstance(x, np.recarray):

cols = np.array(len(x.dtype.names))

if not isinstance(x, np.recarray) and x.ndim == 1:

x = x[:,None]

Variance %(variance)22.4g Sum Observations %(sobs)22.4g

Std. Dev. %(stddev)22.4g

''' % {'name': cols, 'sum': 'N/A', 'nobs': len(x), 'mode': \

stats.mode(x)[0][0], 'nmode': stats.mode(x)[1][0], \

'mean': x.mean(), 'median': np.median(x), 'range': \

'('+str(x.min())+', '+str(x.max())+')', 'variance': \

x.var(), 'stddev': x.std(), 'coeffvar': \

100

stats.variation(x), 'skewness': stats.skew(x), \

101

'kurtosis': stats.kurtosis(x), 'uss': stats.ss(x),\

102

'ss': stats.ss(x-x.mean()), 'sobs': np.sum(x)}

stats.mode(x)[0][0], 'nmode': stats.mode(x)[1][0], \

'mean': x.mean(), 'median': np.median(x), 'range': \

'('+str(x.min())+', '+str(x.max())+')', 'variance': \

x.var(), 'stddev': x.std(), 'coeffvar': \

100

stats.variation(x), 'skewness': stats.skew(x), \

101

'kurtosis': stats.kurtosis(x), 'uss': stats.ss(x),\

102

'ss': stats.ss(x-x.mean()), 'sobs': np.sum(x)}

103

104

# ''' % {'name': cols[0], 'sum': 'N/A', 'nobs': len(x[cols[0]]), 'mode': \

105

# stats.mode(x[cols[0]])[0][0], 'nmode': stats.mode(x[cols[0]])[1][0], \

126

95 %% %12.4g

127

99 %% %12.4g

128

''' % tuple([stats.scoreatpercentile(x,per) for per in (1,5,10,25,

129

50,75,90,95,99)])

129

50,75,90,95,99)])

130

t,p_t=stats.ttest_1samp(x,0)

131

M,p_M=sign_test(x)

132

S,p_S=stats.wilcoxon(np.squeeze(x))

146

# in any event these should be split up, so that they can be called

147

# individually and only returned together if someone calls summary

148

# or something of the sort

149

150

elif x.shape[1] > 1:

151

desc ='''

152

Var. Name | Obs. Mean Std. Dev. Range

153

------------+--------------------------------------------------------'''+\

154

os.linesep

154

os.linesep

155

156

# for recarrays with columns passed as names

157

# if isinstance(cols[0],str):

162

# +str(x[var].max())+')'+os.linesep}

163

# else:

164

for var in range(x.shape[1]):

165

desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \

165

desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \

166

%(range)20s" % {'name': var, 'obs': len(x[:,var]), 'mean': x[:,var].mean(),

167

'stddev': x[:,var].std(), 'range': '('+str(x[:,var].min())+', '+\

168

str(x[:,var].max())+')'+os.linesep}

170

raise ValueError, "data not understood"

171

172

return desc

173

174

#if __name__=='__main__':

175

# test descstats

176

# import os

177

# loc='http://eagle1.american.edu/~js2796a/data/handguns_data.csv'

178

# relpath=(load_dataset(loc))

179

# dta=np.recfromcsv(relpath)

180

# descstats(dta,['stpop'])

181

# raw_input('Hit enter for multivariate test')

182

# descstats(dta,['stpop','avginc','vio'])

173

174

##if __name__=='__main__':

175

## data = np.recfromcsv('datasets/anes96/anes96.csv', delimiter='\t')

176

## descstats(dta,['stpop'])

177

178

## #test descstats

179

## import os

180

## import datasets.load_dataset

181

## loc='http://eagle1.american.edu/~js2796a/data/handguns_data.csv'

182

## relpath=(load_dataset(loc))

183

## dta=np.recfromcsv(relpath)

184

## descstats(dta,['stpop'])

185

## raw_input('Hit enter for multivariate test')

186

## descstats(dta,['stpop','avginc','vio'])

183

187

184

188

# with plain arrays

185

189

# import string2dummy as s2d

187

191

# ndts=np.vstack(dts[col] for col in dts.dtype.names)

188

192

# observations in columns and data in rows

189

193

# is easier for the call to stats

190

194

191

195

# what to make of

192

196

# ndts=np.column_stack(dts[col] for col in dts.dtype.names)

193

197

# ntda=ntds.swapaxis(1,0)

207

211

data.exog = sm.add_constant(data.exog)

208

212

sum1 = descstats(data.exog)

209

213

sum1a = descstats(data.exog[:,:1])

210

214

211

215

# loc='http://eagle1.american.edu/~js2796a/data/handguns_data.csv'

212

216

# dta=np.recfromcsv(loc)

213

217

# summary2 = descstats(dta,['stpop'])

226

230

sum2 = descstats(data2.ahe)

227

231

sum3 = descstats(np.column_stack((data2.ahe,data2.yrseduc)))

228

232

sum4 = descstats(np.column_stack(([data2[_] for \

229

_ in data2.dtype.names])))

233

_ in data2.dtype.names])))

230

234

231

235

Older »