2
# Last Change: Mon Jul 02 04:00 PM 2007 J
4
__doc__ = """Example of using RegularizedEM with pendigits data.
6
If you want to do discriminant analysis with pendigits, you quickly have
7
problems with EM if you use directly the coordinates, because some points are
8
likely to be on the border, hence the corresponding component can have a
9
covariance matrix which easily becomes singular. Regularized EM avoids this
10
problem by using simple regularization on the mixture. You can play with pcount
11
and pval to see the effect on pdf estimation.
13
For now, regularized EM is pretty crude, but is enough for simple cases where
14
you need to avoid singular covariance matrices."""
19
#from scipy.sandbox import pyem
20
from gauss_mix import GM
21
from gmm_em import GMM, EM, RegularizedEM
24
x, y = utils.get_pendigits()
26
# Take only the first point of pendigits for pdf estimation
27
dt1 = N.concatenate([x[:, :1], y[:, :1]], 1)
28
dt1 = utils.scale(dt1.astype(N.float))
30
# pcnt is the poportion of samples to use as prior count. Eg if you have 1000
31
# samples, and pcn is 0.1, then the prior count would be 100, and 1100 samples
32
# will be considered as overall when regularizing the parameters.
34
# You should try different values of pval. If pval is 0.1, then the
35
# regularization will be strong. If you use something like 0.01, really sharp
36
# components will appear. If the values are too small, the regularizer may not
37
# work (singular covariance matrices).
40
# This function train a mixture model with k components, returns the trained
42
def cluster(data, k, mode = 'full'):
45
gmm = GMM(gm, 'random')
46
em = RegularizedEM(pcnt = pcnt, pval = pval)
47
em.train(data, gmm, maxiter = 20)
48
return gm, gmm.bic(data)
50
# bc will contain a list of BIC values for each model trained
51
N.seterr(all = 'warn')
57
# Train a model of k component, and plots isodensity curve
59
gm, b = cluster(dt1, k = k, mode = mode)
62
X, Y, Z, V = gm.density_on_grid(nl = 20)
64
P.plot(dt1[:, 0], dt1[:, 1], '.')
65
P.xlabel('x coordinate (scaled)')
66
P.ylabel('y coordinate (scaled)')
68
print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)