2
# Last Change: Tue Jul 17 11:00 PM 2007 J
4
"""Module implementing GM, a class which represents Gaussian mixtures.
6
GM instances can be used to create, sample mixtures. They also provide
7
different plotting facilities, such as isodensity contour for multi dimensional
8
models, ellipses of confidence."""
10
__docformat__ = 'restructuredtext'
13
from numpy.random import randn, rand
14
import numpy.linalg as lin
18
# Right now, two main usages of a Gaussian Model are possible
19
# - init a Gaussian Model with meta-parameters, and trains it
20
# - set-up a Gaussian Model to sample it, draw ellipsoides
21
# of confidences. In this case, we would like to init it with
22
# known values of parameters. This can be done with the class method
26
# - change bounds methods of GM class instanciations so that it cannot
27
# be used as long as w, mu and va are not set
28
# - We have to use scipy now for chisquare pdf, so there may be other
29
# methods to be used, ie for implementing random index.
30
# - there is no check on internal state of the GM, that is does w, mu and va
31
# values make sense (eg singular values) - plot1d is still very rhough. There
32
# should be a sensible way to modify the result plot (maybe returns a dic
33
# with global pdf, component pdf and fill matplotlib handles). Should be
35
class GmParamError(Exception):
36
"""Exception raised for errors in gmm params
39
expression -- input expression in which the error occurred
40
message -- explanation of the error
42
def __init__(self, message):
43
Exception.__init__(self)
44
self.message = message
50
"""Gaussian Mixture class. This is a simple container class
51
to hold Gaussian Mixture parameters (weights, mean, etc...).
52
It can also draw itself (confidence ellipses) and samples itself.
55
# I am not sure it is useful to have a spherical mode...
56
_cov_mod = ['diag', 'full']
58
#===============================
59
# Methods to construct a mixture
60
#===============================
61
def __init__(self, d, k, mode = 'diag'):
62
"""Init a Gaussian Mixture.
66
dimension of the mixture.
68
number of component in the mixture.
78
Only full and diag mode are supported for now.
81
If you want to build a Gaussian Mixture with knowns weights, means
82
and variances, you can use GM.fromvalues method directly"""
83
if mode not in self._cov_mod:
84
raise GmParamError("mode %s not recognized" + str(mode))
90
# Init to 0 all parameters, with the right dimensions.
91
# Not sure this is useful in python from an efficiency POV ?
93
self.mu = N.zeros((k, d))
95
self.va = N.zeros((k, d))
97
self.va = N.zeros((k * d, d))
99
self.__is_valid = False
105
def set_param(self, weights, mu, sigma):
106
"""Set parameters of the model.
108
Args should be conformant with metparameters d and k given during
113
weights of the mixture (k elements)
115
means of the mixture. One component's mean per row, k row for k
118
variances of the mixture. For diagonal models, one row contains
119
the diagonal elements of the covariance matrix. For full
120
covariance, d rows for one variance.
124
Create a 3 component, 2 dimension mixture with full covariance matrices
126
>>> w = numpy.array([0.2, 0.5, 0.3])
127
>>> mu = numpy.array([[0., 0.], [1., 1.]])
128
>>> va = numpy.array([[1., 0.], [0., 1.], [2., 0.5], [0.5, 1]])
129
>>> gm = GM(2, 3, 'full')
130
>>> gm.set_param(w, mu, va)
133
If you know already the parameters when creating the model, you can
134
simply use the method class GM.fromvalues."""
135
#XXX: when fromvalues is called, parameters are called twice...
136
k, d, mode = check_gmm_param(weights, mu, sigma)
138
raise GmParamError("Number of given components is %d, expected %d"
141
raise GmParamError("Dimension of the given model is %d, "\
142
"expected %d" % (d, self.d))
143
if not mode == self.mode and not d == 1:
144
raise GmParamError("Given covariance mode is %s, expected %s"
150
self.__is_valid = True
153
def fromvalues(cls, weights, mu, sigma):
154
"""This class method can be used to create a GM model
155
directly from its parameters weights, mean and variance
159
weights of the mixture (k elements)
161
means of the mixture. One component's mean per row, k row for k
164
variances of the mixture. For diagonal models, one row contains
165
the diagonal elements of the covariance matrix. For full
166
covariance, d rows for one variance.
175
>>> w, mu, va = GM.gen_param(d, k)
177
>>> gm.set_param(w, mu, va)
181
>>> w, mu, va = GM.gen_param(d, k)
182
>>> gm = GM.fromvalue(w, mu, va)
184
are strictly equivalent."""
185
k, d, mode = check_gmm_param(weights, mu, sigma)
186
res = cls(d, k, mode)
187
res.set_param(weights, mu, sigma)
190
#=====================================================
191
# Fundamental facilities (sampling, confidence, etc..)
192
#=====================================================
193
def sample(self, nframes):
194
""" Sample nframes frames from the model.
198
number of samples to draw.
202
samples in the format one sample per row (nframes, d)."""
203
if not self.__is_valid:
204
raise GmParamError("""Parameters of the model has not been
205
set yet, please set them using self.set_param()""")
207
# State index (ie hidden var)
208
sti = gen_rand_index(self.w, nframes)
209
# standard gaussian samples
210
x = randn(nframes, self.d)
212
if self.mode == 'diag':
213
x = self.mu[sti, :] + x * N.sqrt(self.va[sti, :])
214
elif self.mode == 'full':
216
cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1]))
217
for i in range(self.k):
218
# Using cholesky looks more stable than sqrtm; sqrtm is not
219
# available in numpy anyway, only in scipy...
220
cho[i] = lin.cholesky(self.va[i*self.d:i*self.d+self.d, :])
222
for s in range(self.k):
223
tmpind = N.where(sti == s)[0]
224
x[tmpind] = N.dot(x[tmpind], cho[s].T) + self.mu[s]
226
raise GmParamError("cov matrix mode not recognized, "\
231
def conf_ellipses(self, dim = misc.DEF_VIS_DIM, npoints = misc.DEF_ELL_NP,
232
level = misc.DEF_LEVEL):
233
"""Returns a list of confidence ellipsoids describing the Gmm
234
defined by mu and va. Check densities.gauss_ell for details
238
sequences of two integers which represent the dimensions where to
239
project the ellipsoid.
241
number of points to generate for the ellipse.
243
level of confidence (between 0 and 1).
247
a list of x coordinates for the ellipses (Xe[i] is the array
248
containing x coordinates of the ith Gaussian)
250
a list of y coordinates for the ellipses.
254
Suppose we have w, mu and va as parameters for a mixture, then:
257
>>> gm.set_param(w, mu, va)
258
>>> X = gm.sample(1000)
259
>>> Xe, Ye = gm.conf_ellipsoids()
260
>>> pylab.plot(X[:,0], X[:, 1], '.')
262
... pylab.plot(Xe[k], Ye[k], 'r')
264
Will plot samples X draw from the mixture model, and
265
plot the ellipses of equi-probability from the mean with
266
default level of confidence."""
268
raise ValueError("This function does not make sense for 1d "
271
if not self.__is_valid:
272
raise GmParamError("""Parameters of the model has not been
273
set yet, please set them using self.set_param()""")
277
if self.mode == 'diag':
278
for i in range(self.k):
279
x, y = D.gauss_ell(self.mu[i, :], self.va[i, :],
283
elif self.mode == 'full':
284
for i in range(self.k):
285
x, y = D.gauss_ell(self.mu[i, :],
286
self.va[i*self.d:i*self.d+self.d, :],
293
def check_state(self):
294
"""Returns true if the parameters of the model are valid.
296
For Gaussian mixtures, this means weights summing to 1, and variances
297
to be positive definite.
299
if not self.__is_valid:
300
raise GmParamError("Parameters of the model has not been"\
301
"set yet, please set them using self.set_param()")
303
# Check condition number for cov matrix
304
if self.mode == 'diag':
305
tinfo = N.finfo(self.va.dtype)
306
if N.any(self.va < tinfo.eps):
307
raise GmParamError("variances are singular")
308
elif self.mode == 'full':
311
for i in range(self.k):
312
N.linalg.cholesky(self.va[i*d:i*d+d, :])
313
except N.linalg.LinAlgError:
314
raise GmParamError("matrix %d is singular " % i)
317
raise GmParamError("Unknown mode")
322
def gen_param(cls, d, nc, mode = 'diag', spread = 1):
323
"""Generate random, valid parameters for a gaussian mixture model.
329
the number of components
331
covariance matrix mode ('full' or 'diag').
335
weights of the mixture
339
variances of the mixture
343
This is a class method.
348
mu = spread * N.sqrt(d) * randn(nc, d)
350
va = N.abs(randn(nc, d))
352
# If A is invertible, A'A is positive definite
353
va = randn(nc * d, d)
355
va[k*d:k*d+d] = N.dot( va[k*d:k*d+d],
356
va[k*d:k*d+d].transpose())
358
raise GmParamError('cov matrix mode not recognized')
362
#gen_param = classmethod(gen_param)
364
def pdf(self, x, log = False):
365
"""Computes the pdf of the model at given points.
369
points where to estimate the pdf. One row for one
370
multi-dimensional sample (eg to estimate the pdf at 100
371
different points in 10 dimension, data's shape should be (100,
374
If true, returns the log pdf instead of the pdf.
378
the pdf at points x."""
381
D.multiple_gauss_den(x, self.mu, self.va, log = True)
384
return N.sum(D.multiple_gauss_den(x, self.mu, self.va) * self.w, 1)
386
def pdf_comp(self, x, cid, log = False):
387
"""Computes the pdf of the model at given points, at given component.
391
points where to estimate the pdf. One row for one
392
multi-dimensional sample (eg to estimate the pdf at 100
393
different points in 10 dimension, data's shape should be (100,
398
If true, returns the log pdf instead of the pdf.
402
the pdf at points x."""
403
if self.mode == 'diag':
405
elif self.mode == 'full':
406
va = self.va[cid*self.d:(cid+1)*self.d]
408
raise GmParamError("""var mode %s not supported""" % self.mode)
411
return D.gauss_den(x, self.mu[cid], va, log = True) \
414
return D.multiple_gauss_den(x, self.mu[cid], va) * self.w[cid]
419
def plot(self, dim = misc.DEF_VIS_DIM, npoints = misc.DEF_ELL_NP,
420
level = misc.DEF_LEVEL):
421
"""Plot the ellipsoides directly for the model
423
Returns a list of lines handle, so that their style can be modified. By
424
default, the style is red color, and nolegend for all of them.
428
sequence of two integers, the dimensions of interest.
430
Number of points to use for the ellipsoids.
432
level of confidence (to use with fill argument)
436
Returns a list of lines handle so that their properties
437
can be modified (eg color, label, etc...):
441
Does not work for 1d. Requires matplotlib
444
conf_ellipses is used to compute the ellipses. Use this if you want
445
to plot with something else than matplotlib."""
447
raise ValueError("This function does not make sense for 1d "
450
if not self.__is_valid:
451
raise GmParamError("""Parameters of the model has not been
452
set yet, please set them using self.set_param()""")
455
xe, ye = self.conf_ellipses(dim, npoints, level)
458
return [P.plot(xe[i], ye[i], 'r', label='_nolegend_')[0] for i in
461
raise GmParamError("matplotlib not found, cannot plot...")
463
def plot1d(self, level = misc.DEF_LEVEL, fill = False, gpdf = False):
464
"""Plots the pdf of each component of the 1d mixture.
468
level of confidence (to use with fill argument)
470
if True, the area of the pdf corresponding to the given
471
confidence intervales is filled.
473
if True, the global pdf is plot.
477
Returns a dictionary h of plot handles so that their properties
478
can be modified (eg color, label, etc...):
479
- h['pdf'] is a list of lines, one line per component pdf
480
- h['gpdf'] is the line for the global pdf
481
- h['conf'] is a list of filling area
484
raise ValueError("This function does not make sense for "\
485
"mixtures which are not unidimensional")
487
from scipy.stats import norm
488
pval = N.sqrt(self.va[:, 0]) * norm(0, 1).ppf((1+level)/2)
490
# Compute reasonable min/max for the normal pdf: [-mc * std, mc * std]
491
# gives the range we are taking in account for each gaussian
493
std = N.sqrt(self.va[:, 0])
494
mi = N.amin(self.mu[:, 0] - mc * std)
495
ma = N.amax(self.mu[:, 0] + mc * std)
498
x = N.linspace(mi, ma, np)
499
# Prepare the dic of plot handles to return
500
ks = ['pdf', 'conf', 'gpdf']
501
hp = dict((i, []) for i in ks)
503
# Compute the densities
504
y = D.multiple_gauss_den(x[:, N.newaxis], self.mu, self.va, \
507
yt = self.pdf(x[:, N.newaxis])
511
for c in range(self.k):
512
h = P.plot(x, N.exp(y[:, c]), 'r', label ='_nolegend_')
515
# Compute x coordinates of filled area
516
id1 = -pval[c] + self.mu[c]
517
id2 = pval[c] + self.mu[c]
518
xc = x[:, N.where(x>id1)[0]]
519
xc = xc[:, N.where(xc<id2)[0]]
521
# Compute the graph for filling
522
yf = self.pdf_comp(xc, c)
523
xc = N.concatenate(([xc[0]], xc, [xc[-1]]))
524
yf = N.concatenate(([0], yf, [0]))
525
h = P.fill(xc, yf, facecolor = 'b', alpha = 0.1,
529
h = P.plot(x, yt, 'r:', label='_nolegend_')
533
raise GmParamError("matplotlib not found, cannot plot...")
535
def density_on_grid(self, dim = misc.DEF_VIS_DIM, nx = 50, ny = 50,
536
nl = 20, maxlevel = 0.95, v = None):
537
"""Do all the necessary computation for contour plot of mixture's
542
sequence of two integers, the dimensions of interest.
544
Number of points to use for the x axis of the grid
546
Number of points to use for the y axis of the grid
548
Number of contour to plot.
552
points of the x axis of the grid
554
points of the y axis of the grid
556
values of the density on X and Y
558
Contour values to display.
562
X, Y, Z and V are as expected by matplotlib contour function."""
564
raise ValueError("This function does not make sense for 1d "
567
# Ok, it is a bit gory. Basically, we want to compute the size of the
568
# grid. We use conf_ellipse, which will return a couple of points for
569
# each component, and we can find a grid size which then is just big
570
# enough to contain all ellipses. This won't work well if two
571
# ellipsoids are crossing each other a lot (because this assumes that
572
# at a given point, one component is largely dominant for its
573
# contribution to the pdf).
575
xe, ye = self.conf_ellipses(level = maxlevel, dim = dim)
576
ax = [N.min(xe), N.max(xe), N.min(ye), N.max(ye)]
580
x, y, lden = self._densityctr(N.linspace(ax[0]-0.2*w,
582
N.linspace(ax[2]-0.2*h,
585
# XXX: how to find "good" values for level ?
587
v = N.linspace(-5, N.max(lden), nl)
588
return x, y, lden, N.array(v)
590
def _densityctr(self, rangex, rangey, dim = misc.DEF_VIS_DIM):
591
"""Helper function to compute density contours on a grid."""
592
gr = N.meshgrid(rangex, rangey)
595
xdata = N.concatenate((x[:, N.newaxis], y[:, N.newaxis]), axis = 1)
596
dmu = self.mu[:, dim]
597
dva = self._get_va(dim)
598
den = GM.fromvalues(self.w, dmu, dva).pdf(xdata, log = True)
599
den = den.reshape(len(rangey), len(rangex))
601
return gr[0], gr[1], den
603
def _get_va(self, dim):
604
"""Returns variance limited do 2 dimension in tuple dim."""
607
if dim.any() < 0 or dim.any() >= self.d:
608
raise ValueError("dim elements should be between 0 and dimension"
611
if self.mode == 'diag':
612
return self.va[:, dim]
613
elif self.mode == 'full':
615
vaselid = N.empty((ld * self.k, ld), N.int)
616
for i in range(self.k):
617
vaselid[ld*i] = dim[0] + i * self.d
618
vaselid[ld*i+1] = dim[1] + i * self.d
619
vadid = N.empty((ld * self.k, ld), N.int)
620
for i in range(self.k):
623
return self.va[vaselid, vadid]
625
raise ValueError("Unkown mode")
630
msg += "Gaussian Mixture:\n"
631
msg += " -> %d dimensions\n" % self.d
632
msg += " -> %d components\n" % self.k
633
msg += " -> %s covariance \n" % self.mode
635
msg += "Has initial values"""
637
msg += "Has no initial values yet"""
641
return self.__repr__()
643
# Function to generate a random index: this is kept outside any class,
644
# as the function can be useful for other
645
def gen_rand_index(p, n):
646
"""Generate a N samples vector containing random index between 1
647
and length(p), each index i with probability p(i)"""
648
# TODO Check args here
650
# TODO: check each value of inverse distribution is
654
index = N.zeros(n, dtype=int)
656
# This one should be a bit faster
657
for k in range(len(p)-1, 0, -1):
658
blop = N.where(N.logical_and(invcdf[k-1] <= uni,
664
def check_gmm_param(w, mu, va):
665
"""Check that w, mu and va are valid parameters for
666
a mixture of gaussian.
668
w should sum to 1, there should be the same number of component in each
669
param, the variances should be positive definite, etc...
673
vector or list of weigths of the mixture (K elements)
677
list of variances (vector K * d or square matrices Kd * d)
685
'diag' if diagonal covariance, 'full' of full matrices
688
# Check that w is valid
689
if not len(w.shape) == 1:
690
raise GmParamError('weight should be a rank 1 array')
692
if N.fabs(N.sum(w) - 1) > misc.MAX_DBL_DEV:
693
raise GmParamError('weight does not sum to 1')
695
# Check that mean and va have the same number of components
699
msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
700
raise GmParamError(msg)
702
msg = """va should be a K,d / K *d, d matrix, and a row vector if
704
raise GmParamError(msg)
710
msg = "not same number of component in mean and weights"
711
raise GmParamError(msg)
714
msg = "not same number of dimensions in mean and variances"
715
raise GmParamError(msg)
722
msg = "not same number of dimensions in mean and variances"
723
raise GmParamError(msg)
727
if __name__ == '__main__':