from __future__ import division
from numpy import array, multiply, sum, zeros, size, shape, diag, dot, mean,\
    sqrt, transpose, trace, argsort, newaxis, finfo, all
from numpy.random import seed, normal as random_gauss
from numpy.linalg import norm, svd
from operator import itemgetter
        self.points = initial_pts
        self.points = self._center(self.points)
        self._calc_distances()
        # dists relates to points, not to input data
        self._update_dhats()
        # dhats are constrained to be monotonic
        self._calc_stress()
        # self.stress is calculated from dists and dhats
        self.stresses = [self.stress]
        # stress is the metric of badness of fit used in this code
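        # (The stress computed in _calc_stress below is Kruskal's stress-1,
        # sqrt(sum((dist - dhat)**2) / sum(dist**2)), taken over all point pairs.)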
        # normalize the scaling, which should not change the stress
        """The dhats in order."""
        # Probably not required, but here in case needed for backward
        # compatibility. self._dhats is the full 2D array
        return [self._dhats[i,j] for (i,j) in self.order]

        """The dists in order"""
        # Probably not required, but here in case needed for backward
        # compatibility. self._dists is the full 2D array
        return [self._dists[i,j] for (i,j) in self.order]
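        # (These assume self.order is the list of (i, j) index pairs of the upper
        # triangle, presumably sorted by the input dissimilarities, which is the
        # ordering the monotone regression below relies on.)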
    def getPoints(self):
        """Returns (ordered in a list) the n points in k space

        return array(points, 'd')
    def _calc_distances(self):
        """Update distances between the points"""
        diffv = self.points[newaxis, :, :] - self.points[:, newaxis, :]
        squared_dists = (diffv**2).sum(axis=-1)
        self._dists = sqrt(squared_dists)
        self._squared_dist_sums = squared_dists.sum(axis=-1)
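        # (diffv has shape (n, n, dim) via broadcasting, so _dists is the full
        # symmetric n x n distance matrix and _squared_dist_sums[i] caches
        # sum_j d(i, j)**2 for the incremental updates in _nudged_stress.)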
    def _update_dhats(self):
        """Update dhats based on distances"""
        new_dhats = self._dists.copy()
        ordered_dhats = [new_dhats[i,j] for (i,j) in self.order]
        ordered_dhats = self._do_monotone_regression(ordered_dhats)
        for ((i,j),d) in zip(self.order, ordered_dhats):
            new_dhats[i,j] = new_dhats[j, i] = d
        self._dhats = new_dhats
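        # (Only the values listed in self.order are regressed; each result is
        # written back to both (i, j) and (j, i) so _dhats stays symmetric,
        # matching _dists.)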
    def _do_monotone_regression(self, dhats):
        """Performs a monotone regression on dhats, returning the result

        if an element is smaller than its preceding one, the two are averaged
        and grouped together in a block. The process is repeated until
        the blocks are monotonic, that is block i <= block i+1.
        """
        blocklist = []
        for top_dhat in dhats:
            top_total = top_dhat
            top_size = 1
            while blocklist and top_dhat <= blocklist[-1][0]:
                (dhat, total, size) = blocklist.pop()
                top_total += total
                top_size += size
                top_dhat = top_total / top_size
            blocklist.append((top_dhat, top_total, top_size))
        result_dhats = []
        for (val, total, size) in blocklist:
            result_dhats.extend([val]*size)
        return result_dhats
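    # Illustrative example of the regression above: [1.0, 3.0, 2.0, 4.0] has one
    # violation (2.0 < 3.0); the violating pair is pooled into a block with mean
    # 2.5, giving the non-decreasing result [1.0, 2.5, 2.5, 4.0].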
    def _calc_stress(self):
        """calculates the stress, or badness of fit between the distances and dhats
        Caches some intermediate values for gradient calculations.
        """
        diffs = (self._dists - self._dhats)
        diffs **= 2
        self._squared_diff_sums = diffs.sum(axis=-1)
        self._total_squared_diff = self._squared_diff_sums.sum() / 2
        self._total_squared_dist = self._squared_dist_sums.sum() / 2
        self.stress = sqrt(self._total_squared_diff/self._total_squared_dist)
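        # (The division by 2 corrects for double counting: _dists and _dhats are
        # symmetric with zero diagonals, so each pair contributes twice to the
        # row sums. The ratio is therefore the same per-pair stress-1 value.)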
    def _nudged_stress(self, v, d, epsilon):
        """Calculates the stress with point v moved epsilon in the dth dimension
        """
        delta_epsilon = zeros([self.dimension], float)
        delta_epsilon[d] = epsilon
        moved_point = self.points[v] + delta_epsilon
        squared_dists = ((moved_point - self.points)**2).sum(axis=-1)
        squared_dists[v] = 0.0
        delta_squared_dist = squared_dists.sum() - self._squared_dist_sums[v]
        diffs = sqrt(squared_dists) - self._dhats[v]
        diffs **= 2
        delta_squared_diff = diffs.sum() - self._squared_diff_sums[v]
        return sqrt(
            (self._total_squared_diff + delta_squared_diff) /
            (self._total_squared_dist + delta_squared_dist))
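        # (Nudging point v only changes the distances in row/column v, so the
        # cached _squared_dist_sums and _squared_diff_sums allow the new stress
        # to be computed from deltas instead of rebuilding the full matrices.)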
    def _rescale(self):
        """ assumes centered, rescales to mean point-to-origin dist of 1
        a special function for use with external optimization routines.
        pts here is a 1D numpy array"""
        pts = pts.reshape(self.points.shape)

        changed = not all(pts == self.points)
        if changed:
            self.points = pts
            self._calc_distances()
        self._calc_stress()
        return self.stress
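    # (Flat array in, scalar out matches the call signature generic optimisers
    # expect; the `changed` test presumably just avoids recomputing distances
    # when an optimiser re-evaluates the same point.)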
    def _calc_stress_gradients(self, pts):
        """Approx first derivatives of stress at pts, for optimisers"""
        epsilon = sqrt(finfo(float).eps)
        f0 = self._recalc_stress_from_pts(pts)
        grad = zeros(pts.shape, float)
        for k in range(len(pts)):
            (point, dim) = divmod(k, self.dimension)
            f1 = self._nudged_stress(point, dim, epsilon)
            grad[k] = (f1 - f0)/epsilon
        return grad
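    # A sketch of how these hooks might be driven by an external optimiser (not
    # part of this module; assumes SciPy is available and `nmds` is an instance
    # of this class):
    #   from scipy.optimize import fmin_bfgs
    #   flat = nmds.points.ravel()
    #   best = fmin_bfgs(nmds._recalc_stress_from_pts, flat,
    #                    fprime=nmds._calc_stress_gradients)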
def metaNMDS(iters, *args, **kwargs):
    """ runs NMDS, first with pcoa init, then iters times with random init